author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 18:45:59 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 18:45:59 +0000
commit     19fcec84d8d7d21e796c7624e521b60d28ee21ed
tree       42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc  /src/rocksdb/db
parent     Initial commit.
download   ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.tar.xz
           ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.zip

Adding upstream version 16.2.11+ds. (upstream/16.2.11+ds, upstream)

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/rocksdb/db')
-rw-r--r--  src/rocksdb/db/arena_wrapped_db_iter.cc  106
-rw-r--r--  src/rocksdb/db/arena_wrapped_db_iter.h  112
-rw-r--r--  src/rocksdb/db/blob_index.h  179
-rw-r--r--  src/rocksdb/db/builder.cc  263
-rw-r--r--  src/rocksdb/db/builder.h  88
-rw-r--r--  src/rocksdb/db/c.cc  4451
-rw-r--r--  src/rocksdb/db/c_test.c  1866
-rw-r--r--  src/rocksdb/db/column_family.cc  1523
-rw-r--r--  src/rocksdb/db/column_family.h  757
-rw-r--r--  src/rocksdb/db/column_family_test.cc  3387
-rw-r--r--  src/rocksdb/db/compact_files_test.cc  421
-rw-r--r--  src/rocksdb/db/compacted_db_impl.cc  160
-rw-r--r--  src/rocksdb/db/compacted_db_impl.h  113
-rw-r--r--  src/rocksdb/db/compaction/compaction.cc  564
-rw-r--r--  src/rocksdb/db/compaction/compaction.h  384
-rw-r--r--  src/rocksdb/db/compaction/compaction_iteration_stats.h  37
-rw-r--r--  src/rocksdb/db/compaction/compaction_iterator.cc  774
-rw-r--r--  src/rocksdb/db/compaction/compaction_iterator.h  240
-rw-r--r--  src/rocksdb/db/compaction/compaction_iterator_test.cc  976
-rw-r--r--  src/rocksdb/db/compaction/compaction_job.cc  1700
-rw-r--r--  src/rocksdb/db/compaction/compaction_job.h  198
-rw-r--r--  src/rocksdb/db/compaction/compaction_job_stats_test.cc  1043
-rw-r--r--  src/rocksdb/db/compaction/compaction_job_test.cc  1082
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker.cc  1131
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker.h  313
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_fifo.cc  244
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_fifo.h  53
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_level.cc  558
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_level.h  32
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_test.cc  1741
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_universal.cc  1105
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_universal.h  31
-rw-r--r--  src/rocksdb/db/comparator_db_test.cc  660
-rw-r--r--  src/rocksdb/db/convenience.cc  77
-rw-r--r--  src/rocksdb/db/corruption_test.cc  613
-rw-r--r--  src/rocksdb/db/cuckoo_table_db_test.cc  351
-rw-r--r--  src/rocksdb/db/db_basic_test.cc  2545
-rw-r--r--  src/rocksdb/db/db_blob_index_test.cc  436
-rw-r--r--  src/rocksdb/db/db_block_cache_test.cc  761
-rw-r--r--  src/rocksdb/db/db_bloom_filter_test.cc  1910
-rw-r--r--  src/rocksdb/db/db_compaction_filter_test.cc  872
-rw-r--r--  src/rocksdb/db/db_compaction_test.cc  5167
-rw-r--r--  src/rocksdb/db/db_dynamic_level_test.cc  505
-rw-r--r--  src/rocksdb/db/db_encryption_test.cc  122
-rw-r--r--  src/rocksdb/db/db_filesnapshot.cc  177
-rw-r--r--  src/rocksdb/db/db_flush_test.cc  784
-rw-r--r--  src/rocksdb/db/db_impl/db_impl.cc  4550
-rw-r--r--  src/rocksdb/db/db_impl/db_impl.h  2107
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_compaction_flush.cc  3116
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_debug.cc  294
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_experimental.cc  151
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_files.cc  667
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_open.cc  1651
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_readonly.cc  221
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_readonly.h  137
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_secondary.cc  671
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_secondary.h  333
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_write.cc  1839
-rw-r--r--  src/rocksdb/db/db_impl/db_secondary_test.cc  869
-rw-r--r--  src/rocksdb/db/db_info_dumper.cc  123
-rw-r--r--  src/rocksdb/db/db_info_dumper.h  14
-rw-r--r--  src/rocksdb/db/db_inplace_update_test.cc  177
-rw-r--r--  src/rocksdb/db/db_io_failure_test.cc  568
-rw-r--r--  src/rocksdb/db/db_iter.cc  1310
-rw-r--r--  src/rocksdb/db/db_iter.h  344
-rw-r--r--  src/rocksdb/db/db_iter_stress_test.cc  654
-rw-r--r--  src/rocksdb/db/db_iter_test.cc  3175
-rw-r--r--  src/rocksdb/db/db_iterator_test.cc  2998
-rw-r--r--  src/rocksdb/db/db_log_iter_test.cc  294
-rw-r--r--  src/rocksdb/db/db_memtable_test.cc  340
-rw-r--r--  src/rocksdb/db/db_merge_operand_test.cc  240
-rw-r--r--  src/rocksdb/db/db_merge_operator_test.cc  666
-rw-r--r--  src/rocksdb/db/db_options_test.cc  870
-rw-r--r--  src/rocksdb/db/db_properties_test.cc  1711
-rw-r--r--  src/rocksdb/db/db_range_del_test.cc  1660
-rw-r--r--  src/rocksdb/db/db_sst_test.cc  1227
-rw-r--r--  src/rocksdb/db/db_statistics_test.cc  149
-rw-r--r--  src/rocksdb/db/db_table_properties_test.cc  336
-rw-r--r--  src/rocksdb/db/db_tailing_iter_test.cc  547
-rw-r--r--  src/rocksdb/db/db_test.cc  6605
-rw-r--r--  src/rocksdb/db/db_test2.cc  4695
-rw-r--r--  src/rocksdb/db/db_test_util.cc  1564
-rw-r--r--  src/rocksdb/db/db_test_util.h  1000
-rw-r--r--  src/rocksdb/db/db_universal_compaction_test.cc  2254
-rw-r--r--  src/rocksdb/db/db_wal_test.cc  1586
-rw-r--r--  src/rocksdb/db/db_write_test.cc  329
-rw-r--r--  src/rocksdb/db/dbformat.cc  197
-rw-r--r--  src/rocksdb/db/dbformat.h  671
-rw-r--r--  src/rocksdb/db/dbformat_test.cc  207
-rw-r--r--  src/rocksdb/db/deletefile_test.cc  571
-rw-r--r--  src/rocksdb/db/error_handler.cc  344
-rw-r--r--  src/rocksdb/db/error_handler.h  75
-rw-r--r--  src/rocksdb/db/error_handler_test.cc  871
-rw-r--r--  src/rocksdb/db/event_helpers.cc  223
-rw-r--r--  src/rocksdb/db/event_helpers.h  55
-rw-r--r--  src/rocksdb/db/experimental.cc  50
-rw-r--r--  src/rocksdb/db/external_sst_file_basic_test.cc  1128
-rw-r--r--  src/rocksdb/db/external_sst_file_ingestion_job.cc  731
-rw-r--r--  src/rocksdb/db/external_sst_file_ingestion_job.h  180
-rw-r--r--  src/rocksdb/db/external_sst_file_test.cc  2832
-rw-r--r--  src/rocksdb/db/fault_injection_test.cc  555
-rw-r--r--  src/rocksdb/db/file_indexer.cc  216
-rw-r--r--  src/rocksdb/db/file_indexer.h  142
-rw-r--r--  src/rocksdb/db/file_indexer_test.cc  350
-rw-r--r--  src/rocksdb/db/filename_test.cc  180
-rw-r--r--  src/rocksdb/db/flush_job.cc  466
-rw-r--r--  src/rocksdb/db/flush_job.h  158
-rw-r--r--  src/rocksdb/db/flush_job_test.cc  498
-rw-r--r--  src/rocksdb/db/flush_scheduler.cc  86
-rw-r--r--  src/rocksdb/db/flush_scheduler.h  54
-rw-r--r--  src/rocksdb/db/forward_iterator.cc  975
-rw-r--r--  src/rocksdb/db/forward_iterator.h  160
-rw-r--r--  src/rocksdb/db/forward_iterator_bench.cc  377
-rw-r--r--  src/rocksdb/db/import_column_family_job.cc  276
-rw-r--r--  src/rocksdb/db/import_column_family_job.h  72
-rw-r--r--  src/rocksdb/db/import_column_family_test.cc  567
-rw-r--r--  src/rocksdb/db/internal_stats.cc  1424
-rw-r--r--  src/rocksdb/db/internal_stats.h  697
-rw-r--r--  src/rocksdb/db/job_context.h  219
-rw-r--r--  src/rocksdb/db/listener_test.cc  1042
-rw-r--r--  src/rocksdb/db/log_format.h  48
-rw-r--r--  src/rocksdb/db/log_reader.cc  624
-rw-r--r--  src/rocksdb/db/log_reader.h  189
-rw-r--r--  src/rocksdb/db/log_test.cc  928
-rw-r--r--  src/rocksdb/db/log_writer.cc  162
-rw-r--r--  src/rocksdb/db/log_writer.h  114
-rw-r--r--  src/rocksdb/db/logs_with_prep_tracker.cc  67
-rw-r--r--  src/rocksdb/db/logs_with_prep_tracker.h  63
-rw-r--r--  src/rocksdb/db/lookup_key.h  66
-rw-r--r--  src/rocksdb/db/malloc_stats.cc  54
-rw-r--r--  src/rocksdb/db/malloc_stats.h  24
-rw-r--r--  src/rocksdb/db/manual_compaction_test.cc  160
-rw-r--r--  src/rocksdb/db/memtable.cc  1122
-rw-r--r--  src/rocksdb/db/memtable.h  542
-rw-r--r--  src/rocksdb/db/memtable_list.cc  771
-rw-r--r--  src/rocksdb/db/memtable_list.h  422
-rw-r--r--  src/rocksdb/db/memtable_list_test.cc  922
-rw-r--r--  src/rocksdb/db/merge_context.h  134
-rw-r--r--  src/rocksdb/db/merge_helper.cc  417
-rw-r--r--  src/rocksdb/db/merge_helper.h  194
-rw-r--r--  src/rocksdb/db/merge_helper_test.cc  290
-rw-r--r--  src/rocksdb/db/merge_operator.cc  86
-rw-r--r--  src/rocksdb/db/merge_test.cc  504
-rw-r--r--  src/rocksdb/db/obsolete_files_test.cc  222
-rw-r--r--  src/rocksdb/db/options_file_test.cc  119
-rw-r--r--  src/rocksdb/db/perf_context_test.cc  981
-rw-r--r--  src/rocksdb/db/pinned_iterators_manager.h  87
-rw-r--r--  src/rocksdb/db/plain_table_db_test.cc  1375
-rw-r--r--  src/rocksdb/db/pre_release_callback.h  38
-rw-r--r--  src/rocksdb/db/prefix_test.cc  895
-rw-r--r--  src/rocksdb/db/range_del_aggregator.cc  484
-rw-r--r--  src/rocksdb/db/range_del_aggregator.h  441
-rw-r--r--  src/rocksdb/db/range_del_aggregator_bench.cc  260
-rw-r--r--  src/rocksdb/db/range_del_aggregator_test.cc  709
-rw-r--r--  src/rocksdb/db/range_tombstone_fragmenter.cc  439
-rw-r--r--  src/rocksdb/db/range_tombstone_fragmenter.h  256
-rw-r--r--  src/rocksdb/db/range_tombstone_fragmenter_test.cc  552
-rw-r--r--  src/rocksdb/db/read_callback.h  53
-rw-r--r--  src/rocksdb/db/repair.cc  691
-rw-r--r--  src/rocksdb/db/repair_test.cc  369
-rw-r--r--  src/rocksdb/db/snapshot_checker.h  61
-rw-r--r--  src/rocksdb/db/snapshot_impl.cc  26
-rw-r--r--  src/rocksdb/db/snapshot_impl.h  167
-rw-r--r--  src/rocksdb/db/table_cache.cc  668
-rw-r--r--  src/rocksdb/db/table_cache.h  226
-rw-r--r--  src/rocksdb/db/table_properties_collector.cc  74
-rw-r--r--  src/rocksdb/db/table_properties_collector.h  107
-rw-r--r--  src/rocksdb/db/table_properties_collector_test.cc  515
-rw-r--r--  src/rocksdb/db/transaction_log_impl.cc  315
-rw-r--r--  src/rocksdb/db/transaction_log_impl.h  127
-rw-r--r--  src/rocksdb/db/trim_history_scheduler.cc  54
-rw-r--r--  src/rocksdb/db/trim_history_scheduler.h  44
-rw-r--r--  src/rocksdb/db/version_builder.cc  545
-rw-r--r--  src/rocksdb/db/version_builder.h  48
-rw-r--r--  src/rocksdb/db/version_builder_test.cc  349
-rw-r--r--  src/rocksdb/db/version_edit.cc  826
-rw-r--r--  src/rocksdb/db/version_edit.h  438
-rw-r--r--  src/rocksdb/db/version_edit_test.cc  286
-rw-r--r--  src/rocksdb/db/version_set.cc  6005
-rw-r--r--  src/rocksdb/db/version_set.h  1251
-rw-r--r--  src/rocksdb/db/version_set_test.cc  1287
-rw-r--r--  src/rocksdb/db/wal_manager.cc  510
-rw-r--r--  src/rocksdb/db/wal_manager.h  114
-rw-r--r--  src/rocksdb/db/wal_manager_test.cc  338
-rw-r--r--  src/rocksdb/db/write_batch.cc  2092
-rw-r--r--  src/rocksdb/db/write_batch_base.cc  94
-rw-r--r--  src/rocksdb/db/write_batch_internal.h  250
-rw-r--r--  src/rocksdb/db/write_batch_test.cc  888
-rw-r--r--  src/rocksdb/db/write_callback.h  27
-rw-r--r--  src/rocksdb/db/write_callback_test.cc  452
-rw-r--r--  src/rocksdb/db/write_controller.cc  128
-rw-r--r--  src/rocksdb/db/write_controller.h  144
-rw-r--r--  src/rocksdb/db/write_controller_test.cc  135
-rw-r--r--  src/rocksdb/db/write_thread.cc  777
-rw-r--r--  src/rocksdb/db/write_thread.h  431
195 files changed, 144976 insertions(+), 0 deletions(-)
diff --git a/src/rocksdb/db/arena_wrapped_db_iter.cc b/src/rocksdb/db/arena_wrapped_db_iter.cc
new file mode 100644
index 000000000..f43282a75
--- /dev/null
+++ b/src/rocksdb/db/arena_wrapped_db_iter.cc
@@ -0,0 +1,106 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/arena_wrapped_db_iter.h"
+#include "memory/arena.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+#include "util/user_comparator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status ArenaWrappedDBIter::GetProperty(std::string prop_name,
+ std::string* prop) {
+ if (prop_name == "rocksdb.iterator.super-version-number") {
+ // First try to pass the value returned from inner iterator.
+ if (!db_iter_->GetProperty(prop_name, prop).ok()) {
+ *prop = ToString(sv_number_);
+ }
+ return Status::OK();
+ }
+ return db_iter_->GetProperty(prop_name, prop);
+}
+
+void ArenaWrappedDBIter::Init(Env* env, const ReadOptions& read_options,
+ const ImmutableCFOptions& cf_options,
+ const MutableCFOptions& mutable_cf_options,
+ const SequenceNumber& sequence,
+ uint64_t max_sequential_skip_in_iteration,
+ uint64_t version_number,
+ ReadCallback* read_callback, DBImpl* db_impl,
+ ColumnFamilyData* cfd, bool allow_blob,
+ bool allow_refresh) {
+ auto mem = arena_.AllocateAligned(sizeof(DBIter));
+ db_iter_ = new (mem) DBIter(env, read_options, cf_options, mutable_cf_options,
+ cf_options.user_comparator, nullptr, sequence,
+ true, max_sequential_skip_in_iteration,
+ read_callback, db_impl, cfd, allow_blob);
+ sv_number_ = version_number;
+ allow_refresh_ = allow_refresh;
+}
+
+Status ArenaWrappedDBIter::Refresh() {
+ if (cfd_ == nullptr || db_impl_ == nullptr || !allow_refresh_) {
+ return Status::NotSupported("Creating renew iterator is not allowed.");
+ }
+ assert(db_iter_ != nullptr);
+ // TODO(yiwu): For last_seq_same_as_publish_seq_==false, this is not the
+ // correct behavior. Will be corrected automatically when we take a snapshot
+ // here for the case of WritePreparedTxnDB.
+ SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber();
+ uint64_t cur_sv_number = cfd_->GetSuperVersionNumber();
+ if (sv_number_ != cur_sv_number) {
+ Env* env = db_iter_->env();
+ db_iter_->~DBIter();
+ arena_.~Arena();
+ new (&arena_) Arena();
+
+ SuperVersion* sv = cfd_->GetReferencedSuperVersion(db_impl_);
+ if (read_callback_) {
+ read_callback_->Refresh(latest_seq);
+ }
+ Init(env, read_options_, *(cfd_->ioptions()), sv->mutable_cf_options,
+ latest_seq, sv->mutable_cf_options.max_sequential_skip_in_iterations,
+ cur_sv_number, read_callback_, db_impl_, cfd_, allow_blob_,
+ allow_refresh_);
+
+ InternalIterator* internal_iter = db_impl_->NewInternalIterator(
+ read_options_, cfd_, sv, &arena_, db_iter_->GetRangeDelAggregator(),
+ latest_seq);
+ SetIterUnderDBIter(internal_iter);
+ } else {
+ db_iter_->set_sequence(latest_seq);
+ db_iter_->set_valid(false);
+ }
+ return Status::OK();
+}
+
+ArenaWrappedDBIter* NewArenaWrappedDbIterator(
+ Env* env, const ReadOptions& read_options,
+ const ImmutableCFOptions& cf_options,
+ const MutableCFOptions& mutable_cf_options, const SequenceNumber& sequence,
+ uint64_t max_sequential_skip_in_iterations, uint64_t version_number,
+ ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd,
+ bool allow_blob, bool allow_refresh) {
+ ArenaWrappedDBIter* iter = new ArenaWrappedDBIter();
+ iter->Init(env, read_options, cf_options, mutable_cf_options, sequence,
+ max_sequential_skip_in_iterations, version_number, read_callback,
+ db_impl, cfd, allow_blob, allow_refresh);
+ if (db_impl != nullptr && cfd != nullptr && allow_refresh) {
+ iter->StoreRefreshInfo(read_options, db_impl, cfd, read_callback,
+ allow_blob);
+ }
+
+ return iter;
+}
+
+} // namespace ROCKSDB_NAMESPACE
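
The GetProperty() override above answers "rocksdb.iterator.super-version-number" even when the inner DBIter cannot, by falling back to the cached sv_number_. A minimal caller-side sketch, using only the public Iterator interface; the DB pointer and function name below are illustrative, not part of the diff:

    // Sketch: reading the super-version number exposed by ArenaWrappedDBIter
    // through the public Iterator::GetProperty() interface.
    #include <iostream>
    #include <memory>
    #include <string>
    #include "rocksdb/db.h"

    void PrintSuperVersionNumber(rocksdb::DB* db) {
      std::unique_ptr<rocksdb::Iterator> it(
          db->NewIterator(rocksdb::ReadOptions()));
      std::string sv;
      // The property name matches the one handled in GetProperty() above.
      rocksdb::Status s =
          it->GetProperty("rocksdb.iterator.super-version-number", &sv);
      if (s.ok()) {
        std::cout << "super-version-number: " << sv << std::endl;
      }
    }
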
diff --git a/src/rocksdb/db/arena_wrapped_db_iter.h b/src/rocksdb/db/arena_wrapped_db_iter.h
new file mode 100644
index 000000000..0c135f857
--- /dev/null
+++ b/src/rocksdb/db/arena_wrapped_db_iter.h
@@ -0,0 +1,112 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+#include <string>
+#include "db/db_impl/db_impl.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/range_del_aggregator.h"
+#include "memory/arena.h"
+#include "options/cf_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+
+// A wrapper iterator which wraps DB Iterator and the arena, with which the DB
+// iterator is supposed to be allocated. This class is used as an entry point of
+// an iterator hierarchy whose memory can be allocated inline. In that way,
+// accessing the iterator tree can be more cache friendly. It is also faster
+// to allocate.
+// When using the class's Iterator interface, the behavior is exactly
+// the same as the inner DBIter.
+class ArenaWrappedDBIter : public Iterator {
+ public:
+ virtual ~ArenaWrappedDBIter() { db_iter_->~DBIter(); }
+
+ // Get the arena to be used to allocate memory for DBIter to be wrapped,
+ // as well as child iterators in it.
+ virtual Arena* GetArena() { return &arena_; }
+ virtual ReadRangeDelAggregator* GetRangeDelAggregator() {
+ return db_iter_->GetRangeDelAggregator();
+ }
+
+ // Set the internal iterator wrapped inside the DB Iterator. Usually it is
+ // a merging iterator.
+ virtual void SetIterUnderDBIter(InternalIterator* iter) {
+ db_iter_->SetIter(iter);
+ }
+
+ bool Valid() const override { return db_iter_->Valid(); }
+ void SeekToFirst() override { db_iter_->SeekToFirst(); }
+ void SeekToLast() override { db_iter_->SeekToLast(); }
+ void Seek(const Slice& target) override { db_iter_->Seek(target); }
+ void SeekForPrev(const Slice& target) override {
+ db_iter_->SeekForPrev(target);
+ }
+ void Next() override { db_iter_->Next(); }
+ void Prev() override { db_iter_->Prev(); }
+ Slice key() const override { return db_iter_->key(); }
+ Slice value() const override { return db_iter_->value(); }
+ Status status() const override { return db_iter_->status(); }
+ bool IsBlob() const { return db_iter_->IsBlob(); }
+
+ Status GetProperty(std::string prop_name, std::string* prop) override;
+
+ Status Refresh() override;
+
+ void Init(Env* env, const ReadOptions& read_options,
+ const ImmutableCFOptions& cf_options,
+ const MutableCFOptions& mutable_cf_options,
+ const SequenceNumber& sequence,
+ uint64_t max_sequential_skip_in_iterations, uint64_t version_number,
+ ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd,
+ bool allow_blob, bool allow_refresh);
+
+ // Store some parameters so we can refresh the iterator at a later point
+ // with these same params
+ void StoreRefreshInfo(const ReadOptions& read_options, DBImpl* db_impl,
+ ColumnFamilyData* cfd, ReadCallback* read_callback,
+ bool allow_blob) {
+ read_options_ = read_options;
+ db_impl_ = db_impl;
+ cfd_ = cfd;
+ read_callback_ = read_callback;
+ allow_blob_ = allow_blob;
+ }
+
+ private:
+ DBIter* db_iter_;
+ Arena arena_;
+ uint64_t sv_number_;
+ ColumnFamilyData* cfd_ = nullptr;
+ DBImpl* db_impl_ = nullptr;
+ ReadOptions read_options_;
+ ReadCallback* read_callback_;
+ bool allow_blob_ = false;
+ bool allow_refresh_ = true;
+};
+
+// Generate an arena-wrapped DB iterator.
+// `db_impl` and `cfd` are used for renewal. If left null, renewal will not
+// be supported.
+extern ArenaWrappedDBIter* NewArenaWrappedDbIterator(
+ Env* env, const ReadOptions& read_options,
+ const ImmutableCFOptions& cf_options,
+ const MutableCFOptions& mutable_cf_options, const SequenceNumber& sequence,
+ uint64_t max_sequential_skip_in_iterations, uint64_t version_number,
+ ReadCallback* read_callback, DBImpl* db_impl = nullptr,
+ ColumnFamilyData* cfd = nullptr, bool allow_blob = false,
+ bool allow_refresh = true);
+} // namespace ROCKSDB_NAMESPACE
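
Init() above placement-constructs the DBIter inside the arena (AllocateAligned() followed by placement new), and the destructor invokes ~DBIter() by hand because the arena, not delete, owns the memory. A minimal sketch of that allocation pattern, with a hypothetical Payload type standing in for DBIter:

    // Sketch of the arena placement-new pattern used by ArenaWrappedDBIter::Init()
    // and its destructor. Payload is a hypothetical stand-in type.
    #include <new>
    #include "memory/arena.h"  // internal RocksDB header, as used in the diff

    struct Payload {
      explicit Payload(int v) : value(v) {}
      int value;
    };

    void ArenaPlacementExample() {
      ROCKSDB_NAMESPACE::Arena arena;
      // Allocate raw, aligned memory from the arena...
      void* mem = arena.AllocateAligned(sizeof(Payload));
      // ...construct the object in place...
      Payload* p = new (mem) Payload(42);
      // ...and destroy it explicitly; the arena reclaims the memory when it
      // goes out of scope, so there is no delete.
      p->~Payload();
    }
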
diff --git a/src/rocksdb/db/blob_index.h b/src/rocksdb/db/blob_index.h
new file mode 100644
index 000000000..483a7b97b
--- /dev/null
+++ b/src/rocksdb/db/blob_index.h
@@ -0,0 +1,179 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <sstream>
+#include <string>
+
+#include "rocksdb/options.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// BlobIndex is a pointer to the blob and metadata of the blob. The index is
+// stored in base DB as ValueType::kTypeBlobIndex.
+// There are three types of blob index:
+//
+// kInlinedTTL:
+// +------+------------+---------------+
+// | type | expiration | value |
+// +------+------------+---------------+
+// | char | varint64 | variable size |
+// +------+------------+---------------+
+//
+// kBlob:
+// +------+-------------+----------+----------+-------------+
+// | type | file number | offset | size | compression |
+// +------+-------------+----------+----------+-------------+
+// | char | varint64 | varint64 | varint64 | char |
+// +------+-------------+----------+----------+-------------+
+//
+// kBlobTTL:
+// +------+------------+-------------+----------+----------+-------------+
+// | type | expiration | file number | offset | size | compression |
+// +------+------------+-------------+----------+----------+-------------+
+// | char | varint64 | varint64 | varint64 | varint64 | char |
+// +------+------------+-------------+----------+----------+-------------+
+//
+// There isn't a kInlined (without TTL) type since we can store it as a plain
+// value (i.e. ValueType::kTypeValue).
+class BlobIndex {
+ public:
+ enum class Type : unsigned char {
+ kInlinedTTL = 0,
+ kBlob = 1,
+ kBlobTTL = 2,
+ kUnknown = 3,
+ };
+
+ BlobIndex() : type_(Type::kUnknown) {}
+
+ bool IsInlined() const { return type_ == Type::kInlinedTTL; }
+
+ bool HasTTL() const {
+ return type_ == Type::kInlinedTTL || type_ == Type::kBlobTTL;
+ }
+
+ uint64_t expiration() const {
+ assert(HasTTL());
+ return expiration_;
+ }
+
+ const Slice& value() const {
+ assert(IsInlined());
+ return value_;
+ }
+
+ uint64_t file_number() const {
+ assert(!IsInlined());
+ return file_number_;
+ }
+
+ uint64_t offset() const {
+ assert(!IsInlined());
+ return offset_;
+ }
+
+ uint64_t size() const {
+ assert(!IsInlined());
+ return size_;
+ }
+
+ Status DecodeFrom(Slice slice) {
+ static const std::string kErrorMessage = "Error while decoding blob index";
+ assert(slice.size() > 0);
+ type_ = static_cast<Type>(*slice.data());
+ if (type_ >= Type::kUnknown) {
+ return Status::Corruption(
+ kErrorMessage,
+ "Unknown blob index type: " + ToString(static_cast<char>(type_)));
+ }
+ slice = Slice(slice.data() + 1, slice.size() - 1);
+ if (HasTTL()) {
+ if (!GetVarint64(&slice, &expiration_)) {
+ return Status::Corruption(kErrorMessage, "Corrupted expiration");
+ }
+ }
+ if (IsInlined()) {
+ value_ = slice;
+ } else {
+ if (GetVarint64(&slice, &file_number_) && GetVarint64(&slice, &offset_) &&
+ GetVarint64(&slice, &size_) && slice.size() == 1) {
+ compression_ = static_cast<CompressionType>(*slice.data());
+ } else {
+ return Status::Corruption(kErrorMessage, "Corrupted blob offset");
+ }
+ }
+ return Status::OK();
+ }
+
+ std::string DebugString(bool output_hex) const {
+ std::ostringstream oss;
+
+ if (IsInlined()) {
+ oss << "[inlined blob] value:" << value_.ToString(output_hex);
+ } else {
+ oss << "[blob ref] file:" << file_number_ << " offset:" << offset_
+ << " size:" << size_;
+ }
+
+ if (HasTTL()) {
+ oss << " exp:" << expiration_;
+ }
+
+ return oss.str();
+ }
+
+ static void EncodeInlinedTTL(std::string* dst, uint64_t expiration,
+ const Slice& value) {
+ assert(dst != nullptr);
+ dst->clear();
+ dst->reserve(1 + kMaxVarint64Length + value.size());
+ dst->push_back(static_cast<char>(Type::kInlinedTTL));
+ PutVarint64(dst, expiration);
+ dst->append(value.data(), value.size());
+ }
+
+ static void EncodeBlob(std::string* dst, uint64_t file_number,
+ uint64_t offset, uint64_t size,
+ CompressionType compression) {
+ assert(dst != nullptr);
+ dst->clear();
+ dst->reserve(kMaxVarint64Length * 3 + 2);
+ dst->push_back(static_cast<char>(Type::kBlob));
+ PutVarint64(dst, file_number);
+ PutVarint64(dst, offset);
+ PutVarint64(dst, size);
+ dst->push_back(static_cast<char>(compression));
+ }
+
+ static void EncodeBlobTTL(std::string* dst, uint64_t expiration,
+ uint64_t file_number, uint64_t offset,
+ uint64_t size, CompressionType compression) {
+ assert(dst != nullptr);
+ dst->clear();
+ dst->reserve(kMaxVarint64Length * 4 + 2);
+ dst->push_back(static_cast<char>(Type::kBlobTTL));
+ PutVarint64(dst, expiration);
+ PutVarint64(dst, file_number);
+ PutVarint64(dst, offset);
+ PutVarint64(dst, size);
+ dst->push_back(static_cast<char>(compression));
+ }
+
+ private:
+ Type type_ = Type::kUnknown;
+ uint64_t expiration_ = 0;
+ Slice value_;
+ uint64_t file_number_ = 0;
+ uint64_t offset_ = 0;
+ uint64_t size_ = 0;
+ CompressionType compression_ = kNoCompression;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
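
A small round-trip sketch for the kBlob layout documented above, built only from the EncodeBlob()/DecodeFrom() helpers in this header; the numeric values are illustrative:

    // Sketch: encode a kBlob index and decode it back (internal header API).
    #include <cassert>
    #include <string>
    #include "db/blob_index.h"

    void BlobIndexRoundTrip() {
      std::string encoded;
      ROCKSDB_NAMESPACE::BlobIndex::EncodeBlob(
          &encoded, /*file_number=*/7, /*offset=*/4096, /*size=*/128,
          ROCKSDB_NAMESPACE::kNoCompression);

      ROCKSDB_NAMESPACE::BlobIndex index;
      ROCKSDB_NAMESPACE::Status s =
          index.DecodeFrom(ROCKSDB_NAMESPACE::Slice(encoded));
      assert(s.ok());
      assert(!index.IsInlined() && !index.HasTTL());
      assert(index.file_number() == 7 && index.offset() == 4096 &&
             index.size() == 128);
    }
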
diff --git a/src/rocksdb/db/builder.cc b/src/rocksdb/db/builder.cc
new file mode 100644
index 000000000..fdb814cbb
--- /dev/null
+++ b/src/rocksdb/db/builder.cc
@@ -0,0 +1,263 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/builder.h"
+
+#include <algorithm>
+#include <deque>
+#include <vector>
+
+#include "db/compaction/compaction_iterator.h"
+#include "db/dbformat.h"
+#include "db/event_helpers.h"
+#include "db/internal_stats.h"
+#include "db/merge_helper.h"
+#include "db/range_del_aggregator.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/format.h"
+#include "table/internal_iterator.h"
+#include "test_util/sync_point.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TableFactory;
+
+TableBuilder* NewTableBuilder(
+ const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions,
+ const InternalKeyComparator& internal_comparator,
+ const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+ int_tbl_prop_collector_factories,
+ uint32_t column_family_id, const std::string& column_family_name,
+ WritableFileWriter* file, const CompressionType compression_type,
+ uint64_t sample_for_compression, const CompressionOptions& compression_opts,
+ int level, const bool skip_filters, const uint64_t creation_time,
+ const uint64_t oldest_key_time, const uint64_t target_file_size,
+ const uint64_t file_creation_time) {
+ assert((column_family_id ==
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
+ column_family_name.empty());
+ return ioptions.table_factory->NewTableBuilder(
+ TableBuilderOptions(ioptions, moptions, internal_comparator,
+ int_tbl_prop_collector_factories, compression_type,
+ sample_for_compression, compression_opts,
+ skip_filters, column_family_name, level,
+ creation_time, oldest_key_time, target_file_size,
+ file_creation_time),
+ column_family_id, file);
+}
+
+Status BuildTable(
+ const std::string& dbname, Env* env, FileSystem* fs,
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options, const FileOptions& file_options,
+ TableCache* table_cache, InternalIterator* iter,
+ std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ range_del_iters,
+ FileMetaData* meta, const InternalKeyComparator& internal_comparator,
+ const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+ int_tbl_prop_collector_factories,
+ uint32_t column_family_id, const std::string& column_family_name,
+ std::vector<SequenceNumber> snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SnapshotChecker* snapshot_checker, const CompressionType compression,
+ uint64_t sample_for_compression, const CompressionOptions& compression_opts,
+ bool paranoid_file_checks, InternalStats* internal_stats,
+ TableFileCreationReason reason, EventLogger* event_logger, int job_id,
+ const Env::IOPriority io_priority, TableProperties* table_properties,
+ int level, const uint64_t creation_time, const uint64_t oldest_key_time,
+ Env::WriteLifeTimeHint write_hint, const uint64_t file_creation_time) {
+ assert((column_family_id ==
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
+ column_family_name.empty());
+ // Report flush IOStats once for every kReportFlushIOStatsEvery bytes written.
+ const size_t kReportFlushIOStatsEvery = 1048576;
+ Status s;
+ meta->fd.file_size = 0;
+ iter->SeekToFirst();
+ std::unique_ptr<CompactionRangeDelAggregator> range_del_agg(
+ new CompactionRangeDelAggregator(&internal_comparator, snapshots));
+ for (auto& range_del_iter : range_del_iters) {
+ range_del_agg->AddTombstones(std::move(range_del_iter));
+ }
+
+ std::string fname = TableFileName(ioptions.cf_paths, meta->fd.GetNumber(),
+ meta->fd.GetPathId());
+#ifndef ROCKSDB_LITE
+ EventHelpers::NotifyTableFileCreationStarted(
+ ioptions.listeners, dbname, column_family_name, fname, job_id, reason);
+#endif // !ROCKSDB_LITE
+ TableProperties tp;
+
+ if (iter->Valid() || !range_del_agg->IsEmpty()) {
+ TableBuilder* builder;
+ std::unique_ptr<WritableFileWriter> file_writer;
+ // Currently we only enable dictionary compression during compaction to the
+ // bottommost level.
+ CompressionOptions compression_opts_for_flush(compression_opts);
+ compression_opts_for_flush.max_dict_bytes = 0;
+ compression_opts_for_flush.zstd_max_train_bytes = 0;
+ {
+ std::unique_ptr<FSWritableFile> file;
+#ifndef NDEBUG
+ bool use_direct_writes = file_options.use_direct_writes;
+ TEST_SYNC_POINT_CALLBACK("BuildTable:create_file", &use_direct_writes);
+#endif // !NDEBUG
+ s = NewWritableFile(fs, fname, &file, file_options);
+ if (!s.ok()) {
+ EventHelpers::LogAndNotifyTableFileCreationFinished(
+ event_logger, ioptions.listeners, dbname, column_family_name, fname,
+ job_id, meta->fd, kInvalidBlobFileNumber, tp, reason, s);
+ return s;
+ }
+ file->SetIOPriority(io_priority);
+ file->SetWriteLifeTimeHint(write_hint);
+
+ file_writer.reset(new WritableFileWriter(
+ std::move(file), fname, file_options, env, ioptions.statistics,
+ ioptions.listeners, ioptions.sst_file_checksum_func));
+
+ builder = NewTableBuilder(
+ ioptions, mutable_cf_options, internal_comparator,
+ int_tbl_prop_collector_factories, column_family_id,
+ column_family_name, file_writer.get(), compression,
+ sample_for_compression, compression_opts_for_flush, level,
+ false /* skip_filters */, creation_time, oldest_key_time,
+ 0 /*target_file_size*/, file_creation_time);
+ }
+
+ MergeHelper merge(env, internal_comparator.user_comparator(),
+ ioptions.merge_operator, nullptr, ioptions.info_log,
+ true /* internal key corruption is not ok */,
+ snapshots.empty() ? 0 : snapshots.back(),
+ snapshot_checker);
+
+ CompactionIterator c_iter(
+ iter, internal_comparator.user_comparator(), &merge, kMaxSequenceNumber,
+ &snapshots, earliest_write_conflict_snapshot, snapshot_checker, env,
+ ShouldReportDetailedTime(env, ioptions.statistics),
+ true /* internal key corruption is not ok */, range_del_agg.get());
+ c_iter.SeekToFirst();
+ for (; c_iter.Valid(); c_iter.Next()) {
+ const Slice& key = c_iter.key();
+ const Slice& value = c_iter.value();
+ const ParsedInternalKey& ikey = c_iter.ikey();
+ builder->Add(key, value);
+ meta->UpdateBoundaries(key, value, ikey.sequence, ikey.type);
+
+ // TODO(noetzli): Update stats after flush, too.
+ if (io_priority == Env::IO_HIGH &&
+ IOSTATS(bytes_written) >= kReportFlushIOStatsEvery) {
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written));
+ }
+ }
+
+ auto range_del_it = range_del_agg->NewIterator();
+ for (range_del_it->SeekToFirst(); range_del_it->Valid();
+ range_del_it->Next()) {
+ auto tombstone = range_del_it->Tombstone();
+ auto kv = tombstone.Serialize();
+ builder->Add(kv.first.Encode(), kv.second);
+ meta->UpdateBoundariesForRange(kv.first, tombstone.SerializeEndKey(),
+ tombstone.seq_, internal_comparator);
+ }
+
+ // Finish and check for builder errors
+ tp = builder->GetTableProperties();
+ bool empty = builder->NumEntries() == 0 && tp.num_range_deletions == 0;
+ s = c_iter.status();
+ if (!s.ok() || empty) {
+ builder->Abandon();
+ } else {
+ s = builder->Finish();
+ }
+
+ if (s.ok() && !empty) {
+ uint64_t file_size = builder->FileSize();
+ meta->fd.file_size = file_size;
+ meta->marked_for_compaction = builder->NeedCompact();
+ assert(meta->fd.GetFileSize() > 0);
+ tp = builder->GetTableProperties(); // refresh now that builder is finished
+ if (table_properties) {
+ *table_properties = tp;
+ }
+ // Add the checksum information to file metadata.
+ meta->file_checksum = builder->GetFileChecksum();
+ meta->file_checksum_func_name = builder->GetFileChecksumFuncName();
+ }
+ delete builder;
+
+ // Finish and check for file errors
+ if (s.ok() && !empty) {
+ StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS);
+ s = file_writer->Sync(ioptions.use_fsync);
+ }
+ if (s.ok() && !empty) {
+ s = file_writer->Close();
+ }
+
+ if (s.ok() && !empty) {
+ // Verify that the table is usable
+ // We set for_compaction to false and don't OptimizeForCompactionTableRead
+ // here because this is a special case after we finish the table building.
+ // No matter whether use_direct_io_for_flush_and_compaction is true,
+ // we regard this verification as user reads, since the goal is
+ // to cache the table here for further user reads.
+ std::unique_ptr<InternalIterator> it(table_cache->NewIterator(
+ ReadOptions(), file_options, internal_comparator, *meta,
+ nullptr /* range_del_agg */,
+ mutable_cf_options.prefix_extractor.get(), nullptr,
+ (internal_stats == nullptr) ? nullptr
+ : internal_stats->GetFileReadHist(0),
+ TableReaderCaller::kFlush, /*arena=*/nullptr,
+ /*skip_filter=*/false, level, /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key*/ nullptr));
+ s = it->status();
+ if (s.ok() && paranoid_file_checks) {
+ for (it->SeekToFirst(); it->Valid(); it->Next()) {
+ }
+ s = it->status();
+ }
+ }
+ }
+
+ // Check for input iterator errors
+ if (!iter->status().ok()) {
+ s = iter->status();
+ }
+
+ if (!s.ok() || meta->fd.GetFileSize() == 0) {
+ fs->DeleteFile(fname, IOOptions(), nullptr);
+ }
+
+ if (meta->fd.GetFileSize() == 0) {
+ fname = "(nil)";
+ }
+ // Output to event logger and fire events.
+ EventHelpers::LogAndNotifyTableFileCreationFinished(
+ event_logger, ioptions.listeners, dbname, column_family_name, fname,
+ job_id, meta->fd, meta->oldest_blob_file_number, tp, reason, s);
+
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
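
BuildTable() above clears max_dict_bytes and zstd_max_train_bytes so that flushes never produce dictionary-compressed files; dictionaries are only trained and used during compaction. A hedged sketch of how dictionary compression is normally requested through the public options (field names come from the public rocksdb::Options API; the sizes are illustrative):

    // Sketch: enabling dictionary compression via the public options. The
    // flush path in BuildTable() deliberately zeroes these two fields, so
    // only compactions honor them.
    #include "rocksdb/options.h"

    rocksdb::Options DictionaryCompressionOptions() {
      rocksdb::Options options;
      options.compression = rocksdb::kZSTD;
      options.compression_opts.max_dict_bytes = 16 * 1024;         // 16 KB dictionary
      options.compression_opts.zstd_max_train_bytes = 256 * 1024;  // training sample
      return options;
    }
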
diff --git a/src/rocksdb/db/builder.h b/src/rocksdb/db/builder.h
new file mode 100644
index 000000000..062f1fb80
--- /dev/null
+++ b/src/rocksdb/db/builder.h
@@ -0,0 +1,88 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+#include <string>
+#include <utility>
+#include <vector>
+#include "db/range_tombstone_fragmenter.h"
+#include "db/table_properties_collector.h"
+#include "logging/event_logger.h"
+#include "options/cf_options.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/types.h"
+#include "table/scoped_arena_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct Options;
+struct FileMetaData;
+
+class Env;
+struct EnvOptions;
+class Iterator;
+class SnapshotChecker;
+class TableCache;
+class VersionEdit;
+class TableBuilder;
+class WritableFileWriter;
+class InternalStats;
+
+// @param column_family_name Name of the column family that is also identified
+// by column_family_id, or empty string if unknown. It must outlive the
+// TableBuilder returned by this function.
+TableBuilder* NewTableBuilder(
+ const ImmutableCFOptions& options, const MutableCFOptions& moptions,
+ const InternalKeyComparator& internal_comparator,
+ const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+ int_tbl_prop_collector_factories,
+ uint32_t column_family_id, const std::string& column_family_name,
+ WritableFileWriter* file, const CompressionType compression_type,
+ const uint64_t sample_for_compression,
+ const CompressionOptions& compression_opts, int level,
+ const bool skip_filters = false, const uint64_t creation_time = 0,
+ const uint64_t oldest_key_time = 0, const uint64_t target_file_size = 0,
+ const uint64_t file_creation_time = 0);
+
+// Build a Table file from the contents of *iter. The generated file
+// will be named according to the number specified in meta. On success, the rest of
+// *meta will be filled with metadata about the generated table.
+// If no data is present in *iter, meta->file_size will be set to
+// zero, and no Table file will be produced.
+//
+// @param column_family_name Name of the column family that is also identified
+// by column_family_id, or empty string if unknown.
+extern Status BuildTable(
+ const std::string& dbname, Env* env, FileSystem* fs,
+ const ImmutableCFOptions& options,
+ const MutableCFOptions& mutable_cf_options, const FileOptions& file_options,
+ TableCache* table_cache, InternalIterator* iter,
+ std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ range_del_iters,
+ FileMetaData* meta, const InternalKeyComparator& internal_comparator,
+ const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+ int_tbl_prop_collector_factories,
+ uint32_t column_family_id, const std::string& column_family_name,
+ std::vector<SequenceNumber> snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SnapshotChecker* snapshot_checker, const CompressionType compression,
+ const uint64_t sample_for_compression,
+ const CompressionOptions& compression_opts, bool paranoid_file_checks,
+ InternalStats* internal_stats, TableFileCreationReason reason,
+ EventLogger* event_logger = nullptr, int job_id = 0,
+ const Env::IOPriority io_priority = Env::IO_HIGH,
+ TableProperties* table_properties = nullptr, int level = -1,
+ const uint64_t creation_time = 0, const uint64_t oldest_key_time = 0,
+ Env::WriteLifeTimeHint write_hint = Env::WLTH_NOT_SET,
+ const uint64_t file_creation_time = 0);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/c.cc b/src/rocksdb/db/c.cc
new file mode 100644
index 000000000..db78030df
--- /dev/null
+++ b/src/rocksdb/db/c.cc
@@ -0,0 +1,4451 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/c.h"
+
+#include <stdlib.h>
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/universal_compaction.h"
+#include "rocksdb/utilities/backupable_db.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "rocksdb/utilities/memory_util.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/perf_context.h"
+#include "utilities/merge_operators.h"
+
+#include <vector>
+#include <unordered_set>
+#include <map>
+
+using ROCKSDB_NAMESPACE::BackupableDBOptions;
+using ROCKSDB_NAMESPACE::BackupEngine;
+using ROCKSDB_NAMESPACE::BackupID;
+using ROCKSDB_NAMESPACE::BackupInfo;
+using ROCKSDB_NAMESPACE::BatchResult;
+using ROCKSDB_NAMESPACE::BlockBasedTableOptions;
+using ROCKSDB_NAMESPACE::BottommostLevelCompaction;
+using ROCKSDB_NAMESPACE::BytewiseComparator;
+using ROCKSDB_NAMESPACE::Cache;
+using ROCKSDB_NAMESPACE::Checkpoint;
+using ROCKSDB_NAMESPACE::ColumnFamilyDescriptor;
+using ROCKSDB_NAMESPACE::ColumnFamilyHandle;
+using ROCKSDB_NAMESPACE::ColumnFamilyOptions;
+using ROCKSDB_NAMESPACE::CompactionFilter;
+using ROCKSDB_NAMESPACE::CompactionFilterContext;
+using ROCKSDB_NAMESPACE::CompactionFilterFactory;
+using ROCKSDB_NAMESPACE::CompactionOptionsFIFO;
+using ROCKSDB_NAMESPACE::CompactRangeOptions;
+using ROCKSDB_NAMESPACE::Comparator;
+using ROCKSDB_NAMESPACE::CompressionType;
+using ROCKSDB_NAMESPACE::CuckooTableOptions;
+using ROCKSDB_NAMESPACE::DB;
+using ROCKSDB_NAMESPACE::DBOptions;
+using ROCKSDB_NAMESPACE::DbPath;
+using ROCKSDB_NAMESPACE::Env;
+using ROCKSDB_NAMESPACE::EnvOptions;
+using ROCKSDB_NAMESPACE::FileLock;
+using ROCKSDB_NAMESPACE::FilterPolicy;
+using ROCKSDB_NAMESPACE::FlushOptions;
+using ROCKSDB_NAMESPACE::InfoLogLevel;
+using ROCKSDB_NAMESPACE::IngestExternalFileOptions;
+using ROCKSDB_NAMESPACE::Iterator;
+using ROCKSDB_NAMESPACE::LiveFileMetaData;
+using ROCKSDB_NAMESPACE::Logger;
+using ROCKSDB_NAMESPACE::MemoryUtil;
+using ROCKSDB_NAMESPACE::MergeOperator;
+using ROCKSDB_NAMESPACE::MergeOperators;
+using ROCKSDB_NAMESPACE::NewBloomFilterPolicy;
+using ROCKSDB_NAMESPACE::NewGenericRateLimiter;
+using ROCKSDB_NAMESPACE::NewLRUCache;
+using ROCKSDB_NAMESPACE::OptimisticTransactionDB;
+using ROCKSDB_NAMESPACE::OptimisticTransactionOptions;
+using ROCKSDB_NAMESPACE::Options;
+using ROCKSDB_NAMESPACE::PerfContext;
+using ROCKSDB_NAMESPACE::PerfLevel;
+using ROCKSDB_NAMESPACE::PinnableSlice;
+using ROCKSDB_NAMESPACE::RandomAccessFile;
+using ROCKSDB_NAMESPACE::Range;
+using ROCKSDB_NAMESPACE::RateLimiter;
+using ROCKSDB_NAMESPACE::ReadOptions;
+using ROCKSDB_NAMESPACE::RestoreOptions;
+using ROCKSDB_NAMESPACE::SequentialFile;
+using ROCKSDB_NAMESPACE::Slice;
+using ROCKSDB_NAMESPACE::SliceParts;
+using ROCKSDB_NAMESPACE::SliceTransform;
+using ROCKSDB_NAMESPACE::Snapshot;
+using ROCKSDB_NAMESPACE::SstFileWriter;
+using ROCKSDB_NAMESPACE::Status;
+using ROCKSDB_NAMESPACE::Transaction;
+using ROCKSDB_NAMESPACE::TransactionDB;
+using ROCKSDB_NAMESPACE::TransactionDBOptions;
+using ROCKSDB_NAMESPACE::TransactionLogIterator;
+using ROCKSDB_NAMESPACE::TransactionOptions;
+using ROCKSDB_NAMESPACE::WALRecoveryMode;
+using ROCKSDB_NAMESPACE::WritableFile;
+using ROCKSDB_NAMESPACE::WriteBatch;
+using ROCKSDB_NAMESPACE::WriteBatchWithIndex;
+using ROCKSDB_NAMESPACE::WriteOptions;
+
+using std::shared_ptr;
+using std::vector;
+using std::unordered_set;
+using std::map;
+
+extern "C" {
+
+struct rocksdb_t { DB* rep; };
+struct rocksdb_backup_engine_t { BackupEngine* rep; };
+struct rocksdb_backup_engine_info_t { std::vector<BackupInfo> rep; };
+struct rocksdb_restore_options_t { RestoreOptions rep; };
+struct rocksdb_iterator_t { Iterator* rep; };
+struct rocksdb_writebatch_t { WriteBatch rep; };
+struct rocksdb_writebatch_wi_t { WriteBatchWithIndex* rep; };
+struct rocksdb_snapshot_t { const Snapshot* rep; };
+struct rocksdb_flushoptions_t { FlushOptions rep; };
+struct rocksdb_fifo_compaction_options_t { CompactionOptionsFIFO rep; };
+struct rocksdb_readoptions_t {
+ ReadOptions rep;
+ // stack variables to set pointers to in ReadOptions
+ Slice upper_bound;
+ Slice lower_bound;
+};
+struct rocksdb_writeoptions_t { WriteOptions rep; };
+struct rocksdb_options_t { Options rep; };
+struct rocksdb_compactoptions_t {
+ CompactRangeOptions rep;
+};
+struct rocksdb_block_based_table_options_t { BlockBasedTableOptions rep; };
+struct rocksdb_cuckoo_table_options_t { CuckooTableOptions rep; };
+struct rocksdb_seqfile_t { SequentialFile* rep; };
+struct rocksdb_randomfile_t { RandomAccessFile* rep; };
+struct rocksdb_writablefile_t { WritableFile* rep; };
+struct rocksdb_wal_iterator_t { TransactionLogIterator* rep; };
+struct rocksdb_wal_readoptions_t { TransactionLogIterator::ReadOptions rep; };
+struct rocksdb_filelock_t { FileLock* rep; };
+struct rocksdb_logger_t {
+ std::shared_ptr<Logger> rep;
+};
+struct rocksdb_cache_t {
+ std::shared_ptr<Cache> rep;
+};
+struct rocksdb_livefiles_t { std::vector<LiveFileMetaData> rep; };
+struct rocksdb_column_family_handle_t { ColumnFamilyHandle* rep; };
+struct rocksdb_envoptions_t { EnvOptions rep; };
+struct rocksdb_ingestexternalfileoptions_t { IngestExternalFileOptions rep; };
+struct rocksdb_sstfilewriter_t { SstFileWriter* rep; };
+struct rocksdb_ratelimiter_t {
+ std::shared_ptr<RateLimiter> rep;
+};
+struct rocksdb_perfcontext_t { PerfContext* rep; };
+struct rocksdb_pinnableslice_t {
+ PinnableSlice rep;
+};
+struct rocksdb_transactiondb_options_t {
+ TransactionDBOptions rep;
+};
+struct rocksdb_transactiondb_t {
+ TransactionDB* rep;
+};
+struct rocksdb_transaction_options_t {
+ TransactionOptions rep;
+};
+struct rocksdb_transaction_t {
+ Transaction* rep;
+};
+struct rocksdb_checkpoint_t {
+ Checkpoint* rep;
+};
+struct rocksdb_optimistictransactiondb_t {
+ OptimisticTransactionDB* rep;
+};
+struct rocksdb_optimistictransaction_options_t {
+ OptimisticTransactionOptions rep;
+};
+
+struct rocksdb_compactionfiltercontext_t {
+ CompactionFilter::Context rep;
+};
+
+struct rocksdb_compactionfilter_t : public CompactionFilter {
+ void* state_;
+ void (*destructor_)(void*);
+ unsigned char (*filter_)(
+ void*,
+ int level,
+ const char* key, size_t key_length,
+ const char* existing_value, size_t value_length,
+ char** new_value, size_t *new_value_length,
+ unsigned char* value_changed);
+ const char* (*name_)(void*);
+ unsigned char ignore_snapshots_;
+
+ ~rocksdb_compactionfilter_t() override { (*destructor_)(state_); }
+
+ bool Filter(int level, const Slice& key, const Slice& existing_value,
+ std::string* new_value, bool* value_changed) const override {
+ char* c_new_value = nullptr;
+ size_t new_value_length = 0;
+ unsigned char c_value_changed = 0;
+ unsigned char result = (*filter_)(
+ state_,
+ level,
+ key.data(), key.size(),
+ existing_value.data(), existing_value.size(),
+ &c_new_value, &new_value_length, &c_value_changed);
+ if (c_value_changed) {
+ new_value->assign(c_new_value, new_value_length);
+ *value_changed = true;
+ }
+ return result;
+ }
+
+ const char* Name() const override { return (*name_)(state_); }
+
+ bool IgnoreSnapshots() const override { return ignore_snapshots_; }
+};
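
The struct above adapts plain C function pointers to the CompactionFilter interface; a non-zero return from filter_ drops the key. A sketch of callbacks matching those signatures (a "drop every key with an empty value" filter); the registration call that would hand them to the C API is outside this excerpt:

    // Sketch: C-style callbacks matching the function-pointer fields of
    // rocksdb_compactionfilter_t above.
    #include <stddef.h>

    static unsigned char DropEmptyValues(void* /*state*/, int /*level*/,
                                         const char* /*key*/, size_t /*key_length*/,
                                         const char* /*existing_value*/,
                                         size_t value_length, char** /*new_value*/,
                                         size_t* /*new_value_length*/,
                                         unsigned char* value_changed) {
      *value_changed = 0;        // we never rewrite the value
      return value_length == 0;  // non-zero return means "drop this key"
    }

    static const char* DropEmptyValuesName(void* /*state*/) {
      return "drop-empty-values";
    }

    static void DropEmptyValuesDestructor(void* /*state*/) {}
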
+
+struct rocksdb_compactionfilterfactory_t : public CompactionFilterFactory {
+ void* state_;
+ void (*destructor_)(void*);
+ rocksdb_compactionfilter_t* (*create_compaction_filter_)(
+ void*, rocksdb_compactionfiltercontext_t* context);
+ const char* (*name_)(void*);
+
+ ~rocksdb_compactionfilterfactory_t() override { (*destructor_)(state_); }
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ rocksdb_compactionfiltercontext_t ccontext;
+ ccontext.rep = context;
+ CompactionFilter* cf = (*create_compaction_filter_)(state_, &ccontext);
+ return std::unique_ptr<CompactionFilter>(cf);
+ }
+
+ const char* Name() const override { return (*name_)(state_); }
+};
+
+struct rocksdb_comparator_t : public Comparator {
+ void* state_;
+ void (*destructor_)(void*);
+ int (*compare_)(
+ void*,
+ const char* a, size_t alen,
+ const char* b, size_t blen);
+ const char* (*name_)(void*);
+
+ ~rocksdb_comparator_t() override { (*destructor_)(state_); }
+
+ int Compare(const Slice& a, const Slice& b) const override {
+ return (*compare_)(state_, a.data(), a.size(), b.data(), b.size());
+ }
+
+ const char* Name() const override { return (*name_)(state_); }
+
+ // No-ops since the C binding does not support key shortening methods.
+ void FindShortestSeparator(std::string*, const Slice&) const override {}
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+struct rocksdb_filterpolicy_t : public FilterPolicy {
+ void* state_;
+ void (*destructor_)(void*);
+ const char* (*name_)(void*);
+ char* (*create_)(
+ void*,
+ const char* const* key_array, const size_t* key_length_array,
+ int num_keys,
+ size_t* filter_length);
+ unsigned char (*key_match_)(
+ void*,
+ const char* key, size_t length,
+ const char* filter, size_t filter_length);
+ void (*delete_filter_)(
+ void*,
+ const char* filter, size_t filter_length);
+
+ ~rocksdb_filterpolicy_t() override { (*destructor_)(state_); }
+
+ const char* Name() const override { return (*name_)(state_); }
+
+ void CreateFilter(const Slice* keys, int n, std::string* dst) const override {
+ std::vector<const char*> key_pointers(n);
+ std::vector<size_t> key_sizes(n);
+ for (int i = 0; i < n; i++) {
+ key_pointers[i] = keys[i].data();
+ key_sizes[i] = keys[i].size();
+ }
+ size_t len;
+ char* filter = (*create_)(state_, &key_pointers[0], &key_sizes[0], n, &len);
+ dst->append(filter, len);
+
+ if (delete_filter_ != nullptr) {
+ (*delete_filter_)(state_, filter, len);
+ } else {
+ free(filter);
+ }
+ }
+
+ bool KeyMayMatch(const Slice& key, const Slice& filter) const override {
+ return (*key_match_)(state_, key.data(), key.size(),
+ filter.data(), filter.size());
+ }
+};
+
+struct rocksdb_mergeoperator_t : public MergeOperator {
+ void* state_;
+ void (*destructor_)(void*);
+ const char* (*name_)(void*);
+ char* (*full_merge_)(
+ void*,
+ const char* key, size_t key_length,
+ const char* existing_value, size_t existing_value_length,
+ const char* const* operands_list, const size_t* operands_list_length,
+ int num_operands,
+ unsigned char* success, size_t* new_value_length);
+ char* (*partial_merge_)(void*, const char* key, size_t key_length,
+ const char* const* operands_list,
+ const size_t* operands_list_length, int num_operands,
+ unsigned char* success, size_t* new_value_length);
+ void (*delete_value_)(
+ void*,
+ const char* value, size_t value_length);
+
+ ~rocksdb_mergeoperator_t() override { (*destructor_)(state_); }
+
+ const char* Name() const override { return (*name_)(state_); }
+
+ bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override {
+ size_t n = merge_in.operand_list.size();
+ std::vector<const char*> operand_pointers(n);
+ std::vector<size_t> operand_sizes(n);
+ for (size_t i = 0; i < n; i++) {
+ Slice operand(merge_in.operand_list[i]);
+ operand_pointers[i] = operand.data();
+ operand_sizes[i] = operand.size();
+ }
+
+ const char* existing_value_data = nullptr;
+ size_t existing_value_len = 0;
+ if (merge_in.existing_value != nullptr) {
+ existing_value_data = merge_in.existing_value->data();
+ existing_value_len = merge_in.existing_value->size();
+ }
+
+ unsigned char success;
+ size_t new_value_len;
+ char* tmp_new_value = (*full_merge_)(
+ state_, merge_in.key.data(), merge_in.key.size(), existing_value_data,
+ existing_value_len, &operand_pointers[0], &operand_sizes[0],
+ static_cast<int>(n), &success, &new_value_len);
+ merge_out->new_value.assign(tmp_new_value, new_value_len);
+
+ if (delete_value_ != nullptr) {
+ (*delete_value_)(state_, tmp_new_value, new_value_len);
+ } else {
+ free(tmp_new_value);
+ }
+
+ return success;
+ }
+
+ bool PartialMergeMulti(const Slice& key,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value,
+ Logger* /*logger*/) const override {
+ size_t operand_count = operand_list.size();
+ std::vector<const char*> operand_pointers(operand_count);
+ std::vector<size_t> operand_sizes(operand_count);
+ for (size_t i = 0; i < operand_count; ++i) {
+ Slice operand(operand_list[i]);
+ operand_pointers[i] = operand.data();
+ operand_sizes[i] = operand.size();
+ }
+
+ unsigned char success;
+ size_t new_value_len;
+ char* tmp_new_value = (*partial_merge_)(
+ state_, key.data(), key.size(), &operand_pointers[0], &operand_sizes[0],
+ static_cast<int>(operand_count), &success, &new_value_len);
+ new_value->assign(tmp_new_value, new_value_len);
+
+ if (delete_value_ != nullptr) {
+ (*delete_value_)(state_, tmp_new_value, new_value_len);
+ } else {
+ free(tmp_new_value);
+ }
+
+ return success;
+ }
+};
+
+struct rocksdb_dbpath_t {
+ DbPath rep;
+};
+
+struct rocksdb_env_t {
+ Env* rep;
+ bool is_default;
+};
+
+struct rocksdb_slicetransform_t : public SliceTransform {
+ void* state_;
+ void (*destructor_)(void*);
+ const char* (*name_)(void*);
+ char* (*transform_)(
+ void*,
+ const char* key, size_t length,
+ size_t* dst_length);
+ unsigned char (*in_domain_)(
+ void*,
+ const char* key, size_t length);
+ unsigned char (*in_range_)(
+ void*,
+ const char* key, size_t length);
+
+ ~rocksdb_slicetransform_t() override { (*destructor_)(state_); }
+
+ const char* Name() const override { return (*name_)(state_); }
+
+ Slice Transform(const Slice& src) const override {
+ size_t len;
+ char* dst = (*transform_)(state_, src.data(), src.size(), &len);
+ return Slice(dst, len);
+ }
+
+ bool InDomain(const Slice& src) const override {
+ return (*in_domain_)(state_, src.data(), src.size());
+ }
+
+ bool InRange(const Slice& src) const override {
+ return (*in_range_)(state_, src.data(), src.size());
+ }
+};
+
+struct rocksdb_universal_compaction_options_t {
+ ROCKSDB_NAMESPACE::CompactionOptionsUniversal* rep;
+};
+
+static bool SaveError(char** errptr, const Status& s) {
+ assert(errptr != nullptr);
+ if (s.ok()) {
+ return false;
+ } else if (*errptr == nullptr) {
+ *errptr = strdup(s.ToString().c_str());
+ } else {
+ // TODO(sanjay): Merge with existing error?
+ // This is a bug if *errptr is not created by malloc()
+ free(*errptr);
+ *errptr = strdup(s.ToString().c_str());
+ }
+ return true;
+}
+
+static char* CopyString(const std::string& str) {
+ char* result = reinterpret_cast<char*>(malloc(sizeof(char) * str.size()));
+ memcpy(result, str.data(), sizeof(char) * str.size());
+ return result;
+}
+
+rocksdb_t* rocksdb_open(
+ const rocksdb_options_t* options,
+ const char* name,
+ char** errptr) {
+ DB* db;
+ if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) {
+ return nullptr;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
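
SaveError() above defines the C API's error convention: the caller passes a char* initialized to NULL, and on failure it comes back pointing at a strdup()'d status message the caller must free(). A caller-side sketch; rocksdb_options_create(), rocksdb_options_set_create_if_missing() and rocksdb_options_destroy() come from the public rocksdb/c.h header and are not shown in this excerpt:

    // Sketch of the caller-side errptr pattern used throughout the C API.
    #include <stdio.h>
    #include <stdlib.h>
    #include "rocksdb/c.h"

    int OpenOrReportError(const char* path) {
      char* err = NULL;
      rocksdb_options_t* options = rocksdb_options_create();
      rocksdb_options_set_create_if_missing(options, 1);

      rocksdb_t* db = rocksdb_open(options, path, &err);
      if (err != NULL) {
        fprintf(stderr, "rocksdb_open failed: %s\n", err);
        free(err);  // SaveError() allocated this with strdup()
        rocksdb_options_destroy(options);
        return 1;
      }
      rocksdb_close(db);
      rocksdb_options_destroy(options);
      return 0;
    }
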
+
+rocksdb_t* rocksdb_open_with_ttl(
+ const rocksdb_options_t* options,
+ const char* name,
+ int ttl,
+ char** errptr) {
+ ROCKSDB_NAMESPACE::DBWithTTL* db;
+ if (SaveError(errptr, ROCKSDB_NAMESPACE::DBWithTTL::Open(
+ options->rep, std::string(name), &db, ttl))) {
+ return nullptr;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
+
+rocksdb_t* rocksdb_open_for_read_only(
+ const rocksdb_options_t* options,
+ const char* name,
+ unsigned char error_if_log_file_exist,
+ char** errptr) {
+ DB* db;
+ if (SaveError(errptr, DB::OpenForReadOnly(options->rep, std::string(name), &db, error_if_log_file_exist))) {
+ return nullptr;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
+
+rocksdb_t* rocksdb_open_as_secondary(const rocksdb_options_t* options,
+ const char* name,
+ const char* secondary_path,
+ char** errptr) {
+ DB* db;
+ if (SaveError(errptr,
+ DB::OpenAsSecondary(options->rep, std::string(name),
+ std::string(secondary_path), &db))) {
+ return nullptr;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
+
+rocksdb_backup_engine_t* rocksdb_backup_engine_open(
+ const rocksdb_options_t* options, const char* path, char** errptr) {
+ BackupEngine* be;
+ if (SaveError(errptr, BackupEngine::Open(options->rep.env,
+ BackupableDBOptions(path,
+ nullptr,
+ true,
+ options->rep.info_log.get()),
+ &be))) {
+ return nullptr;
+ }
+ rocksdb_backup_engine_t* result = new rocksdb_backup_engine_t;
+ result->rep = be;
+ return result;
+}
+
+void rocksdb_backup_engine_create_new_backup(rocksdb_backup_engine_t* be,
+ rocksdb_t* db,
+ char** errptr) {
+ SaveError(errptr, be->rep->CreateNewBackup(db->rep));
+}
+
+void rocksdb_backup_engine_create_new_backup_flush(rocksdb_backup_engine_t* be,
+ rocksdb_t* db,
+ unsigned char flush_before_backup,
+ char** errptr) {
+ SaveError(errptr, be->rep->CreateNewBackup(db->rep, flush_before_backup));
+}
+
+void rocksdb_backup_engine_purge_old_backups(rocksdb_backup_engine_t* be,
+ uint32_t num_backups_to_keep,
+ char** errptr) {
+ SaveError(errptr, be->rep->PurgeOldBackups(num_backups_to_keep));
+}
+
+rocksdb_restore_options_t* rocksdb_restore_options_create() {
+ return new rocksdb_restore_options_t;
+}
+
+void rocksdb_restore_options_destroy(rocksdb_restore_options_t* opt) {
+ delete opt;
+}
+
+void rocksdb_restore_options_set_keep_log_files(rocksdb_restore_options_t* opt,
+ int v) {
+ opt->rep.keep_log_files = v;
+}
+
+void rocksdb_backup_engine_verify_backup(rocksdb_backup_engine_t* be,
+ uint32_t backup_id, char** errptr) {
+ SaveError(errptr, be->rep->VerifyBackup(static_cast<BackupID>(backup_id)));
+}
+
+void rocksdb_backup_engine_restore_db_from_latest_backup(
+ rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir,
+ const rocksdb_restore_options_t* restore_options, char** errptr) {
+ SaveError(errptr, be->rep->RestoreDBFromLatestBackup(std::string(db_dir),
+ std::string(wal_dir),
+ restore_options->rep));
+}
+
+const rocksdb_backup_engine_info_t* rocksdb_backup_engine_get_backup_info(
+ rocksdb_backup_engine_t* be) {
+ rocksdb_backup_engine_info_t* result = new rocksdb_backup_engine_info_t;
+ be->rep->GetBackupInfo(&result->rep);
+ return result;
+}
+
+int rocksdb_backup_engine_info_count(const rocksdb_backup_engine_info_t* info) {
+ return static_cast<int>(info->rep.size());
+}
+
+int64_t rocksdb_backup_engine_info_timestamp(
+ const rocksdb_backup_engine_info_t* info, int index) {
+ return info->rep[index].timestamp;
+}
+
+uint32_t rocksdb_backup_engine_info_backup_id(
+ const rocksdb_backup_engine_info_t* info, int index) {
+ return info->rep[index].backup_id;
+}
+
+uint64_t rocksdb_backup_engine_info_size(
+ const rocksdb_backup_engine_info_t* info, int index) {
+ return info->rep[index].size;
+}
+
+uint32_t rocksdb_backup_engine_info_number_files(
+ const rocksdb_backup_engine_info_t* info, int index) {
+ return info->rep[index].number_files;
+}
+
+void rocksdb_backup_engine_info_destroy(
+ const rocksdb_backup_engine_info_t* info) {
+ delete info;
+}
+
+void rocksdb_backup_engine_close(rocksdb_backup_engine_t* be) {
+ delete be->rep;
+ delete be;
+}
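+
+// Usage sketch (illustrative, reusing the options/db/err names from the
+// sketch above): back up an open database and restore the latest backup
+// into a fresh directory; the "/tmp/..." paths are placeholders.
+//
+//   rocksdb_backup_engine_t* be =
+//       rocksdb_backup_engine_open(options, "/tmp/rocksdb_backups", &err);
+//   if (err == NULL) {
+//     rocksdb_backup_engine_create_new_backup(be, db, &err);
+//     rocksdb_restore_options_t* ro = rocksdb_restore_options_create();
+//     rocksdb_backup_engine_restore_db_from_latest_backup(
+//         be, "/tmp/restored_db", "/tmp/restored_db", ro, &err);
+//     rocksdb_restore_options_destroy(ro);
+//     rocksdb_backup_engine_close(be);
+//   }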
+
+rocksdb_checkpoint_t* rocksdb_checkpoint_object_create(rocksdb_t* db,
+ char** errptr) {
+ Checkpoint* checkpoint;
+ if (SaveError(errptr, Checkpoint::Create(db->rep, &checkpoint))) {
+ return nullptr;
+ }
+ rocksdb_checkpoint_t* result = new rocksdb_checkpoint_t;
+ result->rep = checkpoint;
+ return result;
+}
+
+void rocksdb_checkpoint_create(rocksdb_checkpoint_t* checkpoint,
+ const char* checkpoint_dir,
+ uint64_t log_size_for_flush, char** errptr) {
+ SaveError(errptr, checkpoint->rep->CreateCheckpoint(
+ std::string(checkpoint_dir), log_size_for_flush));
+}
+
+void rocksdb_checkpoint_object_destroy(rocksdb_checkpoint_t* checkpoint) {
+ delete checkpoint->rep;
+ delete checkpoint;
+}
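+
+// Usage sketch (illustrative): take an on-disk checkpoint of a live DB.
+//
+//   rocksdb_checkpoint_t* cp = rocksdb_checkpoint_object_create(db, &err);
+//   if (err == NULL) {
+//     rocksdb_checkpoint_create(cp, "/tmp/rocksdb_checkpoint",
+//                               0 /* log_size_for_flush */, &err);
+//     rocksdb_checkpoint_object_destroy(cp);
+//   }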
+
+void rocksdb_close(rocksdb_t* db) {
+ delete db->rep;
+ delete db;
+}
+
+void rocksdb_options_set_uint64add_merge_operator(rocksdb_options_t* opt) {
+ opt->rep.merge_operator =
+ ROCKSDB_NAMESPACE::MergeOperators::CreateUInt64AddOperator();
+}
+
+rocksdb_t* rocksdb_open_column_families(
+ const rocksdb_options_t* db_options, const char* name,
+ int num_column_families, const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (int i = 0; i < num_column_families; i++) {
+ column_families.push_back(ColumnFamilyDescriptor(
+ std::string(column_family_names[i]),
+ ColumnFamilyOptions(column_family_options[i]->rep)));
+ }
+
+ DB* db;
+ std::vector<ColumnFamilyHandle*> handles;
+ if (SaveError(errptr, DB::Open(DBOptions(db_options->rep),
+ std::string(name), column_families, &handles, &db))) {
+ return nullptr;
+ }
+
+ for (size_t i = 0; i < handles.size(); i++) {
+ rocksdb_column_family_handle_t* c_handle = new rocksdb_column_family_handle_t;
+ c_handle->rep = handles[i];
+ column_family_handles[i] = c_handle;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
+
+rocksdb_t* rocksdb_open_for_read_only_column_families(
+ const rocksdb_options_t* db_options, const char* name,
+ int num_column_families, const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles,
+ unsigned char error_if_log_file_exist, char** errptr) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (int i = 0; i < num_column_families; i++) {
+ column_families.push_back(ColumnFamilyDescriptor(
+ std::string(column_family_names[i]),
+ ColumnFamilyOptions(column_family_options[i]->rep)));
+ }
+
+ DB* db;
+ std::vector<ColumnFamilyHandle*> handles;
+ if (SaveError(errptr, DB::OpenForReadOnly(DBOptions(db_options->rep),
+ std::string(name), column_families, &handles, &db, error_if_log_file_exist))) {
+ return nullptr;
+ }
+
+ for (size_t i = 0; i < handles.size(); i++) {
+ rocksdb_column_family_handle_t* c_handle = new rocksdb_column_family_handle_t;
+ c_handle->rep = handles[i];
+ column_family_handles[i] = c_handle;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
+
+rocksdb_t* rocksdb_open_as_secondary_column_families(
+ const rocksdb_options_t* db_options, const char* name,
+ const char* secondary_path, int num_column_families,
+ const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (int i = 0; i != num_column_families; ++i) {
+ column_families.emplace_back(
+ std::string(column_family_names[i]),
+ ColumnFamilyOptions(column_family_options[i]->rep));
+ }
+ DB* db;
+ std::vector<ColumnFamilyHandle*> handles;
+ if (SaveError(errptr, DB::OpenAsSecondary(DBOptions(db_options->rep),
+ std::string(name),
+ std::string(secondary_path),
+ column_families, &handles, &db))) {
+ return nullptr;
+ }
+ for (size_t i = 0; i != handles.size(); ++i) {
+ rocksdb_column_family_handle_t* c_handle =
+ new rocksdb_column_family_handle_t;
+ c_handle->rep = handles[i];
+ column_family_handles[i] = c_handle;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
+
+char** rocksdb_list_column_families(
+ const rocksdb_options_t* options,
+ const char* name,
+ size_t* lencfs,
+ char** errptr) {
+ std::vector<std::string> fams;
+ SaveError(errptr,
+ DB::ListColumnFamilies(DBOptions(options->rep),
+ std::string(name), &fams));
+
+ *lencfs = fams.size();
+ char** column_families = static_cast<char**>(malloc(sizeof(char*) * fams.size()));
+ for (size_t i = 0; i < fams.size(); i++) {
+ column_families[i] = strdup(fams[i].c_str());
+ }
+ return column_families;
+}
+
+void rocksdb_list_column_families_destroy(char** list, size_t len) {
+ for (size_t i = 0; i < len; ++i) {
+ free(list[i]);
+ }
+ free(list);
+}
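+
+// Usage sketch (illustrative): open a database together with its column
+// families; rocksdb_list_column_families() above can be used first to
+// discover the family names.
+//
+//   const char* cf_names[2] = {"default", "extra"};
+//   const rocksdb_options_t* cf_opts[2] = {options, options};
+//   rocksdb_column_family_handle_t* cf_handles[2];
+//   rocksdb_t* db = rocksdb_open_column_families(
+//       options, "/path/to/db", 2, cf_names, cf_opts, cf_handles, &err);
+//   ...
+//   rocksdb_column_family_handle_destroy(cf_handles[1]);
+//   rocksdb_column_family_handle_destroy(cf_handles[0]);
+//   rocksdb_close(db);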
+
+rocksdb_column_family_handle_t* rocksdb_create_column_family(
+ rocksdb_t* db,
+ const rocksdb_options_t* column_family_options,
+ const char* column_family_name,
+ char** errptr) {
+ rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t;
+ SaveError(errptr,
+ db->rep->CreateColumnFamily(ColumnFamilyOptions(column_family_options->rep),
+ std::string(column_family_name), &(handle->rep)));
+ return handle;
+}
+
+void rocksdb_drop_column_family(
+ rocksdb_t* db,
+ rocksdb_column_family_handle_t* handle,
+ char** errptr) {
+ SaveError(errptr, db->rep->DropColumnFamily(handle->rep));
+}
+
+void rocksdb_column_family_handle_destroy(rocksdb_column_family_handle_t* handle) {
+ delete handle->rep;
+ delete handle;
+}
+
+void rocksdb_put(
+ rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ const char* key, size_t keylen,
+ const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr,
+ db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_put_cf(
+ rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen,
+ const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr,
+ db->rep->Put(options->rep, column_family->rep,
+ Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_delete(
+ rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ const char* key, size_t keylen,
+ char** errptr) {
+ SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen)));
+}
+
+void rocksdb_delete_cf(
+ rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen,
+ char** errptr) {
+ SaveError(errptr, db->rep->Delete(options->rep, column_family->rep,
+ Slice(key, keylen)));
+}
+
+void rocksdb_delete_range_cf(rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len,
+ const char* end_key, size_t end_key_len,
+ char** errptr) {
+ SaveError(errptr, db->rep->DeleteRange(options->rep, column_family->rep,
+ Slice(start_key, start_key_len),
+ Slice(end_key, end_key_len)));
+}
+
+void rocksdb_merge(
+ rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ const char* key, size_t keylen,
+ const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr,
+ db->rep->Merge(options->rep, Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_merge_cf(
+ rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen,
+ const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr,
+ db->rep->Merge(options->rep, column_family->rep,
+ Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_write(
+ rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_writebatch_t* batch,
+ char** errptr) {
+ SaveError(errptr, db->rep->Write(options->rep, &batch->rep));
+}
+
+char* rocksdb_get(
+ rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ const char* key, size_t keylen,
+ size_t* vallen,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vallen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
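+
+// Usage sketch (illustrative): point lookup.  The returned buffer comes
+// from CopyString(), so it is not NUL-terminated and must be free()'d;
+// rocksdb_readoptions_create()/destroy() are assumed from rocksdb/c.h.
+//
+//   size_t vlen = 0;
+//   rocksdb_readoptions_t* ropts = rocksdb_readoptions_create();
+//   char* val = rocksdb_get(db, ropts, "key", 3, &vlen, &err);
+//   if (val != NULL) {
+//     /* use val[0..vlen) */
+//     free(val);
+//   }  // NULL with err == NULL simply means "not found".
+//   rocksdb_readoptions_destroy(ropts);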
+
+char* rocksdb_get_cf(
+ rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen,
+ size_t* vallen,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = db->rep->Get(options->rep, column_family->rep,
+ Slice(key, keylen), &tmp);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vallen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+void rocksdb_multi_get(
+ rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ size_t num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes,
+ char** values_list, size_t* values_list_sizes,
+ char** errs) {
+ std::vector<Slice> keys(num_keys);
+ for (size_t i = 0; i < num_keys; i++) {
+ keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<std::string> values(num_keys);
+ std::vector<Status> statuses = db->rep->MultiGet(options->rep, keys, &values);
+ for (size_t i = 0; i < num_keys; i++) {
+ if (statuses[i].ok()) {
+ values_list[i] = CopyString(values[i]);
+ values_list_sizes[i] = values[i].size();
+ errs[i] = nullptr;
+ } else {
+ values_list[i] = nullptr;
+ values_list_sizes[i] = 0;
+ if (!statuses[i].IsNotFound()) {
+ errs[i] = strdup(statuses[i].ToString().c_str());
+ } else {
+ errs[i] = nullptr;
+ }
+ }
+ }
+}
+
+void rocksdb_multi_get_cf(
+ rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ const rocksdb_column_family_handle_t* const* column_families,
+ size_t num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes,
+ char** values_list, size_t* values_list_sizes,
+ char** errs) {
+ std::vector<Slice> keys(num_keys);
+ std::vector<ColumnFamilyHandle*> cfs(num_keys);
+ for (size_t i = 0; i < num_keys; i++) {
+ keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ cfs[i] = column_families[i]->rep;
+ }
+ std::vector<std::string> values(num_keys);
+ std::vector<Status> statuses = db->rep->MultiGet(options->rep, cfs, keys, &values);
+ for (size_t i = 0; i < num_keys; i++) {
+ if (statuses[i].ok()) {
+ values_list[i] = CopyString(values[i]);
+ values_list_sizes[i] = values[i].size();
+ errs[i] = nullptr;
+ } else {
+ values_list[i] = nullptr;
+ values_list_sizes[i] = 0;
+ if (!statuses[i].IsNotFound()) {
+ errs[i] = strdup(statuses[i].ToString().c_str());
+ } else {
+ errs[i] = nullptr;
+ }
+ }
+ }
+}
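+
+// Usage sketch (illustrative): batched lookup.  Every non-NULL value and
+// per-key error string handed back through the output arrays is malloc()'d
+// and owned by the caller.
+//
+//   const char* keys[2] = {"k1", "k2"};
+//   const size_t key_sizes[2] = {2, 2};
+//   char* values[2];
+//   size_t value_sizes[2];
+//   char* errs[2];
+//   rocksdb_multi_get(db, ropts, 2, keys, key_sizes,
+//                     values, value_sizes, errs);
+//   for (int i = 0; i < 2; i++) {
+//     free(values[i]);  // NULL for missing keys; free(NULL) is a no-op
+//     free(errs[i]);
+//   }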
+
+rocksdb_iterator_t* rocksdb_create_iterator(
+ rocksdb_t* db,
+ const rocksdb_readoptions_t* options) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep = db->rep->NewIterator(options->rep);
+ return result;
+}
+
+rocksdb_wal_iterator_t* rocksdb_get_updates_since(
+ rocksdb_t* db, uint64_t seq_number,
+ const rocksdb_wal_readoptions_t* options,
+ char** errptr) {
+ std::unique_ptr<TransactionLogIterator> iter;
+ TransactionLogIterator::ReadOptions ro;
+  if (options != nullptr) {
+ ro = options->rep;
+ }
+ if (SaveError(errptr, db->rep->GetUpdatesSince(seq_number, &iter, ro))) {
+ return nullptr;
+ }
+ rocksdb_wal_iterator_t* result = new rocksdb_wal_iterator_t;
+ result->rep = iter.release();
+ return result;
+}
+
+void rocksdb_wal_iter_next(rocksdb_wal_iterator_t* iter) {
+ iter->rep->Next();
+}
+
+unsigned char rocksdb_wal_iter_valid(const rocksdb_wal_iterator_t* iter) {
+ return iter->rep->Valid();
+}
+
+void rocksdb_wal_iter_status(const rocksdb_wal_iterator_t* iter,
+                             char** errptr) {
+ SaveError(errptr, iter->rep->status());
+}
+
+void rocksdb_wal_iter_destroy(const rocksdb_wal_iterator_t* iter) {
+ delete iter->rep;
+ delete iter;
+}
+
+rocksdb_writebatch_t* rocksdb_wal_iter_get_batch(
+    const rocksdb_wal_iterator_t* iter, uint64_t* seq) {
+ rocksdb_writebatch_t* result = rocksdb_writebatch_create();
+ BatchResult wal_batch = iter->rep->GetBatch();
+ result->rep = std::move(*wal_batch.writeBatchPtr);
+ if (seq != nullptr) {
+ *seq = wal_batch.sequence;
+ }
+ return result;
+}
+
+uint64_t rocksdb_get_latest_sequence_number(rocksdb_t* db) {
+ return db->rep->GetLatestSequenceNumber();
+}
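+
+// Usage sketch (illustrative): tail the write-ahead log from the current
+// sequence number.  Passing NULL read options selects the defaults chosen
+// in rocksdb_get_updates_since() above.
+//
+//   uint64_t seq = rocksdb_get_latest_sequence_number(db);
+//   rocksdb_wal_iterator_t* it =
+//       rocksdb_get_updates_since(db, seq, NULL, &err);
+//   if (it != NULL) {
+//     while (rocksdb_wal_iter_valid(it)) {
+//       uint64_t batch_seq = 0;
+//       rocksdb_writebatch_t* wb = rocksdb_wal_iter_get_batch(it, &batch_seq);
+//       /* inspect wb, e.g. rocksdb_writebatch_count(wb) */
+//       rocksdb_writebatch_destroy(wb);
+//       rocksdb_wal_iter_next(it);
+//     }
+//     rocksdb_wal_iter_destroy(it);
+//   }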
+
+rocksdb_iterator_t* rocksdb_create_iterator_cf(
+ rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep = db->rep->NewIterator(options->rep, column_family->rep);
+ return result;
+}
+
+void rocksdb_create_iterators(
+ rocksdb_t *db,
+ rocksdb_readoptions_t* opts,
+ rocksdb_column_family_handle_t** column_families,
+ rocksdb_iterator_t** iterators,
+ size_t size,
+ char** errptr) {
+ std::vector<ColumnFamilyHandle*> column_families_vec;
+ for (size_t i = 0; i < size; i++) {
+ column_families_vec.push_back(column_families[i]->rep);
+ }
+
+ std::vector<Iterator*> res;
+ Status status = db->rep->NewIterators(opts->rep, column_families_vec, &res);
+ assert(res.size() == size);
+ if (SaveError(errptr, status)) {
+ return;
+ }
+
+ for (size_t i = 0; i < size; i++) {
+ iterators[i] = new rocksdb_iterator_t;
+ iterators[i]->rep = res[i];
+ }
+}
+
+const rocksdb_snapshot_t* rocksdb_create_snapshot(
+ rocksdb_t* db) {
+ rocksdb_snapshot_t* result = new rocksdb_snapshot_t;
+ result->rep = db->rep->GetSnapshot();
+ return result;
+}
+
+void rocksdb_release_snapshot(
+ rocksdb_t* db,
+ const rocksdb_snapshot_t* snapshot) {
+ db->rep->ReleaseSnapshot(snapshot->rep);
+ delete snapshot;
+}
+
+char* rocksdb_property_value(
+ rocksdb_t* db,
+ const char* propname) {
+ std::string tmp;
+ if (db->rep->GetProperty(Slice(propname), &tmp)) {
+    // We use strdup() since we expect human-readable output.
+ return strdup(tmp.c_str());
+ } else {
+ return nullptr;
+ }
+}
+
+int rocksdb_property_int(
+ rocksdb_t* db,
+ const char* propname,
+ uint64_t *out_val) {
+ if (db->rep->GetIntProperty(Slice(propname), out_val)) {
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+int rocksdb_property_int_cf(
+ rocksdb_t* db,
+ rocksdb_column_family_handle_t* column_family,
+ const char* propname,
+ uint64_t *out_val) {
+ if (db->rep->GetIntProperty(column_family->rep, Slice(propname), out_val)) {
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+char* rocksdb_property_value_cf(
+ rocksdb_t* db,
+ rocksdb_column_family_handle_t* column_family,
+ const char* propname) {
+ std::string tmp;
+ if (db->rep->GetProperty(column_family->rep, Slice(propname), &tmp)) {
+    // We use strdup() since we expect human-readable output.
+ return strdup(tmp.c_str());
+ } else {
+ return nullptr;
+ }
+}
+
+void rocksdb_approximate_sizes(
+ rocksdb_t* db,
+ int num_ranges,
+ const char* const* range_start_key, const size_t* range_start_key_len,
+ const char* const* range_limit_key, const size_t* range_limit_key_len,
+ uint64_t* sizes) {
+ Range* ranges = new Range[num_ranges];
+ for (int i = 0; i < num_ranges; i++) {
+ ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]);
+ ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]);
+ }
+ db->rep->GetApproximateSizes(ranges, num_ranges, sizes);
+ delete[] ranges;
+}
+
+void rocksdb_approximate_sizes_cf(
+ rocksdb_t* db,
+ rocksdb_column_family_handle_t* column_family,
+ int num_ranges,
+ const char* const* range_start_key, const size_t* range_start_key_len,
+ const char* const* range_limit_key, const size_t* range_limit_key_len,
+ uint64_t* sizes) {
+ Range* ranges = new Range[num_ranges];
+ for (int i = 0; i < num_ranges; i++) {
+ ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]);
+ ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]);
+ }
+ db->rep->GetApproximateSizes(column_family->rep, ranges, num_ranges, sizes);
+ delete[] ranges;
+}
+
+void rocksdb_delete_file(
+ rocksdb_t* db,
+ const char* name) {
+ db->rep->DeleteFile(name);
+}
+
+const rocksdb_livefiles_t* rocksdb_livefiles(
+ rocksdb_t* db) {
+ rocksdb_livefiles_t* result = new rocksdb_livefiles_t;
+ db->rep->GetLiveFilesMetaData(&result->rep);
+ return result;
+}
+
+void rocksdb_compact_range(
+ rocksdb_t* db,
+ const char* start_key, size_t start_key_len,
+ const char* limit_key, size_t limit_key_len) {
+ Slice a, b;
+ db->rep->CompactRange(
+ CompactRangeOptions(),
+ // Pass nullptr Slice if corresponding "const char*" is nullptr
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_compact_range_cf(
+ rocksdb_t* db,
+ rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len,
+ const char* limit_key, size_t limit_key_len) {
+ Slice a, b;
+ db->rep->CompactRange(
+ CompactRangeOptions(), column_family->rep,
+ // Pass nullptr Slice if corresponding "const char*" is nullptr
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_compact_range_opt(rocksdb_t* db, rocksdb_compactoptions_t* opt,
+ const char* start_key, size_t start_key_len,
+ const char* limit_key, size_t limit_key_len) {
+ Slice a, b;
+ db->rep->CompactRange(
+ opt->rep,
+ // Pass nullptr Slice if corresponding "const char*" is nullptr
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_compact_range_cf_opt(rocksdb_t* db,
+ rocksdb_column_family_handle_t* column_family,
+ rocksdb_compactoptions_t* opt,
+ const char* start_key, size_t start_key_len,
+ const char* limit_key, size_t limit_key_len) {
+ Slice a, b;
+ db->rep->CompactRange(
+ opt->rep, column_family->rep,
+ // Pass nullptr Slice if corresponding "const char*" is nullptr
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_flush(
+ rocksdb_t* db,
+ const rocksdb_flushoptions_t* options,
+ char** errptr) {
+ SaveError(errptr, db->rep->Flush(options->rep));
+}
+
+void rocksdb_flush_cf(
+ rocksdb_t* db,
+ const rocksdb_flushoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ char** errptr) {
+ SaveError(errptr, db->rep->Flush(options->rep, column_family->rep));
+}
+
+void rocksdb_disable_file_deletions(
+ rocksdb_t* db,
+ char** errptr) {
+ SaveError(errptr, db->rep->DisableFileDeletions());
+}
+
+void rocksdb_enable_file_deletions(
+ rocksdb_t* db,
+ unsigned char force,
+ char** errptr) {
+ SaveError(errptr, db->rep->EnableFileDeletions(force));
+}
+
+void rocksdb_destroy_db(
+ const rocksdb_options_t* options,
+ const char* name,
+ char** errptr) {
+ SaveError(errptr, DestroyDB(name, options->rep));
+}
+
+void rocksdb_repair_db(
+ const rocksdb_options_t* options,
+ const char* name,
+ char** errptr) {
+ SaveError(errptr, RepairDB(name, options->rep));
+}
+
+void rocksdb_iter_destroy(rocksdb_iterator_t* iter) {
+ delete iter->rep;
+ delete iter;
+}
+
+unsigned char rocksdb_iter_valid(const rocksdb_iterator_t* iter) {
+ return iter->rep->Valid();
+}
+
+void rocksdb_iter_seek_to_first(rocksdb_iterator_t* iter) {
+ iter->rep->SeekToFirst();
+}
+
+void rocksdb_iter_seek_to_last(rocksdb_iterator_t* iter) {
+ iter->rep->SeekToLast();
+}
+
+void rocksdb_iter_seek(rocksdb_iterator_t* iter, const char* k, size_t klen) {
+ iter->rep->Seek(Slice(k, klen));
+}
+
+void rocksdb_iter_seek_for_prev(rocksdb_iterator_t* iter, const char* k,
+ size_t klen) {
+ iter->rep->SeekForPrev(Slice(k, klen));
+}
+
+void rocksdb_iter_next(rocksdb_iterator_t* iter) {
+ iter->rep->Next();
+}
+
+void rocksdb_iter_prev(rocksdb_iterator_t* iter) {
+ iter->rep->Prev();
+}
+
+const char* rocksdb_iter_key(const rocksdb_iterator_t* iter, size_t* klen) {
+ Slice s = iter->rep->key();
+ *klen = s.size();
+ return s.data();
+}
+
+const char* rocksdb_iter_value(const rocksdb_iterator_t* iter, size_t* vlen) {
+ Slice s = iter->rep->value();
+ *vlen = s.size();
+ return s.data();
+}
+
+void rocksdb_iter_get_error(const rocksdb_iterator_t* iter, char** errptr) {
+ SaveError(errptr, iter->rep->status());
+}
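+
+// Usage sketch (illustrative): full forward scan.  The key/value pointers
+// stay valid only until the next call that moves or destroys the iterator.
+//
+//   rocksdb_iterator_t* it = rocksdb_create_iterator(db, ropts);
+//   for (rocksdb_iter_seek_to_first(it); rocksdb_iter_valid(it);
+//        rocksdb_iter_next(it)) {
+//     size_t klen, vlen;
+//     const char* k = rocksdb_iter_key(it, &klen);
+//     const char* v = rocksdb_iter_value(it, &vlen);
+//     /* consume k[0..klen), v[0..vlen) */
+//   }
+//   rocksdb_iter_get_error(it, &err);
+//   rocksdb_iter_destroy(it);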
+
+rocksdb_writebatch_t* rocksdb_writebatch_create() {
+ return new rocksdb_writebatch_t;
+}
+
+rocksdb_writebatch_t* rocksdb_writebatch_create_from(const char* rep,
+ size_t size) {
+ rocksdb_writebatch_t* b = new rocksdb_writebatch_t;
+ b->rep = WriteBatch(std::string(rep, size));
+ return b;
+}
+
+void rocksdb_writebatch_destroy(rocksdb_writebatch_t* b) {
+ delete b;
+}
+
+void rocksdb_writebatch_clear(rocksdb_writebatch_t* b) {
+ b->rep.Clear();
+}
+
+int rocksdb_writebatch_count(rocksdb_writebatch_t* b) {
+ return b->rep.Count();
+}
+
+void rocksdb_writebatch_put(
+ rocksdb_writebatch_t* b,
+ const char* key, size_t klen,
+ const char* val, size_t vlen) {
+ b->rep.Put(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_put_cf(
+ rocksdb_writebatch_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen,
+ const char* val, size_t vlen) {
+ b->rep.Put(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_putv(
+ rocksdb_writebatch_t* b,
+ int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes,
+ int num_values, const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep.Put(SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_putv_cf(
+ rocksdb_writebatch_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes,
+ int num_values, const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep.Put(column_family->rep, SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_merge(
+ rocksdb_writebatch_t* b,
+ const char* key, size_t klen,
+ const char* val, size_t vlen) {
+ b->rep.Merge(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_merge_cf(
+ rocksdb_writebatch_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen,
+ const char* val, size_t vlen) {
+ b->rep.Merge(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_mergev(
+ rocksdb_writebatch_t* b,
+ int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes,
+ int num_values, const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep.Merge(SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_mergev_cf(
+ rocksdb_writebatch_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes,
+ int num_values, const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep.Merge(column_family->rep, SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_delete(
+ rocksdb_writebatch_t* b,
+ const char* key, size_t klen) {
+ b->rep.Delete(Slice(key, klen));
+}
+
+void rocksdb_writebatch_delete_cf(
+ rocksdb_writebatch_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen) {
+ b->rep.Delete(column_family->rep, Slice(key, klen));
+}
+
+void rocksdb_writebatch_deletev(
+ rocksdb_writebatch_t* b,
+ int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ b->rep.Delete(SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_deletev_cf(
+ rocksdb_writebatch_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ b->rep.Delete(column_family->rep, SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_delete_range(rocksdb_writebatch_t* b,
+ const char* start_key,
+ size_t start_key_len, const char* end_key,
+ size_t end_key_len) {
+ b->rep.DeleteRange(Slice(start_key, start_key_len),
+ Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_delete_range_cf(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len, const char* end_key,
+ size_t end_key_len) {
+ b->rep.DeleteRange(column_family->rep, Slice(start_key, start_key_len),
+ Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_delete_rangev(rocksdb_writebatch_t* b, int num_keys,
+ const char* const* start_keys_list,
+ const size_t* start_keys_list_sizes,
+ const char* const* end_keys_list,
+ const size_t* end_keys_list_sizes) {
+ std::vector<Slice> start_key_slices(num_keys);
+ std::vector<Slice> end_key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+ end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+ }
+ b->rep.DeleteRange(SliceParts(start_key_slices.data(), num_keys),
+ SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_delete_rangev_cf(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* start_keys_list,
+ const size_t* start_keys_list_sizes, const char* const* end_keys_list,
+ const size_t* end_keys_list_sizes) {
+ std::vector<Slice> start_key_slices(num_keys);
+ std::vector<Slice> end_key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+ end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+ }
+ b->rep.DeleteRange(column_family->rep,
+ SliceParts(start_key_slices.data(), num_keys),
+ SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_put_log_data(
+ rocksdb_writebatch_t* b,
+ const char* blob, size_t len) {
+ b->rep.PutLogData(Slice(blob, len));
+}
+
+class H : public WriteBatch::Handler {
+ public:
+ void* state_;
+ void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
+ void (*deleted_)(void*, const char* k, size_t klen);
+ void Put(const Slice& key, const Slice& value) override {
+ (*put_)(state_, key.data(), key.size(), value.data(), value.size());
+ }
+ void Delete(const Slice& key) override {
+ (*deleted_)(state_, key.data(), key.size());
+ }
+};
+
+void rocksdb_writebatch_iterate(
+ rocksdb_writebatch_t* b,
+ void* state,
+ void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+ void (*deleted)(void*, const char* k, size_t klen)) {
+ H handler;
+ handler.state_ = state;
+ handler.put_ = put;
+ handler.deleted_ = deleted;
+ b->rep.Iterate(&handler);
+}
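+
+// Usage sketch (illustrative): build a batch, replay it through the C
+// callbacks above, then commit it with rocksdb_write(); wopts is assumed
+// to be a rocksdb_writeoptions_t* as in the earlier sketches.
+//
+//   static void on_put(void* state, const char* k, size_t klen,
+//                      const char* v, size_t vlen) { /* ... */ }
+//   static void on_delete(void* state, const char* k, size_t klen) {
+//     /* ... */
+//   }
+//
+//   rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+//   rocksdb_writebatch_put(wb, "k", 1, "v", 1);
+//   rocksdb_writebatch_delete(wb, "old", 3);
+//   rocksdb_writebatch_iterate(wb, NULL /* state */, on_put, on_delete);
+//   rocksdb_write(db, wopts, wb, &err);
+//   rocksdb_writebatch_destroy(wb);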
+
+const char* rocksdb_writebatch_data(rocksdb_writebatch_t* b, size_t* size) {
+ *size = b->rep.GetDataSize();
+ return b->rep.Data().c_str();
+}
+
+void rocksdb_writebatch_set_save_point(rocksdb_writebatch_t* b) {
+ b->rep.SetSavePoint();
+}
+
+void rocksdb_writebatch_rollback_to_save_point(rocksdb_writebatch_t* b,
+ char** errptr) {
+ SaveError(errptr, b->rep.RollbackToSavePoint());
+}
+
+void rocksdb_writebatch_pop_save_point(rocksdb_writebatch_t* b, char** errptr) {
+ SaveError(errptr, b->rep.PopSavePoint());
+}
+
+rocksdb_writebatch_wi_t* rocksdb_writebatch_wi_create(
+    size_t reserved_bytes, unsigned char overwrite_key) {
+  rocksdb_writebatch_wi_t* b = new rocksdb_writebatch_wi_t;
+  b->rep = new WriteBatchWithIndex(BytewiseComparator(), reserved_bytes,
+                                   overwrite_key);
+ return b;
+}
+
+void rocksdb_writebatch_wi_destroy(rocksdb_writebatch_wi_t* b) {
+ if (b->rep) {
+ delete b->rep;
+ }
+ delete b;
+}
+
+void rocksdb_writebatch_wi_clear(rocksdb_writebatch_wi_t* b) {
+ b->rep->Clear();
+}
+
+int rocksdb_writebatch_wi_count(rocksdb_writebatch_wi_t* b) {
+ return b->rep->GetWriteBatch()->Count();
+}
+
+void rocksdb_writebatch_wi_put(
+ rocksdb_writebatch_wi_t* b,
+ const char* key, size_t klen,
+ const char* val, size_t vlen) {
+ b->rep->Put(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_put_cf(
+ rocksdb_writebatch_wi_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen,
+ const char* val, size_t vlen) {
+ b->rep->Put(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_putv(
+ rocksdb_writebatch_wi_t* b,
+ int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes,
+ int num_values, const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep->Put(SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_putv_cf(
+ rocksdb_writebatch_wi_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes,
+ int num_values, const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep->Put(column_family->rep, SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_merge(
+ rocksdb_writebatch_wi_t* b,
+ const char* key, size_t klen,
+ const char* val, size_t vlen) {
+ b->rep->Merge(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_merge_cf(
+ rocksdb_writebatch_wi_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen,
+ const char* val, size_t vlen) {
+ b->rep->Merge(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_mergev(
+ rocksdb_writebatch_wi_t* b,
+ int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes,
+ int num_values, const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep->Merge(SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_mergev_cf(
+ rocksdb_writebatch_wi_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes,
+ int num_values, const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep->Merge(column_family->rep, SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_delete(
+ rocksdb_writebatch_wi_t* b,
+ const char* key, size_t klen) {
+ b->rep->Delete(Slice(key, klen));
+}
+
+void rocksdb_writebatch_wi_delete_cf(
+ rocksdb_writebatch_wi_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen) {
+ b->rep->Delete(column_family->rep, Slice(key, klen));
+}
+
+void rocksdb_writebatch_wi_deletev(
+ rocksdb_writebatch_wi_t* b,
+ int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ b->rep->Delete(SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_deletev_cf(
+ rocksdb_writebatch_wi_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ b->rep->Delete(column_family->rep, SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_delete_range(rocksdb_writebatch_wi_t* b,
+ const char* start_key,
+ size_t start_key_len, const char* end_key,
+ size_t end_key_len) {
+ b->rep->DeleteRange(Slice(start_key, start_key_len),
+ Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_wi_delete_range_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len, const char* end_key,
+ size_t end_key_len) {
+ b->rep->DeleteRange(column_family->rep, Slice(start_key, start_key_len),
+ Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_wi_delete_rangev(rocksdb_writebatch_wi_t* b, int num_keys,
+ const char* const* start_keys_list,
+ const size_t* start_keys_list_sizes,
+ const char* const* end_keys_list,
+ const size_t* end_keys_list_sizes) {
+ std::vector<Slice> start_key_slices(num_keys);
+ std::vector<Slice> end_key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+ end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+ }
+ b->rep->DeleteRange(SliceParts(start_key_slices.data(), num_keys),
+ SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_delete_rangev_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* start_keys_list,
+ const size_t* start_keys_list_sizes, const char* const* end_keys_list,
+ const size_t* end_keys_list_sizes) {
+ std::vector<Slice> start_key_slices(num_keys);
+ std::vector<Slice> end_key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+ end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+ }
+ b->rep->DeleteRange(column_family->rep,
+ SliceParts(start_key_slices.data(), num_keys),
+ SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_put_log_data(
+ rocksdb_writebatch_wi_t* b,
+ const char* blob, size_t len) {
+ b->rep->PutLogData(Slice(blob, len));
+}
+
+void rocksdb_writebatch_wi_iterate(
+ rocksdb_writebatch_wi_t* b,
+ void* state,
+ void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+ void (*deleted)(void*, const char* k, size_t klen)) {
+ H handler;
+ handler.state_ = state;
+ handler.put_ = put;
+ handler.deleted_ = deleted;
+ b->rep->GetWriteBatch()->Iterate(&handler);
+}
+
+const char* rocksdb_writebatch_wi_data(rocksdb_writebatch_wi_t* b, size_t* size) {
+ WriteBatch* wb = b->rep->GetWriteBatch();
+ *size = wb->GetDataSize();
+ return wb->Data().c_str();
+}
+
+void rocksdb_writebatch_wi_set_save_point(rocksdb_writebatch_wi_t* b) {
+ b->rep->SetSavePoint();
+}
+
+void rocksdb_writebatch_wi_rollback_to_save_point(rocksdb_writebatch_wi_t* b,
+ char** errptr) {
+ SaveError(errptr, b->rep->RollbackToSavePoint());
+}
+
+rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base(
+ rocksdb_writebatch_wi_t* wbwi,
+ rocksdb_iterator_t* base_iterator) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep = wbwi->rep->NewIteratorWithBase(base_iterator->rep);
+ delete base_iterator;
+ return result;
+}
+
+rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base_cf(
+ rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator,
+ rocksdb_column_family_handle_t* column_family) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep =
+ wbwi->rep->NewIteratorWithBase(column_family->rep, base_iterator->rep);
+ delete base_iterator;
+ return result;
+}
+
+char* rocksdb_writebatch_wi_get_from_batch(
+ rocksdb_writebatch_wi_t* wbwi,
+ const rocksdb_options_t* options,
+ const char* key, size_t keylen,
+ size_t* vallen,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = wbwi->rep->GetFromBatch(options->rep, Slice(key, keylen), &tmp);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vallen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+char* rocksdb_writebatch_wi_get_from_batch_cf(
+ rocksdb_writebatch_wi_t* wbwi,
+ const rocksdb_options_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen,
+ size_t* vallen,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = wbwi->rep->GetFromBatch(column_family->rep, options->rep,
+ Slice(key, keylen), &tmp);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vallen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+char* rocksdb_writebatch_wi_get_from_batch_and_db(
+ rocksdb_writebatch_wi_t* wbwi,
+ rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ const char* key, size_t keylen,
+ size_t* vallen,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = wbwi->rep->GetFromBatchAndDB(db->rep, options->rep, Slice(key, keylen), &tmp);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vallen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+char* rocksdb_writebatch_wi_get_from_batch_and_db_cf(
+ rocksdb_writebatch_wi_t* wbwi,
+ rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen,
+ size_t* vallen,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = wbwi->rep->GetFromBatchAndDB(db->rep, options->rep, column_family->rep,
+ Slice(key, keylen), &tmp);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vallen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+void rocksdb_write_writebatch_wi(
+ rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_writebatch_wi_t* wbwi,
+ char** errptr) {
+ WriteBatch* wb = wbwi->rep->GetWriteBatch();
+ SaveError(errptr, db->rep->Write(options->rep, wb));
+}
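+
+// Usage sketch (illustrative): read-your-writes with WriteBatchWithIndex,
+// then commit the indexed batch through rocksdb_write_writebatch_wi().
+//
+//   rocksdb_writebatch_wi_t* wbwi =
+//       rocksdb_writebatch_wi_create(0 /* reserved_bytes */,
+//                                    1 /* overwrite_key */);
+//   rocksdb_writebatch_wi_put(wbwi, "k", 1, "pending", 7);
+//   size_t vlen = 0;
+//   char* v = rocksdb_writebatch_wi_get_from_batch_and_db(
+//       wbwi, db, ropts, "k", 1, &vlen, &err);  // sees the uncommitted put
+//   free(v);
+//   rocksdb_write_writebatch_wi(db, wopts, wbwi, &err);
+//   rocksdb_writebatch_wi_destroy(wbwi);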
+
+rocksdb_block_based_table_options_t*
+rocksdb_block_based_options_create() {
+ return new rocksdb_block_based_table_options_t;
+}
+
+void rocksdb_block_based_options_destroy(
+ rocksdb_block_based_table_options_t* options) {
+ delete options;
+}
+
+void rocksdb_block_based_options_set_block_size(
+ rocksdb_block_based_table_options_t* options, size_t block_size) {
+ options->rep.block_size = block_size;
+}
+
+void rocksdb_block_based_options_set_block_size_deviation(
+ rocksdb_block_based_table_options_t* options, int block_size_deviation) {
+ options->rep.block_size_deviation = block_size_deviation;
+}
+
+void rocksdb_block_based_options_set_block_restart_interval(
+ rocksdb_block_based_table_options_t* options, int block_restart_interval) {
+ options->rep.block_restart_interval = block_restart_interval;
+}
+
+void rocksdb_block_based_options_set_index_block_restart_interval(
+ rocksdb_block_based_table_options_t* options, int index_block_restart_interval) {
+ options->rep.index_block_restart_interval = index_block_restart_interval;
+}
+
+void rocksdb_block_based_options_set_metadata_block_size(
+ rocksdb_block_based_table_options_t* options, uint64_t metadata_block_size) {
+ options->rep.metadata_block_size = metadata_block_size;
+}
+
+void rocksdb_block_based_options_set_partition_filters(
+ rocksdb_block_based_table_options_t* options, unsigned char partition_filters) {
+ options->rep.partition_filters = partition_filters;
+}
+
+void rocksdb_block_based_options_set_use_delta_encoding(
+ rocksdb_block_based_table_options_t* options, unsigned char use_delta_encoding) {
+ options->rep.use_delta_encoding = use_delta_encoding;
+}
+
+void rocksdb_block_based_options_set_filter_policy(
+ rocksdb_block_based_table_options_t* options,
+ rocksdb_filterpolicy_t* filter_policy) {
+ options->rep.filter_policy.reset(filter_policy);
+}
+
+void rocksdb_block_based_options_set_no_block_cache(
+ rocksdb_block_based_table_options_t* options,
+ unsigned char no_block_cache) {
+ options->rep.no_block_cache = no_block_cache;
+}
+
+void rocksdb_block_based_options_set_block_cache(
+ rocksdb_block_based_table_options_t* options,
+ rocksdb_cache_t* block_cache) {
+ if (block_cache) {
+ options->rep.block_cache = block_cache->rep;
+ }
+}
+
+void rocksdb_block_based_options_set_block_cache_compressed(
+ rocksdb_block_based_table_options_t* options,
+ rocksdb_cache_t* block_cache_compressed) {
+ if (block_cache_compressed) {
+ options->rep.block_cache_compressed = block_cache_compressed->rep;
+ }
+}
+
+void rocksdb_block_based_options_set_whole_key_filtering(
+ rocksdb_block_based_table_options_t* options, unsigned char v) {
+ options->rep.whole_key_filtering = v;
+}
+
+void rocksdb_block_based_options_set_format_version(
+ rocksdb_block_based_table_options_t* options, int v) {
+ options->rep.format_version = v;
+}
+
+void rocksdb_block_based_options_set_index_type(
+ rocksdb_block_based_table_options_t* options, int v) {
+ options->rep.index_type = static_cast<BlockBasedTableOptions::IndexType>(v);
+}
+
+void rocksdb_block_based_options_set_data_block_index_type(
+ rocksdb_block_based_table_options_t* options, int v) {
+ options->rep.data_block_index_type =
+ static_cast<BlockBasedTableOptions::DataBlockIndexType>(v);
+}
+
+void rocksdb_block_based_options_set_data_block_hash_ratio(
+ rocksdb_block_based_table_options_t* options, double v) {
+ options->rep.data_block_hash_table_util_ratio = v;
+}
+
+void rocksdb_block_based_options_set_hash_index_allow_collision(
+ rocksdb_block_based_table_options_t* options, unsigned char v) {
+ options->rep.hash_index_allow_collision = v;
+}
+
+void rocksdb_block_based_options_set_cache_index_and_filter_blocks(
+ rocksdb_block_based_table_options_t* options, unsigned char v) {
+ options->rep.cache_index_and_filter_blocks = v;
+}
+
+void rocksdb_block_based_options_set_cache_index_and_filter_blocks_with_high_priority(
+ rocksdb_block_based_table_options_t* options, unsigned char v) {
+ options->rep.cache_index_and_filter_blocks_with_high_priority = v;
+}
+
+void rocksdb_block_based_options_set_pin_l0_filter_and_index_blocks_in_cache(
+ rocksdb_block_based_table_options_t* options, unsigned char v) {
+ options->rep.pin_l0_filter_and_index_blocks_in_cache = v;
+}
+
+void rocksdb_block_based_options_set_pin_top_level_index_and_filter(
+ rocksdb_block_based_table_options_t* options, unsigned char v) {
+ options->rep.pin_top_level_index_and_filter = v;
+}
+
+void rocksdb_options_set_block_based_table_factory(
+ rocksdb_options_t *opt,
+ rocksdb_block_based_table_options_t* table_options) {
+ if (table_options) {
+ opt->rep.table_factory.reset(
+ ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(table_options->rep));
+ }
+}
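+
+// Usage sketch (illustrative): a common block-based table configuration.
+// rocksdb_cache_create_lru() and rocksdb_filterpolicy_create_bloom() are
+// assumed to be the cache/filter-policy constructors from rocksdb/c.h.
+//
+//   rocksdb_block_based_table_options_t* bbto =
+//       rocksdb_block_based_options_create();
+//   rocksdb_cache_t* cache = rocksdb_cache_create_lru(64 << 20);  // 64 MB
+//   rocksdb_block_based_options_set_block_cache(bbto, cache);
+//   rocksdb_block_based_options_set_block_size(bbto, 16 * 1024);
+//   rocksdb_block_based_options_set_filter_policy(
+//       bbto, rocksdb_filterpolicy_create_bloom(10));
+//   rocksdb_block_based_options_set_cache_index_and_filter_blocks(bbto, 1);
+//   rocksdb_options_set_block_based_table_factory(options, bbto);
+//   rocksdb_block_based_options_destroy(bbto);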
+
+rocksdb_cuckoo_table_options_t*
+rocksdb_cuckoo_options_create() {
+ return new rocksdb_cuckoo_table_options_t;
+}
+
+void rocksdb_cuckoo_options_destroy(
+ rocksdb_cuckoo_table_options_t* options) {
+ delete options;
+}
+
+void rocksdb_cuckoo_options_set_hash_ratio(
+ rocksdb_cuckoo_table_options_t* options, double v) {
+ options->rep.hash_table_ratio = v;
+}
+
+void rocksdb_cuckoo_options_set_max_search_depth(
+ rocksdb_cuckoo_table_options_t* options, uint32_t v) {
+ options->rep.max_search_depth = v;
+}
+
+void rocksdb_cuckoo_options_set_cuckoo_block_size(
+ rocksdb_cuckoo_table_options_t* options, uint32_t v) {
+ options->rep.cuckoo_block_size = v;
+}
+
+void rocksdb_cuckoo_options_set_identity_as_first_hash(
+ rocksdb_cuckoo_table_options_t* options, unsigned char v) {
+ options->rep.identity_as_first_hash = v;
+}
+
+void rocksdb_cuckoo_options_set_use_module_hash(
+ rocksdb_cuckoo_table_options_t* options, unsigned char v) {
+ options->rep.use_module_hash = v;
+}
+
+void rocksdb_options_set_cuckoo_table_factory(
+ rocksdb_options_t *opt,
+ rocksdb_cuckoo_table_options_t* table_options) {
+ if (table_options) {
+ opt->rep.table_factory.reset(
+ ROCKSDB_NAMESPACE::NewCuckooTableFactory(table_options->rep));
+ }
+}
+
+void rocksdb_set_options(
+    rocksdb_t* db, int count, const char* const keys[],
+    const char* const values[], char** errptr) {
+  std::unordered_map<std::string, std::string> options_map;
+  for (int i = 0; i < count; i++) {
+    options_map[keys[i]] = values[i];
+  }
+  SaveError(errptr, db->rep->SetOptions(options_map));
+}
+
+void rocksdb_set_options_cf(
+    rocksdb_t* db, rocksdb_column_family_handle_t* handle, int count,
+    const char* const keys[], const char* const values[], char** errptr) {
+  std::unordered_map<std::string, std::string> options_map;
+  for (int i = 0; i < count; i++) {
+    options_map[keys[i]] = values[i];
+  }
+  SaveError(errptr, db->rep->SetOptions(handle->rep, options_map));
+}
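+
+// Usage sketch (illustrative): adjust mutable options on a live DB; the
+// option names are the usual RocksDB string options.
+//
+//   const char* opt_keys[2] = {"max_write_buffer_number",
+//                              "disable_auto_compactions"};
+//   const char* opt_vals[2] = {"4", "false"};
+//   rocksdb_set_options(db, 2, opt_keys, opt_vals, &err);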
+
+rocksdb_options_t* rocksdb_options_create() {
+ return new rocksdb_options_t;
+}
+
+void rocksdb_options_destroy(rocksdb_options_t* options) {
+ delete options;
+}
+
+void rocksdb_options_increase_parallelism(
+ rocksdb_options_t* opt, int total_threads) {
+ opt->rep.IncreaseParallelism(total_threads);
+}
+
+void rocksdb_options_optimize_for_point_lookup(
+ rocksdb_options_t* opt, uint64_t block_cache_size_mb) {
+ opt->rep.OptimizeForPointLookup(block_cache_size_mb);
+}
+
+void rocksdb_options_optimize_level_style_compaction(
+ rocksdb_options_t* opt, uint64_t memtable_memory_budget) {
+ opt->rep.OptimizeLevelStyleCompaction(memtable_memory_budget);
+}
+
+void rocksdb_options_optimize_universal_style_compaction(
+ rocksdb_options_t* opt, uint64_t memtable_memory_budget) {
+ opt->rep.OptimizeUniversalStyleCompaction(memtable_memory_budget);
+}
+
+void rocksdb_options_set_allow_ingest_behind(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.allow_ingest_behind = v;
+}
+
+void rocksdb_options_set_compaction_filter(
+ rocksdb_options_t* opt,
+ rocksdb_compactionfilter_t* filter) {
+ opt->rep.compaction_filter = filter;
+}
+
+void rocksdb_options_set_compaction_filter_factory(
+ rocksdb_options_t* opt, rocksdb_compactionfilterfactory_t* factory) {
+ opt->rep.compaction_filter_factory =
+ std::shared_ptr<CompactionFilterFactory>(factory);
+}
+
+void rocksdb_options_compaction_readahead_size(
+ rocksdb_options_t* opt, size_t s) {
+ opt->rep.compaction_readahead_size = s;
+}
+
+void rocksdb_options_set_comparator(
+ rocksdb_options_t* opt,
+ rocksdb_comparator_t* cmp) {
+ opt->rep.comparator = cmp;
+}
+
+void rocksdb_options_set_merge_operator(
+ rocksdb_options_t* opt,
+ rocksdb_mergeoperator_t* merge_operator) {
+ opt->rep.merge_operator = std::shared_ptr<MergeOperator>(merge_operator);
+}
+
+void rocksdb_options_set_create_if_missing(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.create_if_missing = v;
+}
+
+void rocksdb_options_set_create_missing_column_families(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.create_missing_column_families = v;
+}
+
+void rocksdb_options_set_error_if_exists(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.error_if_exists = v;
+}
+
+void rocksdb_options_set_paranoid_checks(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.paranoid_checks = v;
+}
+
+void rocksdb_options_set_db_paths(rocksdb_options_t* opt,
+ const rocksdb_dbpath_t** dbpath_values,
+ size_t num_paths) {
+ std::vector<DbPath> db_paths(num_paths);
+ for (size_t i = 0; i < num_paths; ++i) {
+ db_paths[i] = dbpath_values[i]->rep;
+ }
+ opt->rep.db_paths = db_paths;
+}
+
+void rocksdb_options_set_env(rocksdb_options_t* opt, rocksdb_env_t* env) {
+ opt->rep.env = (env ? env->rep : nullptr);
+}
+
+void rocksdb_options_set_info_log(rocksdb_options_t* opt, rocksdb_logger_t* l) {
+ if (l) {
+ opt->rep.info_log = l->rep;
+ }
+}
+
+void rocksdb_options_set_info_log_level(
+ rocksdb_options_t* opt, int v) {
+ opt->rep.info_log_level = static_cast<InfoLogLevel>(v);
+}
+
+void rocksdb_options_set_db_write_buffer_size(rocksdb_options_t* opt,
+ size_t s) {
+ opt->rep.db_write_buffer_size = s;
+}
+
+void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) {
+ opt->rep.write_buffer_size = s;
+}
+
+void rocksdb_options_set_max_open_files(rocksdb_options_t* opt, int n) {
+ opt->rep.max_open_files = n;
+}
+
+void rocksdb_options_set_max_file_opening_threads(rocksdb_options_t* opt, int n) {
+ opt->rep.max_file_opening_threads = n;
+}
+
+void rocksdb_options_set_max_total_wal_size(rocksdb_options_t* opt, uint64_t n) {
+ opt->rep.max_total_wal_size = n;
+}
+
+void rocksdb_options_set_target_file_size_base(
+ rocksdb_options_t* opt, uint64_t n) {
+ opt->rep.target_file_size_base = n;
+}
+
+void rocksdb_options_set_target_file_size_multiplier(
+ rocksdb_options_t* opt, int n) {
+ opt->rep.target_file_size_multiplier = n;
+}
+
+void rocksdb_options_set_max_bytes_for_level_base(
+ rocksdb_options_t* opt, uint64_t n) {
+ opt->rep.max_bytes_for_level_base = n;
+}
+
+void rocksdb_options_set_level_compaction_dynamic_level_bytes(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.level_compaction_dynamic_level_bytes = v;
+}
+
+void rocksdb_options_set_max_bytes_for_level_multiplier(rocksdb_options_t* opt,
+ double n) {
+ opt->rep.max_bytes_for_level_multiplier = n;
+}
+
+void rocksdb_options_set_max_compaction_bytes(rocksdb_options_t* opt,
+ uint64_t n) {
+ opt->rep.max_compaction_bytes = n;
+}
+
+void rocksdb_options_set_max_bytes_for_level_multiplier_additional(
+ rocksdb_options_t* opt, int* level_values, size_t num_levels) {
+ opt->rep.max_bytes_for_level_multiplier_additional.resize(num_levels);
+ for (size_t i = 0; i < num_levels; ++i) {
+ opt->rep.max_bytes_for_level_multiplier_additional[i] = level_values[i];
+ }
+}
+
+void rocksdb_options_enable_statistics(rocksdb_options_t* opt) {
+ opt->rep.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+}
+
+void rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt,
+ unsigned char val) {
+ opt->rep.skip_stats_update_on_db_open = val;
+}
+
+void rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(
+ rocksdb_options_t* opt, unsigned char val) {
+ opt->rep.skip_checking_sst_file_sizes_on_db_open = val;
+}
+
+void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) {
+ opt->rep.num_levels = n;
+}
+
+void rocksdb_options_set_level0_file_num_compaction_trigger(
+ rocksdb_options_t* opt, int n) {
+ opt->rep.level0_file_num_compaction_trigger = n;
+}
+
+void rocksdb_options_set_level0_slowdown_writes_trigger(
+ rocksdb_options_t* opt, int n) {
+ opt->rep.level0_slowdown_writes_trigger = n;
+}
+
+void rocksdb_options_set_level0_stop_writes_trigger(
+ rocksdb_options_t* opt, int n) {
+ opt->rep.level0_stop_writes_trigger = n;
+}
+
+void rocksdb_options_set_max_mem_compaction_level(rocksdb_options_t* /*opt*/,
+ int /*n*/) {}
+
+void rocksdb_options_set_wal_recovery_mode(rocksdb_options_t* opt, int mode) {
+ opt->rep.wal_recovery_mode = static_cast<WALRecoveryMode>(mode);
+}
+
+void rocksdb_options_set_compression(rocksdb_options_t* opt, int t) {
+ opt->rep.compression = static_cast<CompressionType>(t);
+}
+
+void rocksdb_options_set_compression_per_level(rocksdb_options_t* opt,
+ int* level_values,
+ size_t num_levels) {
+ opt->rep.compression_per_level.resize(num_levels);
+ for (size_t i = 0; i < num_levels; ++i) {
+ opt->rep.compression_per_level[i] =
+ static_cast<CompressionType>(level_values[i]);
+ }
+}
+
+void rocksdb_options_set_bottommost_compression_options(rocksdb_options_t* opt,
+ int w_bits, int level,
+ int strategy,
+ int max_dict_bytes,
+ bool enabled) {
+ opt->rep.bottommost_compression_opts.window_bits = w_bits;
+ opt->rep.bottommost_compression_opts.level = level;
+ opt->rep.bottommost_compression_opts.strategy = strategy;
+ opt->rep.bottommost_compression_opts.max_dict_bytes = max_dict_bytes;
+ opt->rep.bottommost_compression_opts.enabled = enabled;
+}
+
+void rocksdb_options_set_compression_options(rocksdb_options_t* opt, int w_bits,
+ int level, int strategy,
+ int max_dict_bytes) {
+ opt->rep.compression_opts.window_bits = w_bits;
+ opt->rep.compression_opts.level = level;
+ opt->rep.compression_opts.strategy = strategy;
+ opt->rep.compression_opts.max_dict_bytes = max_dict_bytes;
+}
+
+void rocksdb_options_set_prefix_extractor(
+ rocksdb_options_t* opt, rocksdb_slicetransform_t* prefix_extractor) {
+ opt->rep.prefix_extractor.reset(prefix_extractor);
+}
+
+void rocksdb_options_set_use_fsync(
+ rocksdb_options_t* opt, int use_fsync) {
+ opt->rep.use_fsync = use_fsync;
+}
+
+void rocksdb_options_set_db_log_dir(
+ rocksdb_options_t* opt, const char* db_log_dir) {
+ opt->rep.db_log_dir = db_log_dir;
+}
+
+void rocksdb_options_set_wal_dir(
+ rocksdb_options_t* opt, const char* v) {
+ opt->rep.wal_dir = v;
+}
+
+void rocksdb_options_set_WAL_ttl_seconds(rocksdb_options_t* opt, uint64_t ttl) {
+ opt->rep.WAL_ttl_seconds = ttl;
+}
+
+void rocksdb_options_set_WAL_size_limit_MB(
+ rocksdb_options_t* opt, uint64_t limit) {
+ opt->rep.WAL_size_limit_MB = limit;
+}
+
+void rocksdb_options_set_manifest_preallocation_size(
+ rocksdb_options_t* opt, size_t v) {
+ opt->rep.manifest_preallocation_size = v;
+}
+
+// no-op: this setter intentionally ignores its arguments
+void rocksdb_options_set_purge_redundant_kvs_while_flush(
+ rocksdb_options_t* /*opt*/, unsigned char /*v*/) {}
+
+void rocksdb_options_set_use_direct_reads(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.use_direct_reads = v;
+}
+
+void rocksdb_options_set_use_direct_io_for_flush_and_compaction(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.use_direct_io_for_flush_and_compaction = v;
+}
+
+void rocksdb_options_set_allow_mmap_reads(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.allow_mmap_reads = v;
+}
+
+void rocksdb_options_set_allow_mmap_writes(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.allow_mmap_writes = v;
+}
+
+void rocksdb_options_set_is_fd_close_on_exec(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.is_fd_close_on_exec = v;
+}
+
+void rocksdb_options_set_skip_log_error_on_recovery(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.skip_log_error_on_recovery = v;
+}
+
+void rocksdb_options_set_stats_dump_period_sec(
+ rocksdb_options_t* opt, unsigned int v) {
+ opt->rep.stats_dump_period_sec = v;
+}
+
+void rocksdb_options_set_advise_random_on_open(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.advise_random_on_open = v;
+}
+
+void rocksdb_options_set_access_hint_on_compaction_start(
+ rocksdb_options_t* opt, int v) {
+  switch (v) {
+ case 0:
+ opt->rep.access_hint_on_compaction_start =
+ ROCKSDB_NAMESPACE::Options::NONE;
+ break;
+ case 1:
+ opt->rep.access_hint_on_compaction_start =
+ ROCKSDB_NAMESPACE::Options::NORMAL;
+ break;
+ case 2:
+ opt->rep.access_hint_on_compaction_start =
+ ROCKSDB_NAMESPACE::Options::SEQUENTIAL;
+ break;
+ case 3:
+ opt->rep.access_hint_on_compaction_start =
+ ROCKSDB_NAMESPACE::Options::WILLNEED;
+ break;
+ }
+}
+
+void rocksdb_options_set_use_adaptive_mutex(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.use_adaptive_mutex = v;
+}
+
+void rocksdb_options_set_wal_bytes_per_sync(
+ rocksdb_options_t* opt, uint64_t v) {
+ opt->rep.wal_bytes_per_sync = v;
+}
+
+void rocksdb_options_set_bytes_per_sync(
+ rocksdb_options_t* opt, uint64_t v) {
+ opt->rep.bytes_per_sync = v;
+}
+
+void rocksdb_options_set_writable_file_max_buffer_size(rocksdb_options_t* opt,
+ uint64_t v) {
+ opt->rep.writable_file_max_buffer_size = static_cast<size_t>(v);
+}
+
+void rocksdb_options_set_allow_concurrent_memtable_write(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.allow_concurrent_memtable_write = v;
+}
+
+void rocksdb_options_set_enable_write_thread_adaptive_yield(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.enable_write_thread_adaptive_yield = v;
+}
+
+void rocksdb_options_set_max_sequential_skip_in_iterations(
+ rocksdb_options_t* opt, uint64_t v) {
+ opt->rep.max_sequential_skip_in_iterations = v;
+}
+
+void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t* opt, int n) {
+ opt->rep.max_write_buffer_number = n;
+}
+
+void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t* opt, int n) {
+ opt->rep.min_write_buffer_number_to_merge = n;
+}
+
+void rocksdb_options_set_max_write_buffer_number_to_maintain(
+ rocksdb_options_t* opt, int n) {
+ opt->rep.max_write_buffer_number_to_maintain = n;
+}
+
+void rocksdb_options_set_max_write_buffer_size_to_maintain(
+ rocksdb_options_t* opt, int64_t n) {
+ opt->rep.max_write_buffer_size_to_maintain = n;
+}
+
+void rocksdb_options_set_enable_pipelined_write(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.enable_pipelined_write = v;
+}
+
+void rocksdb_options_set_unordered_write(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.unordered_write = v;
+}
+
+void rocksdb_options_set_max_subcompactions(rocksdb_options_t* opt,
+ uint32_t n) {
+ opt->rep.max_subcompactions = n;
+}
+
+void rocksdb_options_set_max_background_jobs(rocksdb_options_t* opt, int n) {
+ opt->rep.max_background_jobs = n;
+}
+
+void rocksdb_options_set_max_background_compactions(rocksdb_options_t* opt, int n) {
+ opt->rep.max_background_compactions = n;
+}
+
+void rocksdb_options_set_base_background_compactions(rocksdb_options_t* opt,
+ int n) {
+ opt->rep.base_background_compactions = n;
+}
+
+void rocksdb_options_set_max_background_flushes(rocksdb_options_t* opt, int n) {
+ opt->rep.max_background_flushes = n;
+}
+
+void rocksdb_options_set_max_log_file_size(rocksdb_options_t* opt, size_t v) {
+ opt->rep.max_log_file_size = v;
+}
+
+void rocksdb_options_set_log_file_time_to_roll(rocksdb_options_t* opt, size_t v) {
+ opt->rep.log_file_time_to_roll = v;
+}
+
+void rocksdb_options_set_keep_log_file_num(rocksdb_options_t* opt, size_t v) {
+ opt->rep.keep_log_file_num = v;
+}
+
+void rocksdb_options_set_recycle_log_file_num(rocksdb_options_t* opt,
+ size_t v) {
+ opt->rep.recycle_log_file_num = v;
+}
+
+void rocksdb_options_set_soft_rate_limit(rocksdb_options_t* opt, double v) {
+ opt->rep.soft_rate_limit = v;
+}
+
+void rocksdb_options_set_hard_rate_limit(rocksdb_options_t* opt, double v) {
+ opt->rep.hard_rate_limit = v;
+}
+
+void rocksdb_options_set_soft_pending_compaction_bytes_limit(
+    rocksdb_options_t* opt, size_t v) {
+  opt->rep.soft_pending_compaction_bytes_limit = v;
+}
+
+void rocksdb_options_set_hard_pending_compaction_bytes_limit(
+    rocksdb_options_t* opt, size_t v) {
+  opt->rep.hard_pending_compaction_bytes_limit = v;
+}
+
+void rocksdb_options_set_rate_limit_delay_max_milliseconds(
+ rocksdb_options_t* opt, unsigned int v) {
+ opt->rep.rate_limit_delay_max_milliseconds = v;
+}
+
+void rocksdb_options_set_max_manifest_file_size(
+ rocksdb_options_t* opt, size_t v) {
+ opt->rep.max_manifest_file_size = v;
+}
+
+void rocksdb_options_set_table_cache_numshardbits(
+ rocksdb_options_t* opt, int v) {
+ opt->rep.table_cache_numshardbits = v;
+}
+
+void rocksdb_options_set_table_cache_remove_scan_count_limit(
+ rocksdb_options_t* /*opt*/, int /*v*/) {
+ // this option is deprecated
+}
+
+void rocksdb_options_set_arena_block_size(
+ rocksdb_options_t* opt, size_t v) {
+ opt->rep.arena_block_size = v;
+}
+
+void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t* opt, int disable) {
+ opt->rep.disable_auto_compactions = disable;
+}
+
+void rocksdb_options_set_optimize_filters_for_hits(rocksdb_options_t* opt, int v) {
+ opt->rep.optimize_filters_for_hits = v;
+}
+
+void rocksdb_options_set_delete_obsolete_files_period_micros(
+ rocksdb_options_t* opt, uint64_t v) {
+ opt->rep.delete_obsolete_files_period_micros = v;
+}
+
+void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t* opt) {
+ opt->rep.PrepareForBulkLoad();
+}
+
+void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t *opt) {
+ opt->rep.memtable_factory.reset(new ROCKSDB_NAMESPACE::VectorRepFactory);
+}
+
+void rocksdb_options_set_memtable_prefix_bloom_size_ratio(
+ rocksdb_options_t* opt, double v) {
+ opt->rep.memtable_prefix_bloom_size_ratio = v;
+}
+
+void rocksdb_options_set_memtable_huge_page_size(rocksdb_options_t* opt,
+ size_t v) {
+ opt->rep.memtable_huge_page_size = v;
+}
+
+void rocksdb_options_set_hash_skip_list_rep(
+ rocksdb_options_t *opt, size_t bucket_count,
+ int32_t skiplist_height, int32_t skiplist_branching_factor) {
+ ROCKSDB_NAMESPACE::MemTableRepFactory* factory =
+ ROCKSDB_NAMESPACE::NewHashSkipListRepFactory(
+ bucket_count, skiplist_height, skiplist_branching_factor);
+ opt->rep.memtable_factory.reset(factory);
+}
+
+void rocksdb_options_set_hash_link_list_rep(
+ rocksdb_options_t *opt, size_t bucket_count) {
+ opt->rep.memtable_factory.reset(
+ ROCKSDB_NAMESPACE::NewHashLinkListRepFactory(bucket_count));
+}
+
+void rocksdb_options_set_plain_table_factory(
+ rocksdb_options_t *opt, uint32_t user_key_len, int bloom_bits_per_key,
+ double hash_table_ratio, size_t index_sparseness) {
+ ROCKSDB_NAMESPACE::PlainTableOptions options;
+ options.user_key_len = user_key_len;
+ options.bloom_bits_per_key = bloom_bits_per_key;
+ options.hash_table_ratio = hash_table_ratio;
+ options.index_sparseness = index_sparseness;
+
+ ROCKSDB_NAMESPACE::TableFactory* factory =
+ ROCKSDB_NAMESPACE::NewPlainTableFactory(options);
+ opt->rep.table_factory.reset(factory);
+}
+
+void rocksdb_options_set_max_successive_merges(
+ rocksdb_options_t* opt, size_t v) {
+ opt->rep.max_successive_merges = v;
+}
+
+void rocksdb_options_set_bloom_locality(
+ rocksdb_options_t* opt, uint32_t v) {
+ opt->rep.bloom_locality = v;
+}
+
+void rocksdb_options_set_inplace_update_support(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.inplace_update_support = v;
+}
+
+void rocksdb_options_set_inplace_update_num_locks(
+ rocksdb_options_t* opt, size_t v) {
+ opt->rep.inplace_update_num_locks = v;
+}
+
+void rocksdb_options_set_report_bg_io_stats(
+ rocksdb_options_t* opt, int v) {
+ opt->rep.report_bg_io_stats = v;
+}
+
+void rocksdb_options_set_compaction_style(rocksdb_options_t *opt, int style) {
+ opt->rep.compaction_style =
+ static_cast<ROCKSDB_NAMESPACE::CompactionStyle>(style);
+}
+
+void rocksdb_options_set_universal_compaction_options(
+    rocksdb_options_t* opt, rocksdb_universal_compaction_options_t* uco) {
+  opt->rep.compaction_options_universal = *(uco->rep);
+}
+
+void rocksdb_options_set_fifo_compaction_options(
+ rocksdb_options_t* opt,
+ rocksdb_fifo_compaction_options_t* fifo) {
+ opt->rep.compaction_options_fifo = fifo->rep;
+}
+
+char *rocksdb_options_statistics_get_string(rocksdb_options_t *opt) {
+ ROCKSDB_NAMESPACE::Statistics* statistics = opt->rep.statistics.get();
+ if (statistics) {
+ return strdup(statistics->ToString().c_str());
+ }
+ return nullptr;
+}
+
+void rocksdb_options_set_ratelimiter(rocksdb_options_t* opt,
+                                     rocksdb_ratelimiter_t* limiter) {
+ if (limiter) {
+ opt->rep.rate_limiter = limiter->rep;
+ }
+}
+
+void rocksdb_options_set_atomic_flush(rocksdb_options_t* opt,
+ unsigned char atomic_flush) {
+ opt->rep.atomic_flush = atomic_flush;
+}
+
+rocksdb_ratelimiter_t* rocksdb_ratelimiter_create(
+ int64_t rate_bytes_per_sec,
+ int64_t refill_period_us,
+ int32_t fairness) {
+ rocksdb_ratelimiter_t* rate_limiter = new rocksdb_ratelimiter_t;
+ rate_limiter->rep.reset(
+ NewGenericRateLimiter(rate_bytes_per_sec,
+ refill_period_us, fairness));
+ return rate_limiter;
+}
+
+void rocksdb_ratelimiter_destroy(rocksdb_ratelimiter_t *limiter) {
+ delete limiter;
+}
+
+void rocksdb_options_set_row_cache(rocksdb_options_t* opt, rocksdb_cache_t* cache) {
+  if (cache) {
+ opt->rep.row_cache = cache->rep;
+ }
+}
+
+void rocksdb_set_perf_level(int v) {
+ PerfLevel level = static_cast<PerfLevel>(v);
+ SetPerfLevel(level);
+}
+
+rocksdb_perfcontext_t* rocksdb_perfcontext_create() {
+ rocksdb_perfcontext_t* context = new rocksdb_perfcontext_t;
+ context->rep = ROCKSDB_NAMESPACE::get_perf_context();
+ return context;
+}
+
+void rocksdb_perfcontext_reset(rocksdb_perfcontext_t* context) {
+ context->rep->Reset();
+}
+
+char* rocksdb_perfcontext_report(rocksdb_perfcontext_t* context,
+ unsigned char exclude_zero_counters) {
+ return strdup(context->rep->ToString(exclude_zero_counters).c_str());
+}
+
+uint64_t rocksdb_perfcontext_metric(rocksdb_perfcontext_t* context,
+ int metric) {
+ PerfContext* rep = context->rep;
+ switch (metric) {
+ case rocksdb_user_key_comparison_count:
+ return rep->user_key_comparison_count;
+ case rocksdb_block_cache_hit_count:
+ return rep->block_cache_hit_count;
+ case rocksdb_block_read_count:
+ return rep->block_read_count;
+ case rocksdb_block_read_byte:
+ return rep->block_read_byte;
+ case rocksdb_block_read_time:
+ return rep->block_read_time;
+ case rocksdb_block_checksum_time:
+ return rep->block_checksum_time;
+ case rocksdb_block_decompress_time:
+ return rep->block_decompress_time;
+ case rocksdb_get_read_bytes:
+ return rep->get_read_bytes;
+ case rocksdb_multiget_read_bytes:
+ return rep->multiget_read_bytes;
+ case rocksdb_iter_read_bytes:
+ return rep->iter_read_bytes;
+ case rocksdb_internal_key_skipped_count:
+ return rep->internal_key_skipped_count;
+ case rocksdb_internal_delete_skipped_count:
+ return rep->internal_delete_skipped_count;
+ case rocksdb_internal_recent_skipped_count:
+ return rep->internal_recent_skipped_count;
+ case rocksdb_internal_merge_count:
+ return rep->internal_merge_count;
+ case rocksdb_get_snapshot_time:
+ return rep->get_snapshot_time;
+ case rocksdb_get_from_memtable_time:
+ return rep->get_from_memtable_time;
+ case rocksdb_get_from_memtable_count:
+ return rep->get_from_memtable_count;
+ case rocksdb_get_post_process_time:
+ return rep->get_post_process_time;
+ case rocksdb_get_from_output_files_time:
+ return rep->get_from_output_files_time;
+ case rocksdb_seek_on_memtable_time:
+ return rep->seek_on_memtable_time;
+ case rocksdb_seek_on_memtable_count:
+ return rep->seek_on_memtable_count;
+ case rocksdb_next_on_memtable_count:
+ return rep->next_on_memtable_count;
+ case rocksdb_prev_on_memtable_count:
+ return rep->prev_on_memtable_count;
+ case rocksdb_seek_child_seek_time:
+ return rep->seek_child_seek_time;
+ case rocksdb_seek_child_seek_count:
+ return rep->seek_child_seek_count;
+ case rocksdb_seek_min_heap_time:
+ return rep->seek_min_heap_time;
+ case rocksdb_seek_max_heap_time:
+ return rep->seek_max_heap_time;
+ case rocksdb_seek_internal_seek_time:
+ return rep->seek_internal_seek_time;
+ case rocksdb_find_next_user_entry_time:
+ return rep->find_next_user_entry_time;
+ case rocksdb_write_wal_time:
+ return rep->write_wal_time;
+ case rocksdb_write_memtable_time:
+ return rep->write_memtable_time;
+ case rocksdb_write_delay_time:
+ return rep->write_delay_time;
+ case rocksdb_write_pre_and_post_process_time:
+ return rep->write_pre_and_post_process_time;
+ case rocksdb_db_mutex_lock_nanos:
+ return rep->db_mutex_lock_nanos;
+ case rocksdb_db_condition_wait_nanos:
+ return rep->db_condition_wait_nanos;
+ case rocksdb_merge_operator_time_nanos:
+ return rep->merge_operator_time_nanos;
+ case rocksdb_read_index_block_nanos:
+ return rep->read_index_block_nanos;
+ case rocksdb_read_filter_block_nanos:
+ return rep->read_filter_block_nanos;
+ case rocksdb_new_table_block_iter_nanos:
+ return rep->new_table_block_iter_nanos;
+ case rocksdb_new_table_iterator_nanos:
+ return rep->new_table_iterator_nanos;
+ case rocksdb_block_seek_nanos:
+ return rep->block_seek_nanos;
+ case rocksdb_find_table_nanos:
+ return rep->find_table_nanos;
+ case rocksdb_bloom_memtable_hit_count:
+ return rep->bloom_memtable_hit_count;
+ case rocksdb_bloom_memtable_miss_count:
+ return rep->bloom_memtable_miss_count;
+ case rocksdb_bloom_sst_hit_count:
+ return rep->bloom_sst_hit_count;
+ case rocksdb_bloom_sst_miss_count:
+ return rep->bloom_sst_miss_count;
+ case rocksdb_key_lock_wait_time:
+ return rep->key_lock_wait_time;
+ case rocksdb_key_lock_wait_count:
+ return rep->key_lock_wait_count;
+ case rocksdb_env_new_sequential_file_nanos:
+ return rep->env_new_sequential_file_nanos;
+ case rocksdb_env_new_random_access_file_nanos:
+ return rep->env_new_random_access_file_nanos;
+ case rocksdb_env_new_writable_file_nanos:
+ return rep->env_new_writable_file_nanos;
+ case rocksdb_env_reuse_writable_file_nanos:
+ return rep->env_reuse_writable_file_nanos;
+ case rocksdb_env_new_random_rw_file_nanos:
+ return rep->env_new_random_rw_file_nanos;
+ case rocksdb_env_new_directory_nanos:
+ return rep->env_new_directory_nanos;
+ case rocksdb_env_file_exists_nanos:
+ return rep->env_file_exists_nanos;
+ case rocksdb_env_get_children_nanos:
+ return rep->env_get_children_nanos;
+ case rocksdb_env_get_children_file_attributes_nanos:
+ return rep->env_get_children_file_attributes_nanos;
+ case rocksdb_env_delete_file_nanos:
+ return rep->env_delete_file_nanos;
+ case rocksdb_env_create_dir_nanos:
+ return rep->env_create_dir_nanos;
+ case rocksdb_env_create_dir_if_missing_nanos:
+ return rep->env_create_dir_if_missing_nanos;
+ case rocksdb_env_delete_dir_nanos:
+ return rep->env_delete_dir_nanos;
+ case rocksdb_env_get_file_size_nanos:
+ return rep->env_get_file_size_nanos;
+ case rocksdb_env_get_file_modification_time_nanos:
+ return rep->env_get_file_modification_time_nanos;
+ case rocksdb_env_rename_file_nanos:
+ return rep->env_rename_file_nanos;
+ case rocksdb_env_link_file_nanos:
+ return rep->env_link_file_nanos;
+ case rocksdb_env_lock_file_nanos:
+ return rep->env_lock_file_nanos;
+ case rocksdb_env_unlock_file_nanos:
+ return rep->env_unlock_file_nanos;
+ case rocksdb_env_new_logger_nanos:
+ return rep->env_new_logger_nanos;
+ default:
+ break;
+ }
+ return 0;
+}
+
+void rocksdb_perfcontext_destroy(rocksdb_perfcontext_t* context) {
+ delete context;
+}
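+
+/* Usage sketch: collecting per-operation statistics with the perf context
+   wrappers above.  The rocksdb_get() call and the rocksdb_enable_time
+   perf-level constant are assumed from the public C header; `db` and
+   `ropts` stand for an opened rocksdb_t* and a rocksdb_readoptions_t*.
+
+     rocksdb_set_perf_level(rocksdb_enable_time);
+     // The context wraps the calling thread's PerfContext.
+     rocksdb_perfcontext_t* ctx = rocksdb_perfcontext_create();
+     rocksdb_perfcontext_reset(ctx);
+
+     size_t vlen = 0;
+     char* err = NULL;
+     char* val = rocksdb_get(db, ropts, "key", 3, &vlen, &err);
+
+     uint64_t reads = rocksdb_perfcontext_metric(ctx, rocksdb_block_read_count);
+     char* report = rocksdb_perfcontext_report(ctx, 1);  // 1 = skip zero counters
+     // Both `val` and `report` are heap-allocated copies and must be freed.
+     rocksdb_free(val);
+     rocksdb_free(report);
+     rocksdb_perfcontext_destroy(ctx);
+*/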
+
+/*
+TODO:
+DB::OpenForReadOnly
+DB::KeyMayExist
+DB::GetOptions
+DB::GetSortedWalFiles
+DB::GetLatestSequenceNumber
+DB::GetUpdatesSince
+DB::GetDbIdentity
+DB::RunManualCompaction
+custom cache
+table_properties_collectors
+*/
+
+rocksdb_compactionfilter_t* rocksdb_compactionfilter_create(
+ void* state,
+ void (*destructor)(void*),
+ unsigned char (*filter)(
+ void*,
+ int level,
+ const char* key, size_t key_length,
+ const char* existing_value, size_t value_length,
+ char** new_value, size_t *new_value_length,
+ unsigned char* value_changed),
+ const char* (*name)(void*)) {
+ rocksdb_compactionfilter_t* result = new rocksdb_compactionfilter_t;
+ result->state_ = state;
+ result->destructor_ = destructor;
+ result->filter_ = filter;
+ result->ignore_snapshots_ = true;
+ result->name_ = name;
+ return result;
+}
+
+void rocksdb_compactionfilter_set_ignore_snapshots(
+ rocksdb_compactionfilter_t* filter,
+ unsigned char whether_ignore) {
+ filter->ignore_snapshots_ = whether_ignore;
+}
+
+void rocksdb_compactionfilter_destroy(rocksdb_compactionfilter_t* filter) {
+ delete filter;
+}
+
+unsigned char rocksdb_compactionfiltercontext_is_full_compaction(
+ rocksdb_compactionfiltercontext_t* context) {
+ return context->rep.is_full_compaction;
+}
+
+unsigned char rocksdb_compactionfiltercontext_is_manual_compaction(
+ rocksdb_compactionfiltercontext_t* context) {
+ return context->rep.is_manual_compaction;
+}
+
+rocksdb_compactionfilterfactory_t* rocksdb_compactionfilterfactory_create(
+ void* state, void (*destructor)(void*),
+ rocksdb_compactionfilter_t* (*create_compaction_filter)(
+ void*, rocksdb_compactionfiltercontext_t* context),
+ const char* (*name)(void*)) {
+ rocksdb_compactionfilterfactory_t* result =
+ new rocksdb_compactionfilterfactory_t;
+ result->state_ = state;
+ result->destructor_ = destructor;
+ result->create_compaction_filter_ = create_compaction_filter;
+ result->name_ = name;
+ return result;
+}
+
+void rocksdb_compactionfilterfactory_destroy(
+ rocksdb_compactionfilterfactory_t* factory) {
+ delete factory;
+}
+
+rocksdb_comparator_t* rocksdb_comparator_create(
+ void* state,
+ void (*destructor)(void*),
+ int (*compare)(
+ void*,
+ const char* a, size_t alen,
+ const char* b, size_t blen),
+ const char* (*name)(void*)) {
+ rocksdb_comparator_t* result = new rocksdb_comparator_t;
+ result->state_ = state;
+ result->destructor_ = destructor;
+ result->compare_ = compare;
+ result->name_ = name;
+ return result;
+}
+
+void rocksdb_comparator_destroy(rocksdb_comparator_t* cmp) {
+ delete cmp;
+}
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create(
+ void* state,
+ void (*destructor)(void*),
+ char* (*create_filter)(
+ void*,
+ const char* const* key_array, const size_t* key_length_array,
+ int num_keys,
+ size_t* filter_length),
+ unsigned char (*key_may_match)(
+ void*,
+ const char* key, size_t length,
+ const char* filter, size_t filter_length),
+ void (*delete_filter)(
+ void*,
+ const char* filter, size_t filter_length),
+ const char* (*name)(void*)) {
+ rocksdb_filterpolicy_t* result = new rocksdb_filterpolicy_t;
+ result->state_ = state;
+ result->destructor_ = destructor;
+ result->create_ = create_filter;
+ result->key_match_ = key_may_match;
+ result->delete_filter_ = delete_filter;
+ result->name_ = name;
+ return result;
+}
+
+void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t* filter) {
+ delete filter;
+}
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_format(
+    int bits_per_key, bool original_format) {
+ // Make a rocksdb_filterpolicy_t, but override all of its methods so
+ // they delegate to a NewBloomFilterPolicy() instead of user
+ // supplied C functions.
+ struct Wrapper : public rocksdb_filterpolicy_t {
+ const FilterPolicy* rep_;
+ ~Wrapper() override { delete rep_; }
+ const char* Name() const override { return rep_->Name(); }
+ void CreateFilter(const Slice* keys, int n,
+ std::string* dst) const override {
+ return rep_->CreateFilter(keys, n, dst);
+ }
+ bool KeyMayMatch(const Slice& key, const Slice& filter) const override {
+ return rep_->KeyMayMatch(key, filter);
+ }
+ // No need to override GetFilterBitsBuilder if this one is overridden
+ ROCKSDB_NAMESPACE::FilterBitsBuilder* GetBuilderWithContext(
+ const ROCKSDB_NAMESPACE::FilterBuildingContext& context)
+ const override {
+ return rep_->GetBuilderWithContext(context);
+ }
+ ROCKSDB_NAMESPACE::FilterBitsReader* GetFilterBitsReader(
+ const Slice& contents) const override {
+ return rep_->GetFilterBitsReader(contents);
+ }
+ static void DoNothing(void*) {}
+ };
+ Wrapper* wrapper = new Wrapper;
+ wrapper->rep_ = NewBloomFilterPolicy(bits_per_key, original_format);
+ wrapper->state_ = nullptr;
+ wrapper->delete_filter_ = nullptr;
+ wrapper->destructor_ = &Wrapper::DoNothing;
+ return wrapper;
+}
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_full(int bits_per_key) {
+ return rocksdb_filterpolicy_create_bloom_format(bits_per_key, false);
+}
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(int bits_per_key) {
+ return rocksdb_filterpolicy_create_bloom_format(bits_per_key, true);
+}
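+
+/* Usage sketch: plugging a Bloom filter policy into a block-based table.
+   The block-based table helpers (rocksdb_block_based_options_create() and
+   friends) are assumed from elsewhere in this C API, and the setter is
+   expected to take ownership of the policy wrapper, so it is not destroyed
+   here; `opts` stands for an existing rocksdb_options_t*.
+
+     rocksdb_filterpolicy_t* bloom = rocksdb_filterpolicy_create_bloom(10);
+     rocksdb_block_based_table_options_t* bbto =
+         rocksdb_block_based_options_create();
+     rocksdb_block_based_options_set_filter_policy(bbto, bloom);
+     rocksdb_options_set_block_based_table_factory(opts, bbto);
+*/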
+
+rocksdb_mergeoperator_t* rocksdb_mergeoperator_create(
+ void* state, void (*destructor)(void*),
+ char* (*full_merge)(void*, const char* key, size_t key_length,
+ const char* existing_value,
+ size_t existing_value_length,
+ const char* const* operands_list,
+ const size_t* operands_list_length, int num_operands,
+ unsigned char* success, size_t* new_value_length),
+ char* (*partial_merge)(void*, const char* key, size_t key_length,
+ const char* const* operands_list,
+ const size_t* operands_list_length, int num_operands,
+ unsigned char* success, size_t* new_value_length),
+ void (*delete_value)(void*, const char* value, size_t value_length),
+ const char* (*name)(void*)) {
+ rocksdb_mergeoperator_t* result = new rocksdb_mergeoperator_t;
+ result->state_ = state;
+ result->destructor_ = destructor;
+ result->full_merge_ = full_merge;
+ result->partial_merge_ = partial_merge;
+ result->delete_value_ = delete_value;
+ result->name_ = name;
+ return result;
+}
+
+void rocksdb_mergeoperator_destroy(rocksdb_mergeoperator_t* merge_operator) {
+ delete merge_operator;
+}
+
+rocksdb_readoptions_t* rocksdb_readoptions_create() {
+ return new rocksdb_readoptions_t;
+}
+
+void rocksdb_readoptions_destroy(rocksdb_readoptions_t* opt) {
+ delete opt;
+}
+
+void rocksdb_readoptions_set_verify_checksums(
+ rocksdb_readoptions_t* opt,
+ unsigned char v) {
+ opt->rep.verify_checksums = v;
+}
+
+void rocksdb_readoptions_set_fill_cache(
+ rocksdb_readoptions_t* opt, unsigned char v) {
+ opt->rep.fill_cache = v;
+}
+
+void rocksdb_readoptions_set_snapshot(
+ rocksdb_readoptions_t* opt,
+ const rocksdb_snapshot_t* snap) {
+ opt->rep.snapshot = (snap ? snap->rep : nullptr);
+}
+
+void rocksdb_readoptions_set_iterate_upper_bound(
+ rocksdb_readoptions_t* opt,
+ const char* key, size_t keylen) {
+ if (key == nullptr) {
+ opt->upper_bound = Slice();
+    opt->rep.iterate_upper_bound = nullptr;
+  } else {
+ opt->upper_bound = Slice(key, keylen);
+ opt->rep.iterate_upper_bound = &opt->upper_bound;
+ }
+}
+
+void rocksdb_readoptions_set_iterate_lower_bound(
+ rocksdb_readoptions_t *opt,
+ const char* key, size_t keylen) {
+ if (key == nullptr) {
+ opt->lower_bound = Slice();
+ opt->rep.iterate_lower_bound = nullptr;
+ } else {
+ opt->lower_bound = Slice(key, keylen);
+ opt->rep.iterate_lower_bound = &opt->lower_bound;
+ }
+}
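+
+/* Usage sketch: bounding an iterator scan with the setters above.  The
+   iterator calls (rocksdb_create_iterator(), rocksdb_iter_*) are assumed
+   from elsewhere in this C API, and `db` stands for an opened rocksdb_t*.
+   Note that the bound Slices reference the caller's buffer, so that buffer
+   must stay alive for as long as the read options are in use.
+
+     rocksdb_readoptions_t* ropts = rocksdb_readoptions_create();
+     const char upper[] = "user:z";
+     rocksdb_readoptions_set_iterate_upper_bound(ropts, upper,
+                                                 sizeof(upper) - 1);
+     rocksdb_iterator_t* it = rocksdb_create_iterator(db, ropts);
+     for (rocksdb_iter_seek(it, "user:", 5); rocksdb_iter_valid(it);
+          rocksdb_iter_next(it)) {
+       size_t klen;
+       const char* k = rocksdb_iter_key(it, &klen);
+       // ... consume k[0..klen) ...
+     }
+     rocksdb_iter_destroy(it);
+     rocksdb_readoptions_destroy(ropts);
+*/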
+
+void rocksdb_readoptions_set_read_tier(
+ rocksdb_readoptions_t* opt, int v) {
+ opt->rep.read_tier = static_cast<ROCKSDB_NAMESPACE::ReadTier>(v);
+}
+
+void rocksdb_readoptions_set_tailing(
+ rocksdb_readoptions_t* opt, unsigned char v) {
+ opt->rep.tailing = v;
+}
+
+void rocksdb_readoptions_set_managed(
+ rocksdb_readoptions_t* opt, unsigned char v) {
+ opt->rep.managed = v;
+}
+
+void rocksdb_readoptions_set_readahead_size(
+ rocksdb_readoptions_t* opt, size_t v) {
+ opt->rep.readahead_size = v;
+}
+
+void rocksdb_readoptions_set_prefix_same_as_start(
+ rocksdb_readoptions_t* opt, unsigned char v) {
+ opt->rep.prefix_same_as_start = v;
+}
+
+void rocksdb_readoptions_set_pin_data(rocksdb_readoptions_t* opt,
+ unsigned char v) {
+ opt->rep.pin_data = v;
+}
+
+void rocksdb_readoptions_set_total_order_seek(rocksdb_readoptions_t* opt,
+ unsigned char v) {
+ opt->rep.total_order_seek = v;
+}
+
+void rocksdb_readoptions_set_max_skippable_internal_keys(
+ rocksdb_readoptions_t* opt,
+ uint64_t v) {
+ opt->rep.max_skippable_internal_keys = v;
+}
+
+void rocksdb_readoptions_set_background_purge_on_iterator_cleanup(
+ rocksdb_readoptions_t* opt, unsigned char v) {
+ opt->rep.background_purge_on_iterator_cleanup = v;
+}
+
+void rocksdb_readoptions_set_ignore_range_deletions(
+ rocksdb_readoptions_t* opt, unsigned char v) {
+ opt->rep.ignore_range_deletions = v;
+}
+
+rocksdb_writeoptions_t* rocksdb_writeoptions_create() {
+ return new rocksdb_writeoptions_t;
+}
+
+void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t* opt) {
+ delete opt;
+}
+
+void rocksdb_writeoptions_set_sync(
+ rocksdb_writeoptions_t* opt, unsigned char v) {
+ opt->rep.sync = v;
+}
+
+void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt, int disable) {
+ opt->rep.disableWAL = disable;
+}
+
+void rocksdb_writeoptions_set_ignore_missing_column_families(
+ rocksdb_writeoptions_t* opt,
+ unsigned char v) {
+ opt->rep.ignore_missing_column_families = v;
+}
+
+void rocksdb_writeoptions_set_no_slowdown(
+ rocksdb_writeoptions_t* opt,
+ unsigned char v) {
+ opt->rep.no_slowdown = v;
+}
+
+void rocksdb_writeoptions_set_low_pri(
+ rocksdb_writeoptions_t* opt,
+ unsigned char v) {
+ opt->rep.low_pri = v;
+}
+
+void rocksdb_writeoptions_set_memtable_insert_hint_per_batch(
+ rocksdb_writeoptions_t* opt, unsigned char v) {
+ opt->rep.memtable_insert_hint_per_batch = v;
+}
+
+rocksdb_compactoptions_t* rocksdb_compactoptions_create() {
+ return new rocksdb_compactoptions_t;
+}
+
+void rocksdb_compactoptions_destroy(rocksdb_compactoptions_t* opt) {
+ delete opt;
+}
+
+void rocksdb_compactoptions_set_bottommost_level_compaction(
+ rocksdb_compactoptions_t* opt, unsigned char v) {
+ opt->rep.bottommost_level_compaction = static_cast<BottommostLevelCompaction>(v);
+}
+
+void rocksdb_compactoptions_set_exclusive_manual_compaction(
+ rocksdb_compactoptions_t* opt, unsigned char v) {
+ opt->rep.exclusive_manual_compaction = v;
+}
+
+void rocksdb_compactoptions_set_change_level(rocksdb_compactoptions_t* opt,
+ unsigned char v) {
+ opt->rep.change_level = v;
+}
+
+void rocksdb_compactoptions_set_target_level(rocksdb_compactoptions_t* opt,
+ int n) {
+ opt->rep.target_level = n;
+}
+
+rocksdb_flushoptions_t* rocksdb_flushoptions_create() {
+ return new rocksdb_flushoptions_t;
+}
+
+void rocksdb_flushoptions_destroy(rocksdb_flushoptions_t* opt) {
+ delete opt;
+}
+
+void rocksdb_flushoptions_set_wait(
+ rocksdb_flushoptions_t* opt, unsigned char v) {
+ opt->rep.wait = v;
+}
+
+rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity) {
+ rocksdb_cache_t* c = new rocksdb_cache_t;
+ c->rep = NewLRUCache(capacity);
+ return c;
+}
+
+void rocksdb_cache_destroy(rocksdb_cache_t* cache) {
+ delete cache;
+}
+
+void rocksdb_cache_set_capacity(rocksdb_cache_t* cache, size_t capacity) {
+ cache->rep->SetCapacity(capacity);
+}
+
+size_t rocksdb_cache_get_usage(rocksdb_cache_t* cache) {
+ return cache->rep->GetUsage();
+}
+
+size_t rocksdb_cache_get_pinned_usage(rocksdb_cache_t* cache) {
+ return cache->rep->GetPinnedUsage();
+}
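+
+/* Usage sketch: sharing an LRU cache as the row cache.  Because the cache
+   handle wraps a shared_ptr, the options keep the cache alive even after
+   the wrapper is destroyed; `opts` stands for an existing rocksdb_options_t*.
+
+     rocksdb_cache_t* cache = rocksdb_cache_create_lru(64 << 20);  // 64 MiB
+     rocksdb_options_set_row_cache(opts, cache);
+     size_t used = rocksdb_cache_get_usage(cache);  // current memory usage
+     rocksdb_cache_destroy(cache);                  // drops only this handle
+*/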
+
+rocksdb_dbpath_t* rocksdb_dbpath_create(const char* path, uint64_t target_size) {
+ rocksdb_dbpath_t* result = new rocksdb_dbpath_t;
+ result->rep.path = std::string(path);
+ result->rep.target_size = target_size;
+ return result;
+}
+
+void rocksdb_dbpath_destroy(rocksdb_dbpath_t* dbpath) {
+ delete dbpath;
+}
+
+rocksdb_env_t* rocksdb_create_default_env() {
+ rocksdb_env_t* result = new rocksdb_env_t;
+ result->rep = Env::Default();
+ result->is_default = true;
+ return result;
+}
+
+rocksdb_env_t* rocksdb_create_mem_env() {
+ rocksdb_env_t* result = new rocksdb_env_t;
+ result->rep = ROCKSDB_NAMESPACE::NewMemEnv(Env::Default());
+ result->is_default = false;
+ return result;
+}
+
+void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n) {
+ env->rep->SetBackgroundThreads(n);
+}
+
+void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n) {
+ env->rep->SetBackgroundThreads(n, Env::HIGH);
+}
+
+void rocksdb_env_join_all_threads(rocksdb_env_t* env) {
+ env->rep->WaitForJoin();
+}
+
+void rocksdb_env_lower_thread_pool_io_priority(rocksdb_env_t* env) {
+ env->rep->LowerThreadPoolIOPriority();
+}
+
+void rocksdb_env_lower_high_priority_thread_pool_io_priority(rocksdb_env_t* env) {
+ env->rep->LowerThreadPoolIOPriority(Env::HIGH);
+}
+
+void rocksdb_env_lower_thread_pool_cpu_priority(rocksdb_env_t* env) {
+ env->rep->LowerThreadPoolCPUPriority();
+}
+
+void rocksdb_env_lower_high_priority_thread_pool_cpu_priority(rocksdb_env_t* env) {
+ env->rep->LowerThreadPoolCPUPriority(Env::HIGH);
+}
+
+void rocksdb_env_destroy(rocksdb_env_t* env) {
+ if (!env->is_default) delete env->rep;
+ delete env;
+}
+
+rocksdb_envoptions_t* rocksdb_envoptions_create() {
+ rocksdb_envoptions_t* opt = new rocksdb_envoptions_t;
+ return opt;
+}
+
+void rocksdb_envoptions_destroy(rocksdb_envoptions_t* opt) { delete opt; }
+
+rocksdb_sstfilewriter_t* rocksdb_sstfilewriter_create(
+ const rocksdb_envoptions_t* env, const rocksdb_options_t* io_options) {
+ rocksdb_sstfilewriter_t* writer = new rocksdb_sstfilewriter_t;
+ writer->rep = new SstFileWriter(env->rep, io_options->rep);
+ return writer;
+}
+
+rocksdb_sstfilewriter_t* rocksdb_sstfilewriter_create_with_comparator(
+ const rocksdb_envoptions_t* env, const rocksdb_options_t* io_options,
+ const rocksdb_comparator_t* /*comparator*/) {
+ rocksdb_sstfilewriter_t* writer = new rocksdb_sstfilewriter_t;
+ writer->rep = new SstFileWriter(env->rep, io_options->rep);
+ return writer;
+}
+
+void rocksdb_sstfilewriter_open(rocksdb_sstfilewriter_t* writer,
+ const char* name, char** errptr) {
+ SaveError(errptr, writer->rep->Open(std::string(name)));
+}
+
+void rocksdb_sstfilewriter_add(rocksdb_sstfilewriter_t* writer, const char* key,
+ size_t keylen, const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr, writer->rep->Put(Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_sstfilewriter_put(rocksdb_sstfilewriter_t* writer, const char* key,
+ size_t keylen, const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr, writer->rep->Put(Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_sstfilewriter_merge(rocksdb_sstfilewriter_t* writer,
+ const char* key, size_t keylen,
+ const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr, writer->rep->Merge(Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_sstfilewriter_delete(rocksdb_sstfilewriter_t* writer,
+ const char* key, size_t keylen,
+ char** errptr) {
+ SaveError(errptr, writer->rep->Delete(Slice(key, keylen)));
+}
+
+void rocksdb_sstfilewriter_finish(rocksdb_sstfilewriter_t* writer,
+ char** errptr) {
+ SaveError(errptr, writer->rep->Finish(nullptr));
+}
+
+void rocksdb_sstfilewriter_file_size(rocksdb_sstfilewriter_t* writer,
+ uint64_t* file_size) {
+ *file_size = writer->rep->FileSize();
+}
+
+void rocksdb_sstfilewriter_destroy(rocksdb_sstfilewriter_t* writer) {
+ delete writer->rep;
+ delete writer;
+}
+
+rocksdb_ingestexternalfileoptions_t*
+rocksdb_ingestexternalfileoptions_create() {
+ rocksdb_ingestexternalfileoptions_t* opt =
+ new rocksdb_ingestexternalfileoptions_t;
+ return opt;
+}
+
+void rocksdb_ingestexternalfileoptions_set_move_files(
+ rocksdb_ingestexternalfileoptions_t* opt, unsigned char move_files) {
+ opt->rep.move_files = move_files;
+}
+
+void rocksdb_ingestexternalfileoptions_set_snapshot_consistency(
+ rocksdb_ingestexternalfileoptions_t* opt,
+ unsigned char snapshot_consistency) {
+ opt->rep.snapshot_consistency = snapshot_consistency;
+}
+
+void rocksdb_ingestexternalfileoptions_set_allow_global_seqno(
+ rocksdb_ingestexternalfileoptions_t* opt,
+ unsigned char allow_global_seqno) {
+ opt->rep.allow_global_seqno = allow_global_seqno;
+}
+
+void rocksdb_ingestexternalfileoptions_set_allow_blocking_flush(
+ rocksdb_ingestexternalfileoptions_t* opt,
+ unsigned char allow_blocking_flush) {
+ opt->rep.allow_blocking_flush = allow_blocking_flush;
+}
+
+void rocksdb_ingestexternalfileoptions_set_ingest_behind(
+ rocksdb_ingestexternalfileoptions_t* opt,
+ unsigned char ingest_behind) {
+ opt->rep.ingest_behind = ingest_behind;
+}
+
+void rocksdb_ingestexternalfileoptions_destroy(
+ rocksdb_ingestexternalfileoptions_t* opt) {
+ delete opt;
+}
+
+void rocksdb_ingest_external_file(
+ rocksdb_t* db, const char* const* file_list, const size_t list_len,
+ const rocksdb_ingestexternalfileoptions_t* opt, char** errptr) {
+ std::vector<std::string> files(list_len);
+ for (size_t i = 0; i < list_len; ++i) {
+ files[i] = std::string(file_list[i]);
+ }
+ SaveError(errptr, db->rep->IngestExternalFile(files, opt->rep));
+}
+
+void rocksdb_ingest_external_file_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* handle,
+ const char* const* file_list, const size_t list_len,
+ const rocksdb_ingestexternalfileoptions_t* opt, char** errptr) {
+ std::vector<std::string> files(list_len);
+ for (size_t i = 0; i < list_len; ++i) {
+ files[i] = std::string(file_list[i]);
+ }
+ SaveError(errptr, db->rep->IngestExternalFile(handle->rep, files, opt->rep));
+}
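+
+/* Usage sketch: building an SST file with the writer above and ingesting it
+   into an open database.  `db` and `opts` stand for an existing rocksdb_t*
+   and rocksdb_options_t*, "/tmp/bulk.sst" is an arbitrary example path,
+   error checks are omitted, and keys must be added in ascending comparator
+   order.
+
+     char* err = NULL;
+     rocksdb_envoptions_t* env_opts = rocksdb_envoptions_create();
+     rocksdb_sstfilewriter_t* writer =
+         rocksdb_sstfilewriter_create(env_opts, opts);
+     rocksdb_sstfilewriter_open(writer, "/tmp/bulk.sst", &err);
+     rocksdb_sstfilewriter_put(writer, "a", 1, "1", 1, &err);
+     rocksdb_sstfilewriter_put(writer, "b", 1, "2", 1, &err);
+     rocksdb_sstfilewriter_finish(writer, &err);
+     rocksdb_sstfilewriter_destroy(writer);
+     rocksdb_envoptions_destroy(env_opts);
+
+     const char* const files[1] = {"/tmp/bulk.sst"};
+     rocksdb_ingestexternalfileoptions_t* ifo =
+         rocksdb_ingestexternalfileoptions_create();
+     rocksdb_ingest_external_file(db, files, 1, ifo, &err);
+     rocksdb_ingestexternalfileoptions_destroy(ifo);
+*/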
+
+void rocksdb_try_catch_up_with_primary(rocksdb_t* db, char** errptr) {
+ SaveError(errptr, db->rep->TryCatchUpWithPrimary());
+}
+
+rocksdb_slicetransform_t* rocksdb_slicetransform_create(
+ void* state,
+ void (*destructor)(void*),
+ char* (*transform)(
+ void*,
+ const char* key, size_t length,
+ size_t* dst_length),
+ unsigned char (*in_domain)(
+ void*,
+ const char* key, size_t length),
+ unsigned char (*in_range)(
+ void*,
+ const char* key, size_t length),
+ const char* (*name)(void*)) {
+ rocksdb_slicetransform_t* result = new rocksdb_slicetransform_t;
+ result->state_ = state;
+ result->destructor_ = destructor;
+ result->transform_ = transform;
+ result->in_domain_ = in_domain;
+ result->in_range_ = in_range;
+ result->name_ = name;
+ return result;
+}
+
+void rocksdb_slicetransform_destroy(rocksdb_slicetransform_t* st) {
+ delete st;
+}
+
+struct Wrapper : public rocksdb_slicetransform_t {
+ const SliceTransform* rep_;
+ ~Wrapper() override { delete rep_; }
+ const char* Name() const override { return rep_->Name(); }
+ Slice Transform(const Slice& src) const override {
+ return rep_->Transform(src);
+ }
+ bool InDomain(const Slice& src) const override {
+ return rep_->InDomain(src);
+ }
+ bool InRange(const Slice& src) const override { return rep_->InRange(src); }
+ static void DoNothing(void*) { }
+};
+
+rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(size_t prefixLen) {
+ Wrapper* wrapper = new Wrapper;
+ wrapper->rep_ = ROCKSDB_NAMESPACE::NewFixedPrefixTransform(prefixLen);
+ wrapper->state_ = nullptr;
+ wrapper->destructor_ = &Wrapper::DoNothing;
+ return wrapper;
+}
+
+rocksdb_slicetransform_t* rocksdb_slicetransform_create_noop() {
+ Wrapper* wrapper = new Wrapper;
+ wrapper->rep_ = ROCKSDB_NAMESPACE::NewNoopTransform();
+ wrapper->state_ = nullptr;
+ wrapper->destructor_ = &Wrapper::DoNothing;
+ return wrapper;
+}
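+
+/* Usage sketch: prefix seeks with a fixed-prefix extractor.  `opts` stands
+   for an existing rocksdb_options_t*; note that
+   rocksdb_options_set_prefix_extractor() takes ownership of the transform,
+   so it must not be destroyed separately afterwards.
+
+     rocksdb_options_set_prefix_extractor(
+         opts, rocksdb_slicetransform_create_fixed_prefix(8));
+     rocksdb_options_set_memtable_prefix_bloom_size_ratio(opts, 0.1);
+
+     rocksdb_readoptions_t* ropts = rocksdb_readoptions_create();
+     rocksdb_readoptions_set_prefix_same_as_start(ropts, 1);
+*/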
+
+rocksdb_universal_compaction_options_t*
+rocksdb_universal_compaction_options_create() {
+  rocksdb_universal_compaction_options_t* result =
+      new rocksdb_universal_compaction_options_t;
+  result->rep = new ROCKSDB_NAMESPACE::CompactionOptionsUniversal;
+  return result;
+}
+
+void rocksdb_universal_compaction_options_set_size_ratio(
+ rocksdb_universal_compaction_options_t* uco, int ratio) {
+ uco->rep->size_ratio = ratio;
+}
+
+void rocksdb_universal_compaction_options_set_min_merge_width(
+ rocksdb_universal_compaction_options_t* uco, int w) {
+ uco->rep->min_merge_width = w;
+}
+
+void rocksdb_universal_compaction_options_set_max_merge_width(
+ rocksdb_universal_compaction_options_t* uco, int w) {
+ uco->rep->max_merge_width = w;
+}
+
+void rocksdb_universal_compaction_options_set_max_size_amplification_percent(
+ rocksdb_universal_compaction_options_t* uco, int p) {
+ uco->rep->max_size_amplification_percent = p;
+}
+
+void rocksdb_universal_compaction_options_set_compression_size_percent(
+ rocksdb_universal_compaction_options_t* uco, int p) {
+ uco->rep->compression_size_percent = p;
+}
+
+void rocksdb_universal_compaction_options_set_stop_style(
+ rocksdb_universal_compaction_options_t* uco, int style) {
+ uco->rep->stop_style =
+ static_cast<ROCKSDB_NAMESPACE::CompactionStopStyle>(style);
+}
+
+void rocksdb_universal_compaction_options_destroy(
+ rocksdb_universal_compaction_options_t* uco) {
+ delete uco->rep;
+ delete uco;
+}
+
+rocksdb_fifo_compaction_options_t* rocksdb_fifo_compaction_options_create() {
+ rocksdb_fifo_compaction_options_t* result = new rocksdb_fifo_compaction_options_t;
+ result->rep = CompactionOptionsFIFO();
+ return result;
+}
+
+void rocksdb_fifo_compaction_options_set_max_table_files_size(
+ rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size) {
+ fifo_opts->rep.max_table_files_size = size;
+}
+
+void rocksdb_fifo_compaction_options_destroy(
+ rocksdb_fifo_compaction_options_t* fifo_opts) {
+ delete fifo_opts;
+}
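+
+/* Usage sketch: enabling FIFO compaction through the wrappers above.  The
+   rocksdb_fifo_compaction compaction-style constant is assumed from the
+   public C header, and `opts` stands for an existing rocksdb_options_t*.
+
+     rocksdb_fifo_compaction_options_t* fifo =
+         rocksdb_fifo_compaction_options_create();
+     rocksdb_fifo_compaction_options_set_max_table_files_size(
+         fifo, 1024ull * 1024 * 1024);  // cap total table size at ~1 GiB
+     rocksdb_options_set_compaction_style(opts, rocksdb_fifo_compaction);
+     // The FIFO options are copied by value, so the wrapper can be freed.
+     rocksdb_options_set_fifo_compaction_options(opts, fifo);
+     rocksdb_fifo_compaction_options_destroy(fifo);
+*/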
+
+void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt, int level) {
+ if (level >= 0) {
+ assert(level <= opt->rep.num_levels);
+ opt->rep.compression_per_level.resize(opt->rep.num_levels);
+ for (int i = 0; i < level; i++) {
+ opt->rep.compression_per_level[i] = ROCKSDB_NAMESPACE::kNoCompression;
+ }
+ for (int i = level; i < opt->rep.num_levels; i++) {
+ opt->rep.compression_per_level[i] = opt->rep.compression;
+ }
+ }
+}
+
+int rocksdb_livefiles_count(
+ const rocksdb_livefiles_t* lf) {
+ return static_cast<int>(lf->rep.size());
+}
+
+const char* rocksdb_livefiles_name(
+ const rocksdb_livefiles_t* lf,
+ int index) {
+ return lf->rep[index].name.c_str();
+}
+
+int rocksdb_livefiles_level(
+ const rocksdb_livefiles_t* lf,
+ int index) {
+ return lf->rep[index].level;
+}
+
+size_t rocksdb_livefiles_size(
+ const rocksdb_livefiles_t* lf,
+ int index) {
+ return lf->rep[index].size;
+}
+
+const char* rocksdb_livefiles_smallestkey(
+ const rocksdb_livefiles_t* lf,
+ int index,
+ size_t* size) {
+ *size = lf->rep[index].smallestkey.size();
+ return lf->rep[index].smallestkey.data();
+}
+
+const char* rocksdb_livefiles_largestkey(
+ const rocksdb_livefiles_t* lf,
+ int index,
+ size_t* size) {
+ *size = lf->rep[index].largestkey.size();
+ return lf->rep[index].largestkey.data();
+}
+
+uint64_t rocksdb_livefiles_entries(
+ const rocksdb_livefiles_t* lf,
+ int index) {
+ return lf->rep[index].num_entries;
+}
+
+uint64_t rocksdb_livefiles_deletions(
+ const rocksdb_livefiles_t* lf,
+ int index) {
+ return lf->rep[index].num_deletions;
+}
+
+extern void rocksdb_livefiles_destroy(
+ const rocksdb_livefiles_t* lf) {
+ delete lf;
+}
+
+void rocksdb_get_options_from_string(const rocksdb_options_t* base_options,
+ const char* opts_str,
+ rocksdb_options_t* new_options,
+ char** errptr) {
+ SaveError(errptr,
+ GetOptionsFromString(base_options->rep, std::string(opts_str),
+ &new_options->rep));
+}
+
+void rocksdb_delete_file_in_range(rocksdb_t* db, const char* start_key,
+ size_t start_key_len, const char* limit_key,
+ size_t limit_key_len, char** errptr) {
+ Slice a, b;
+ SaveError(
+ errptr,
+ DeleteFilesInRange(
+ db->rep, db->rep->DefaultColumnFamily(),
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr)));
+}
+
+void rocksdb_delete_file_in_range_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len, const char* limit_key,
+ size_t limit_key_len, char** errptr) {
+ Slice a, b;
+ SaveError(
+ errptr,
+ DeleteFilesInRange(
+ db->rep, column_family->rep,
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr)));
+}
+
+rocksdb_transactiondb_options_t* rocksdb_transactiondb_options_create() {
+ return new rocksdb_transactiondb_options_t;
+}
+
+void rocksdb_transactiondb_options_destroy(rocksdb_transactiondb_options_t* opt){
+ delete opt;
+}
+
+void rocksdb_transactiondb_options_set_max_num_locks(
+ rocksdb_transactiondb_options_t* opt, int64_t max_num_locks) {
+ opt->rep.max_num_locks = max_num_locks;
+}
+
+void rocksdb_transactiondb_options_set_num_stripes(
+ rocksdb_transactiondb_options_t* opt, size_t num_stripes) {
+ opt->rep.num_stripes = num_stripes;
+}
+
+void rocksdb_transactiondb_options_set_transaction_lock_timeout(
+ rocksdb_transactiondb_options_t* opt, int64_t txn_lock_timeout) {
+ opt->rep.transaction_lock_timeout = txn_lock_timeout;
+}
+
+void rocksdb_transactiondb_options_set_default_lock_timeout(
+ rocksdb_transactiondb_options_t* opt, int64_t default_lock_timeout) {
+ opt->rep.default_lock_timeout = default_lock_timeout;
+}
+
+rocksdb_transaction_options_t* rocksdb_transaction_options_create() {
+ return new rocksdb_transaction_options_t;
+}
+
+void rocksdb_transaction_options_destroy(rocksdb_transaction_options_t* opt) {
+ delete opt;
+}
+
+void rocksdb_transaction_options_set_set_snapshot(
+ rocksdb_transaction_options_t* opt, unsigned char v) {
+ opt->rep.set_snapshot = v;
+}
+
+void rocksdb_transaction_options_set_deadlock_detect(
+ rocksdb_transaction_options_t* opt, unsigned char v) {
+ opt->rep.deadlock_detect = v;
+}
+
+void rocksdb_transaction_options_set_lock_timeout(
+ rocksdb_transaction_options_t* opt, int64_t lock_timeout) {
+ opt->rep.lock_timeout = lock_timeout;
+}
+
+void rocksdb_transaction_options_set_expiration(
+ rocksdb_transaction_options_t* opt, int64_t expiration) {
+ opt->rep.expiration = expiration;
+}
+
+void rocksdb_transaction_options_set_deadlock_detect_depth(
+ rocksdb_transaction_options_t* opt, int64_t depth) {
+ opt->rep.deadlock_detect_depth = depth;
+}
+
+void rocksdb_transaction_options_set_max_write_batch_size(
+ rocksdb_transaction_options_t* opt, size_t size) {
+ opt->rep.max_write_batch_size = size;
+}
+
+rocksdb_optimistictransaction_options_t*
+rocksdb_optimistictransaction_options_create() {
+ return new rocksdb_optimistictransaction_options_t;
+}
+
+void rocksdb_optimistictransaction_options_destroy(
+ rocksdb_optimistictransaction_options_t* opt) {
+ delete opt;
+}
+
+void rocksdb_optimistictransaction_options_set_set_snapshot(
+ rocksdb_optimistictransaction_options_t* opt, unsigned char v) {
+ opt->rep.set_snapshot = v;
+}
+
+rocksdb_column_family_handle_t* rocksdb_transactiondb_create_column_family(
+ rocksdb_transactiondb_t* txn_db,
+ const rocksdb_options_t* column_family_options,
+ const char* column_family_name, char** errptr) {
+ rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t;
+ SaveError(errptr, txn_db->rep->CreateColumnFamily(
+ ColumnFamilyOptions(column_family_options->rep),
+ std::string(column_family_name), &(handle->rep)));
+ return handle;
+}
+
+rocksdb_transactiondb_t* rocksdb_transactiondb_open(
+ const rocksdb_options_t* options,
+ const rocksdb_transactiondb_options_t* txn_db_options, const char* name,
+ char** errptr) {
+ TransactionDB* txn_db;
+ if (SaveError(errptr, TransactionDB::Open(options->rep, txn_db_options->rep,
+ std::string(name), &txn_db))) {
+ return nullptr;
+ }
+ rocksdb_transactiondb_t* result = new rocksdb_transactiondb_t;
+ result->rep = txn_db;
+ return result;
+}
+
+rocksdb_transactiondb_t* rocksdb_transactiondb_open_column_families(
+ const rocksdb_options_t* options,
+ const rocksdb_transactiondb_options_t* txn_db_options, const char* name,
+ int num_column_families, const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (int i = 0; i < num_column_families; i++) {
+ column_families.push_back(ColumnFamilyDescriptor(
+ std::string(column_family_names[i]),
+ ColumnFamilyOptions(column_family_options[i]->rep)));
+ }
+
+ TransactionDB* txn_db;
+ std::vector<ColumnFamilyHandle*> handles;
+ if (SaveError(errptr, TransactionDB::Open(options->rep, txn_db_options->rep,
+ std::string(name), column_families,
+ &handles, &txn_db))) {
+ return nullptr;
+ }
+
+ for (size_t i = 0; i < handles.size(); i++) {
+ rocksdb_column_family_handle_t* c_handle =
+ new rocksdb_column_family_handle_t;
+ c_handle->rep = handles[i];
+ column_family_handles[i] = c_handle;
+ }
+ rocksdb_transactiondb_t* result = new rocksdb_transactiondb_t;
+ result->rep = txn_db;
+ return result;
+}
+
+const rocksdb_snapshot_t* rocksdb_transactiondb_create_snapshot(
+ rocksdb_transactiondb_t* txn_db) {
+ rocksdb_snapshot_t* result = new rocksdb_snapshot_t;
+ result->rep = txn_db->rep->GetSnapshot();
+ return result;
+}
+
+void rocksdb_transactiondb_release_snapshot(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_snapshot_t* snapshot) {
+ txn_db->rep->ReleaseSnapshot(snapshot->rep);
+ delete snapshot;
+}
+
+rocksdb_transaction_t* rocksdb_transaction_begin(
+ rocksdb_transactiondb_t* txn_db,
+ const rocksdb_writeoptions_t* write_options,
+ const rocksdb_transaction_options_t* txn_options,
+ rocksdb_transaction_t* old_txn) {
+ if (old_txn == nullptr) {
+ rocksdb_transaction_t* result = new rocksdb_transaction_t;
+ result->rep = txn_db->rep->BeginTransaction(write_options->rep,
+ txn_options->rep, nullptr);
+ return result;
+ }
+ old_txn->rep = txn_db->rep->BeginTransaction(write_options->rep,
+ txn_options->rep, old_txn->rep);
+ return old_txn;
+}
+
+void rocksdb_transaction_commit(rocksdb_transaction_t* txn, char** errptr) {
+ SaveError(errptr, txn->rep->Commit());
+}
+
+void rocksdb_transaction_rollback(rocksdb_transaction_t* txn, char** errptr) {
+ SaveError(errptr, txn->rep->Rollback());
+}
+
+void rocksdb_transaction_set_savepoint(rocksdb_transaction_t* txn) {
+ txn->rep->SetSavePoint();
+}
+
+void rocksdb_transaction_rollback_to_savepoint(rocksdb_transaction_t* txn, char** errptr) {
+ SaveError(errptr, txn->rep->RollbackToSavePoint());
+}
+
+void rocksdb_transaction_destroy(rocksdb_transaction_t* txn) {
+ delete txn->rep;
+ delete txn;
+}
+
+const rocksdb_snapshot_t* rocksdb_transaction_get_snapshot(
+ rocksdb_transaction_t* txn) {
+ rocksdb_snapshot_t* result = new rocksdb_snapshot_t;
+ result->rep = txn->rep->GetSnapshot();
+ return result;
+}
+
+// Read a key inside a transaction
+char* rocksdb_transaction_get(rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options,
+ const char* key, size_t klen, size_t* vlen,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = txn->rep->Get(options->rep, Slice(key, klen), &tmp);
+ if (s.ok()) {
+ *vlen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vlen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+char* rocksdb_transaction_get_cf(rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, size_t* vlen,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s =
+ txn->rep->Get(options->rep, column_family->rep, Slice(key, klen), &tmp);
+ if (s.ok()) {
+ *vlen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vlen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+// Read a key inside a transaction and mark it for update (GetForUpdate)
+char* rocksdb_transaction_get_for_update(rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options,
+ const char* key, size_t klen,
+ size_t* vlen, unsigned char exclusive,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s =
+ txn->rep->GetForUpdate(options->rep, Slice(key, klen), &tmp, exclusive);
+ if (s.ok()) {
+ *vlen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vlen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+char* rocksdb_transaction_get_for_update_cf(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
+ size_t* vlen, unsigned char exclusive, char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = txn->rep->GetForUpdate(options->rep, column_family->rep,
+ Slice(key, klen), &tmp, exclusive);
+ if (s.ok()) {
+ *vlen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vlen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+// Read a key outside a transaction
+char* rocksdb_transactiondb_get(
+ rocksdb_transactiondb_t* txn_db,
+ const rocksdb_readoptions_t* options,
+ const char* key, size_t klen,
+ size_t* vlen,
+ char** errptr){
+ char* result = nullptr;
+ std::string tmp;
+ Status s = txn_db->rep->Get(options->rep, Slice(key, klen), &tmp);
+ if (s.ok()) {
+ *vlen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vlen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+char* rocksdb_transactiondb_get_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, size_t* vallen, char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = txn_db->rep->Get(options->rep, column_family->rep,
+ Slice(key, keylen), &tmp);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vallen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+// Put a key inside a transaction
+void rocksdb_transaction_put(rocksdb_transaction_t* txn, const char* key,
+ size_t klen, const char* val, size_t vlen,
+ char** errptr) {
+ SaveError(errptr, txn->rep->Put(Slice(key, klen), Slice(val, vlen)));
+}
+
+void rocksdb_transaction_put_cf(rocksdb_transaction_t* txn,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* val,
+ size_t vlen, char** errptr) {
+ SaveError(errptr, txn->rep->Put(column_family->rep, Slice(key, klen),
+ Slice(val, vlen)));
+}
+
+// Put a key outside a transaction
+void rocksdb_transactiondb_put(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_writeoptions_t* options,
+ const char* key, size_t klen, const char* val,
+ size_t vlen, char** errptr) {
+ SaveError(errptr,
+ txn_db->rep->Put(options->rep, Slice(key, klen), Slice(val, vlen)));
+}
+
+void rocksdb_transactiondb_put_cf(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen,
+ const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr, txn_db->rep->Put(options->rep, column_family->rep,
+ Slice(key, keylen), Slice(val, vallen)));
+}
+
+// Write batch into transaction db
+void rocksdb_transactiondb_write(
+ rocksdb_transactiondb_t* db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_writebatch_t* batch,
+ char** errptr) {
+ SaveError(errptr, db->rep->Write(options->rep, &batch->rep));
+}
+
+// Merge a key inside a transaction
+void rocksdb_transaction_merge(rocksdb_transaction_t* txn, const char* key,
+ size_t klen, const char* val, size_t vlen,
+ char** errptr) {
+ SaveError(errptr, txn->rep->Merge(Slice(key, klen), Slice(val, vlen)));
+}
+
+void rocksdb_transaction_merge_cf(rocksdb_transaction_t* txn,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* val,
+ size_t vlen, char** errptr) {
+ SaveError(errptr, txn->rep->Merge(column_family->rep, Slice(key, klen),
+ Slice(val, vlen)));
+}
+
+// Merge a key outside a transaction
+void rocksdb_transactiondb_merge(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_writeoptions_t* options,
+ const char* key, size_t klen, const char* val,
+ size_t vlen, char** errptr) {
+ SaveError(errptr, txn_db->rep->Merge(options->rep, Slice(key, klen),
+ Slice(val, vlen)));
+}
+
+void rocksdb_transactiondb_merge_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
+ const char* val, size_t vlen, char** errptr) {
+ SaveError(errptr, txn_db->rep->Merge(options->rep, column_family->rep,
+ Slice(key, klen), Slice(val, vlen)));
+}
+
+// Delete a key inside a transaction
+void rocksdb_transaction_delete(rocksdb_transaction_t* txn, const char* key,
+ size_t klen, char** errptr) {
+ SaveError(errptr, txn->rep->Delete(Slice(key, klen)));
+}
+
+void rocksdb_transaction_delete_cf(
+ rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, char** errptr) {
+ SaveError(errptr, txn->rep->Delete(column_family->rep, Slice(key, klen)));
+}
+
+// Delete a key outside a transaction
+void rocksdb_transactiondb_delete(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_writeoptions_t* options,
+ const char* key, size_t klen, char** errptr) {
+ SaveError(errptr, txn_db->rep->Delete(options->rep, Slice(key, klen)));
+}
+
+void rocksdb_transactiondb_delete_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, char** errptr) {
+ SaveError(errptr, txn_db->rep->Delete(options->rep, column_family->rep,
+ Slice(key, keylen)));
+}
+
+// Create an iterator inside a transaction
+rocksdb_iterator_t* rocksdb_transaction_create_iterator(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep = txn->rep->GetIterator(options->rep);
+ return result;
+}
+
+// Create an iterator inside a transaction with column family
+rocksdb_iterator_t* rocksdb_transaction_create_iterator_cf(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep = txn->rep->GetIterator(options->rep, column_family->rep);
+ return result;
+}
+
+// Create an iterator outside a transaction
+rocksdb_iterator_t* rocksdb_transactiondb_create_iterator(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep = txn_db->rep->NewIterator(options->rep);
+ return result;
+}
+
+rocksdb_iterator_t* rocksdb_transactiondb_create_iterator_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep = txn_db->rep->NewIterator(options->rep, column_family->rep);
+ return result;
+}
+
+void rocksdb_transactiondb_close(rocksdb_transactiondb_t* txn_db) {
+ delete txn_db->rep;
+ delete txn_db;
+}
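+
+/* Usage sketch: a minimal pessimistic transaction round trip using the
+   wrappers above.  Error handling is omitted, "/tmp/txndb" is an arbitrary
+   example path, and rocksdb_options_create(), rocksdb_options_destroy() and
+   rocksdb_options_set_create_if_missing() are assumed from elsewhere in
+   this C API.
+
+     char* err = NULL;
+     rocksdb_options_t* opts = rocksdb_options_create();
+     rocksdb_options_set_create_if_missing(opts, 1);
+     rocksdb_transactiondb_options_t* txn_db_opts =
+         rocksdb_transactiondb_options_create();
+     rocksdb_transactiondb_t* txn_db =
+         rocksdb_transactiondb_open(opts, txn_db_opts, "/tmp/txndb", &err);
+
+     rocksdb_writeoptions_t* wopts = rocksdb_writeoptions_create();
+     rocksdb_transaction_options_t* txn_opts =
+         rocksdb_transaction_options_create();
+     rocksdb_transaction_t* txn =
+         rocksdb_transaction_begin(txn_db, wopts, txn_opts, NULL);
+     rocksdb_transaction_put(txn, "key", 3, "value", 5, &err);
+     rocksdb_transaction_commit(txn, &err);
+     rocksdb_transaction_destroy(txn);
+
+     rocksdb_transaction_options_destroy(txn_opts);
+     rocksdb_writeoptions_destroy(wopts);
+     rocksdb_transactiondb_options_destroy(txn_db_opts);
+     rocksdb_options_destroy(opts);
+     rocksdb_transactiondb_close(txn_db);
+*/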
+
+rocksdb_checkpoint_t* rocksdb_transactiondb_checkpoint_object_create(
+ rocksdb_transactiondb_t* txn_db, char** errptr) {
+ Checkpoint* checkpoint;
+ if (SaveError(errptr, Checkpoint::Create(txn_db->rep, &checkpoint))) {
+ return nullptr;
+ }
+ rocksdb_checkpoint_t* result = new rocksdb_checkpoint_t;
+ result->rep = checkpoint;
+ return result;
+}
+
+rocksdb_optimistictransactiondb_t* rocksdb_optimistictransactiondb_open(
+ const rocksdb_options_t* options, const char* name, char** errptr) {
+ OptimisticTransactionDB* otxn_db;
+ if (SaveError(errptr, OptimisticTransactionDB::Open(
+ options->rep, std::string(name), &otxn_db))) {
+ return nullptr;
+ }
+ rocksdb_optimistictransactiondb_t* result =
+ new rocksdb_optimistictransactiondb_t;
+ result->rep = otxn_db;
+ return result;
+}
+
+rocksdb_optimistictransactiondb_t*
+rocksdb_optimistictransactiondb_open_column_families(
+ const rocksdb_options_t* db_options, const char* name,
+ int num_column_families, const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (int i = 0; i < num_column_families; i++) {
+ column_families.push_back(ColumnFamilyDescriptor(
+ std::string(column_family_names[i]),
+ ColumnFamilyOptions(column_family_options[i]->rep)));
+ }
+
+ OptimisticTransactionDB* otxn_db;
+ std::vector<ColumnFamilyHandle*> handles;
+ if (SaveError(errptr, OptimisticTransactionDB::Open(
+ DBOptions(db_options->rep), std::string(name),
+ column_families, &handles, &otxn_db))) {
+ return nullptr;
+ }
+
+ for (size_t i = 0; i < handles.size(); i++) {
+ rocksdb_column_family_handle_t* c_handle =
+ new rocksdb_column_family_handle_t;
+ c_handle->rep = handles[i];
+ column_family_handles[i] = c_handle;
+ }
+ rocksdb_optimistictransactiondb_t* result =
+ new rocksdb_optimistictransactiondb_t;
+ result->rep = otxn_db;
+ return result;
+}
+
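+// Get a wrapper around the base DB of an OptimisticTransactionDB (NULL if
+// none); release the wrapper with rocksdb_optimistictransactiondb_close_base_db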
+rocksdb_t* rocksdb_optimistictransactiondb_get_base_db(
+ rocksdb_optimistictransactiondb_t* otxn_db) {
+ DB* base_db = otxn_db->rep->GetBaseDB();
+
+ if (base_db != nullptr) {
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = base_db;
+ return result;
+ }
+
+ return nullptr;
+}
+
+void rocksdb_optimistictransactiondb_close_base_db(rocksdb_t* base_db) {
+ delete base_db;
+}
+
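+// Begin an optimistic transaction; pass old_txn to reuse an existing
+// transaction handle instead of allocating a new one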
+rocksdb_transaction_t* rocksdb_optimistictransaction_begin(
+ rocksdb_optimistictransactiondb_t* otxn_db,
+ const rocksdb_writeoptions_t* write_options,
+ const rocksdb_optimistictransaction_options_t* otxn_options,
+ rocksdb_transaction_t* old_txn) {
+ if (old_txn == nullptr) {
+ rocksdb_transaction_t* result = new rocksdb_transaction_t;
+ result->rep = otxn_db->rep->BeginTransaction(write_options->rep,
+ otxn_options->rep, nullptr);
+ return result;
+ }
+ old_txn->rep = otxn_db->rep->BeginTransaction(
+ write_options->rep, otxn_options->rep, old_txn->rep);
+ return old_txn;
+}
+
+void rocksdb_optimistictransactiondb_close(
+ rocksdb_optimistictransactiondb_t* otxn_db) {
+ delete otxn_db->rep;
+ delete otxn_db;
+}
+
+void rocksdb_free(void* ptr) { free(ptr); }
+
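+// Get a value as a pinnable slice; returns NULL (with no error set) when the
+// key is not found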
+rocksdb_pinnableslice_t* rocksdb_get_pinned(
+ rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
+ size_t keylen, char** errptr) {
+ rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+ Status s = db->rep->Get(options->rep, db->rep->DefaultColumnFamily(),
+ Slice(key, keylen), &v->rep);
+ if (!s.ok()) {
+ delete v;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ return nullptr;
+ }
+ return v;
+}
+
+rocksdb_pinnableslice_t* rocksdb_get_pinned_cf(
+ rocksdb_t* db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, char** errptr) {
+ rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+ Status s = db->rep->Get(options->rep, column_family->rep, Slice(key, keylen),
+ &v->rep);
+ if (!s.ok()) {
+ delete v;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ return nullptr;
+ }
+ return v;
+}
+
+void rocksdb_pinnableslice_destroy(rocksdb_pinnableslice_t* v) { delete v; }
+
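+// Return the value bytes of a pinnable slice and store its length in *vlen;
+// returns NULL and sets *vlen to 0 for a NULL slice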
+const char* rocksdb_pinnableslice_value(const rocksdb_pinnableslice_t* v,
+ size_t* vlen) {
+ if (!v) {
+ *vlen = 0;
+ return nullptr;
+ }
+
+ *vlen = v->rep.size();
+ return v->rep.data();
+}
+
+// container to keep databases and caches in order to use
+// ROCKSDB_NAMESPACE::MemoryUtil
+struct rocksdb_memory_consumers_t {
+ std::vector<rocksdb_t*> dbs;
+ std::unordered_set<rocksdb_cache_t*> caches;
+};
+
+// initializes new container of memory consumers
+rocksdb_memory_consumers_t* rocksdb_memory_consumers_create() {
+ return new rocksdb_memory_consumers_t;
+}
+
+// adds database to the container of memory consumers
+void rocksdb_memory_consumers_add_db(rocksdb_memory_consumers_t* consumers,
+ rocksdb_t* db) {
+ consumers->dbs.push_back(db);
+}
+
+// adds cache to the container of memory consumers
+void rocksdb_memory_consumers_add_cache(rocksdb_memory_consumers_t* consumers,
+ rocksdb_cache_t* cache) {
+ consumers->caches.insert(cache);
+}
+
+// deletes container with memory consumers
+void rocksdb_memory_consumers_destroy(rocksdb_memory_consumers_t* consumers) {
+ delete consumers;
+}
+
+// contains memory usage statistics provided by ROCKSDB_NAMESPACE::MemoryUtil
+struct rocksdb_memory_usage_t {
+ uint64_t mem_table_total;
+ uint64_t mem_table_unflushed;
+ uint64_t mem_table_readers_total;
+ uint64_t cache_total;
+};
+
+// estimates amount of memory occupied by consumers (dbs and caches)
+rocksdb_memory_usage_t* rocksdb_approximate_memory_usage_create(
+ rocksdb_memory_consumers_t* consumers, char** errptr) {
+
+ vector<DB*> dbs;
+ for (auto db : consumers->dbs) {
+ dbs.push_back(db->rep);
+ }
+
+ unordered_set<const Cache*> cache_set;
+ for (auto cache : consumers->caches) {
+ cache_set.insert(const_cast<const Cache*>(cache->rep.get()));
+ }
+
+ std::map<ROCKSDB_NAMESPACE::MemoryUtil::UsageType, uint64_t> usage_by_type;
+
+ auto status = MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set,
+ &usage_by_type);
+ if (SaveError(errptr, status)) {
+ return nullptr;
+ }
+
+ auto result = new rocksdb_memory_usage_t;
+ result->mem_table_total = usage_by_type[MemoryUtil::kMemTableTotal];
+ result->mem_table_unflushed = usage_by_type[MemoryUtil::kMemTableUnFlushed];
+ result->mem_table_readers_total = usage_by_type[MemoryUtil::kTableReadersTotal];
+ result->cache_total = usage_by_type[MemoryUtil::kCacheTotal];
+ return result;
+}
+
+uint64_t rocksdb_approximate_memory_usage_get_mem_table_total(
+ rocksdb_memory_usage_t* memory_usage) {
+ return memory_usage->mem_table_total;
+}
+
+uint64_t rocksdb_approximate_memory_usage_get_mem_table_unflushed(
+ rocksdb_memory_usage_t* memory_usage) {
+ return memory_usage->mem_table_unflushed;
+}
+
+uint64_t rocksdb_approximate_memory_usage_get_mem_table_readers_total(
+ rocksdb_memory_usage_t* memory_usage) {
+ return memory_usage->mem_table_readers_total;
+}
+
+uint64_t rocksdb_approximate_memory_usage_get_cache_total(
+ rocksdb_memory_usage_t* memory_usage) {
+ return memory_usage->cache_total;
+}
+
+// deletes container with memory usage estimates
+void rocksdb_approximate_memory_usage_destroy(rocksdb_memory_usage_t* usage) {
+ delete usage;
+}
+
+} // end extern "C"
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/c_test.c b/src/rocksdb/db/c_test.c
new file mode 100644
index 000000000..cf2e266f9
--- /dev/null
+++ b/src/rocksdb/db/c_test.c
@@ -0,0 +1,1866 @@
+/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+ Use of this source code is governed by a BSD-style license that can be
+ found in the LICENSE file. See the AUTHORS file for names of contributors. */
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+#include <stdio.h>
+
+#ifndef ROCKSDB_LITE // Lite does not support C API
+
+#include "rocksdb/c.h"
+
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#ifndef OS_WIN
+#include <unistd.h>
+#endif
+#include <inttypes.h>
+
+// Cannot use port/port.h macros as this is a C file
+#ifdef OS_WIN
+#include <windows.h>
+
+// A process/thread-derived ID is OK for the uniqueness needed by test paths
+int geteuid() {
+ int result = 0;
+
+ result = ((int)GetCurrentProcessId() << 16);
+ result |= (int)GetCurrentThreadId();
+
+ return result;
+}
+
+// VS < 2015
+#if defined(_MSC_VER) && (_MSC_VER < 1900)
+#define snprintf _snprintf
+#endif
+
+#endif
+
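+// Current test phase and scratch paths, filled in from GetTempDir() in main()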
+const char* phase = "";
+static char dbname[200];
+static char sstfilename[200];
+static char dbbackupname[200];
+static char dbcheckpointname[200];
+static char dbpathname[200];
+static char secondary_path[200];
+
+static void StartPhase(const char* name) {
+ fprintf(stderr, "=== Test %s\n", name);
+ phase = name;
+}
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning (disable: 4996) // getenv security warning
+#endif
+static const char* GetTempDir(void) {
+ const char* ret = getenv("TEST_TMPDIR");
+ if (ret == NULL || ret[0] == '\0')
+ ret = "/tmp";
+ return ret;
+}
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+#define CheckNoError(err) \
+ if ((err) != NULL) { \
+ fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, (err)); \
+ abort(); \
+ }
+
+#define CheckCondition(cond) \
+ if (!(cond)) { \
+ fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, #cond); \
+ abort(); \
+ }
+
+static void CheckEqual(const char* expected, const char* v, size_t n) {
+ if (expected == NULL && v == NULL) {
+ // ok
+ } else if (expected != NULL && v != NULL && n == strlen(expected) &&
+ memcmp(expected, v, n) == 0) {
+ // ok
+ return;
+ } else {
+ fprintf(stderr, "%s: expected '%s', got '%s'\n",
+ phase,
+ (expected ? expected : "(null)"),
+ (v ? v : "(null)"));
+ abort();
+ }
+}
+
+static void Free(char** ptr) {
+ if (*ptr) {
+ free(*ptr);
+ *ptr = NULL;
+ }
+}
+
+static void CheckValue(
+ char* err,
+ const char* expected,
+ char** actual,
+ size_t actual_length) {
+ CheckNoError(err);
+ CheckEqual(expected, *actual, actual_length);
+ Free(actual);
+}
+
+static void CheckGet(
+ rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ const char* key,
+ const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ char* val;
+ val = rocksdb_get(db, options, key, strlen(key), &val_len, &err);
+ CheckNoError(err);
+ CheckEqual(expected, val, val_len);
+ Free(&val);
+}
+
+static void CheckGetCF(
+ rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* handle,
+ const char* key,
+ const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ char* val;
+ val = rocksdb_get_cf(db, options, handle, key, strlen(key), &val_len, &err);
+ CheckNoError(err);
+ CheckEqual(expected, val, val_len);
+ Free(&val);
+}
+
+static void CheckPinGet(rocksdb_t* db, const rocksdb_readoptions_t* options,
+ const char* key, const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ const char* val;
+ rocksdb_pinnableslice_t* p;
+ p = rocksdb_get_pinned(db, options, key, strlen(key), &err);
+ CheckNoError(err);
+ val = rocksdb_pinnableslice_value(p, &val_len);
+ CheckEqual(expected, val, val_len);
+ rocksdb_pinnableslice_destroy(p);
+}
+
+static void CheckPinGetCF(rocksdb_t* db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* handle,
+ const char* key, const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ const char* val;
+ rocksdb_pinnableslice_t* p;
+ p = rocksdb_get_pinned_cf(db, options, handle, key, strlen(key), &err);
+ CheckNoError(err);
+ val = rocksdb_pinnableslice_value(p, &val_len);
+ CheckEqual(expected, val, val_len);
+ rocksdb_pinnableslice_destroy(p);
+}
+
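+// Check that the iterator is positioned at the expected key/value pair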
+static void CheckIter(rocksdb_iterator_t* iter,
+ const char* key, const char* val) {
+ size_t len;
+ const char* str;
+ str = rocksdb_iter_key(iter, &len);
+ CheckEqual(key, str, len);
+ str = rocksdb_iter_value(iter, &len);
+ CheckEqual(val, str, len);
+}
+
+// Callback from rocksdb_writebatch_iterate()
+static void CheckPut(void* ptr,
+ const char* k, size_t klen,
+ const char* v, size_t vlen) {
+ int* state = (int*) ptr;
+ CheckCondition(*state < 2);
+ switch (*state) {
+ case 0:
+ CheckEqual("bar", k, klen);
+ CheckEqual("b", v, vlen);
+ break;
+ case 1:
+ CheckEqual("box", k, klen);
+ CheckEqual("c", v, vlen);
+ break;
+ }
+ (*state)++;
+}
+
+// Callback from rocksdb_writebatch_iterate()
+static void CheckDel(void* ptr, const char* k, size_t klen) {
+ int* state = (int*) ptr;
+ CheckCondition(*state == 2);
+ CheckEqual("bar", k, klen);
+ (*state)++;
+}
+
+static void CmpDestroy(void* arg) { (void)arg; }
+
+static int CmpCompare(void* arg, const char* a, size_t alen,
+ const char* b, size_t blen) {
+ (void)arg;
+ size_t n = (alen < blen) ? alen : blen;
+ int r = memcmp(a, b, n);
+ if (r == 0) {
+ if (alen < blen) r = -1;
+ else if (alen > blen) r = +1;
+ }
+ return r;
+}
+
+static const char* CmpName(void* arg) {
+ (void)arg;
+ return "foo";
+}
+
+// Custom filter policy
+static unsigned char fake_filter_result = 1;
+static void FilterDestroy(void* arg) { (void)arg; }
+static const char* FilterName(void* arg) {
+ (void)arg;
+ return "TestFilter";
+}
+static char* FilterCreate(
+ void* arg,
+ const char* const* key_array, const size_t* key_length_array,
+ int num_keys,
+ size_t* filter_length) {
+ (void)arg;
+ (void)key_array;
+ (void)key_length_array;
+ (void)num_keys;
+ *filter_length = 4;
+ char* result = malloc(4);
+ memcpy(result, "fake", 4);
+ return result;
+}
+static unsigned char FilterKeyMatch(
+ void* arg,
+ const char* key, size_t length,
+ const char* filter, size_t filter_length) {
+ (void)arg;
+ (void)key;
+ (void)length;
+ CheckCondition(filter_length == 4);
+ CheckCondition(memcmp(filter, "fake", 4) == 0);
+ return fake_filter_result;
+}
+
+// Custom compaction filter
+static void CFilterDestroy(void* arg) { (void)arg; }
+static const char* CFilterName(void* arg) {
+ (void)arg;
+ return "foo";
+}
+static unsigned char CFilterFilter(void* arg, int level, const char* key,
+ size_t key_length,
+ const char* existing_value,
+ size_t value_length, char** new_value,
+ size_t* new_value_length,
+ unsigned char* value_changed) {
+ (void)arg;
+ (void)level;
+ (void)existing_value;
+ (void)value_length;
+ if (key_length == 3) {
+ if (memcmp(key, "bar", key_length) == 0) {
+ return 1;
+ } else if (memcmp(key, "baz", key_length) == 0) {
+ *value_changed = 1;
+ *new_value = "newbazvalue";
+ *new_value_length = 11;
+ return 0;
+ }
+ }
+ return 0;
+}
+
+static void CFilterFactoryDestroy(void* arg) { (void)arg; }
+static const char* CFilterFactoryName(void* arg) {
+ (void)arg;
+ return "foo";
+}
+static rocksdb_compactionfilter_t* CFilterCreate(
+ void* arg, rocksdb_compactionfiltercontext_t* context) {
+ (void)arg;
+ (void)context;
+ return rocksdb_compactionfilter_create(NULL, CFilterDestroy, CFilterFilter,
+ CFilterName);
+}
+
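+// Open the database with the given options, write foo/bar/baz, force a
+// compaction, and verify the compaction filter dropped "bar" and rewrote "baz"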
+static rocksdb_t* CheckCompaction(rocksdb_t* db, rocksdb_options_t* options,
+ rocksdb_readoptions_t* roptions,
+ rocksdb_writeoptions_t* woptions) {
+ char* err = NULL;
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", "foovalue");
+ rocksdb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "bar", "barvalue");
+ rocksdb_put(db, woptions, "baz", 3, "bazvalue", 8, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "baz", "bazvalue");
+
+ // Force compaction
+ rocksdb_compact_range(db, NULL, 0, NULL, 0);
+ // should have filtered bar, but not foo
+ CheckGet(db, roptions, "foo", "foovalue");
+ CheckGet(db, roptions, "bar", NULL);
+ CheckGet(db, roptions, "baz", "newbazvalue");
+ return db;
+}
+
+// Custom merge operator
+static void MergeOperatorDestroy(void* arg) { (void)arg; }
+static const char* MergeOperatorName(void* arg) {
+ (void)arg;
+ return "TestMergeOperator";
+}
+static char* MergeOperatorFullMerge(
+ void* arg,
+ const char* key, size_t key_length,
+ const char* existing_value, size_t existing_value_length,
+ const char* const* operands_list, const size_t* operands_list_length,
+ int num_operands,
+ unsigned char* success, size_t* new_value_length) {
+ (void)arg;
+ (void)key;
+ (void)key_length;
+ (void)existing_value;
+ (void)existing_value_length;
+ (void)operands_list;
+ (void)operands_list_length;
+ (void)num_operands;
+ *new_value_length = 4;
+ *success = 1;
+ char* result = malloc(4);
+ memcpy(result, "fake", 4);
+ return result;
+}
+static char* MergeOperatorPartialMerge(
+ void* arg,
+ const char* key, size_t key_length,
+ const char* const* operands_list, const size_t* operands_list_length,
+ int num_operands,
+ unsigned char* success, size_t* new_value_length) {
+ (void)arg;
+ (void)key;
+ (void)key_length;
+ (void)operands_list;
+ (void)operands_list_length;
+ (void)num_operands;
+ *new_value_length = 4;
+ *success = 1;
+ char* result = malloc(4);
+ memcpy(result, "fake", 4);
+ return result;
+}
+
+static void CheckTxnGet(
+ rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options,
+ const char* key,
+ const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ char* val;
+ val = rocksdb_transaction_get(txn, options, key, strlen(key), &val_len, &err);
+ CheckNoError(err);
+ CheckEqual(expected, val, val_len);
+ Free(&val);
+}
+
+static void CheckTxnGetCF(rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ char* val;
+ val = rocksdb_transaction_get_cf(txn, options, column_family, key,
+ strlen(key), &val_len, &err);
+ CheckNoError(err);
+ CheckEqual(expected, val, val_len);
+ Free(&val);
+}
+
+static void CheckTxnDBGet(
+ rocksdb_transactiondb_t* txn_db,
+ const rocksdb_readoptions_t* options,
+ const char* key,
+ const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ char* val;
+ val = rocksdb_transactiondb_get(txn_db, options, key, strlen(key), &val_len, &err);
+ CheckNoError(err);
+ CheckEqual(expected, val, val_len);
+ Free(&val);
+}
+
+static void CheckTxnDBGetCF(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ char* val;
+ val = rocksdb_transactiondb_get_cf(txn_db, options, column_family, key,
+ strlen(key), &val_len, &err);
+ CheckNoError(err);
+ CheckEqual(expected, val, val_len);
+ Free(&val);
+}
+
+int main(int argc, char** argv) {
+ (void)argc;
+ (void)argv;
+ rocksdb_t* db;
+ rocksdb_comparator_t* cmp;
+ rocksdb_cache_t* cache;
+ rocksdb_dbpath_t *dbpath;
+ rocksdb_env_t* env;
+ rocksdb_options_t* options;
+ rocksdb_compactoptions_t* coptions;
+ rocksdb_block_based_table_options_t* table_options;
+ rocksdb_readoptions_t* roptions;
+ rocksdb_writeoptions_t* woptions;
+ rocksdb_ratelimiter_t* rate_limiter;
+ rocksdb_transactiondb_t* txn_db;
+ rocksdb_transactiondb_options_t* txn_db_options;
+ rocksdb_transaction_t* txn;
+ rocksdb_transaction_options_t* txn_options;
+ rocksdb_optimistictransactiondb_t* otxn_db;
+ rocksdb_optimistictransaction_options_t* otxn_options;
+ char* err = NULL;
+ int run = -1;
+
+ snprintf(dbname, sizeof(dbname),
+ "%s/rocksdb_c_test-%d",
+ GetTempDir(),
+ ((int) geteuid()));
+
+ snprintf(dbbackupname, sizeof(dbbackupname),
+ "%s/rocksdb_c_test-%d-backup",
+ GetTempDir(),
+ ((int) geteuid()));
+
+ snprintf(dbcheckpointname, sizeof(dbcheckpointname),
+ "%s/rocksdb_c_test-%d-checkpoint",
+ GetTempDir(),
+ ((int) geteuid()));
+
+ snprintf(sstfilename, sizeof(sstfilename),
+ "%s/rocksdb_c_test-%d-sst",
+ GetTempDir(),
+ ((int)geteuid()));
+
+ snprintf(dbpathname, sizeof(dbpathname),
+ "%s/rocksdb_c_test-%d-dbpath",
+ GetTempDir(),
+ ((int) geteuid()));
+
+ StartPhase("create_objects");
+ cmp = rocksdb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName);
+ dbpath = rocksdb_dbpath_create(dbpathname, 1024 * 1024);
+ env = rocksdb_create_default_env();
+ cache = rocksdb_cache_create_lru(100000);
+
+ options = rocksdb_options_create();
+ rocksdb_options_set_comparator(options, cmp);
+ rocksdb_options_set_error_if_exists(options, 1);
+ rocksdb_options_set_env(options, env);
+ rocksdb_options_set_info_log(options, NULL);
+ rocksdb_options_set_write_buffer_size(options, 100000);
+ rocksdb_options_set_paranoid_checks(options, 1);
+ rocksdb_options_set_max_open_files(options, 10);
+ rocksdb_options_set_base_background_compactions(options, 1);
+
+ table_options = rocksdb_block_based_options_create();
+ rocksdb_block_based_options_set_block_cache(table_options, cache);
+ rocksdb_block_based_options_set_data_block_index_type(table_options, 1);
+ rocksdb_block_based_options_set_data_block_hash_ratio(table_options, 0.75);
+ rocksdb_options_set_block_based_table_factory(options, table_options);
+
+ rocksdb_options_set_compression(options, rocksdb_no_compression);
+ rocksdb_options_set_compression_options(options, -14, -1, 0, 0);
+ int compression_levels[] = {rocksdb_no_compression, rocksdb_no_compression,
+ rocksdb_no_compression, rocksdb_no_compression};
+ rocksdb_options_set_compression_per_level(options, compression_levels, 4);
+ rate_limiter = rocksdb_ratelimiter_create(1000 * 1024 * 1024, 100 * 1000, 10);
+ rocksdb_options_set_ratelimiter(options, rate_limiter);
+ rocksdb_ratelimiter_destroy(rate_limiter);
+
+ roptions = rocksdb_readoptions_create();
+ rocksdb_readoptions_set_verify_checksums(roptions, 1);
+ rocksdb_readoptions_set_fill_cache(roptions, 1);
+
+ woptions = rocksdb_writeoptions_create();
+ rocksdb_writeoptions_set_sync(woptions, 1);
+
+ coptions = rocksdb_compactoptions_create();
+ rocksdb_compactoptions_set_exclusive_manual_compaction(coptions, 1);
+
+ StartPhase("destroy");
+ rocksdb_destroy_db(options, dbname, &err);
+ Free(&err);
+
+ StartPhase("open_error");
+ rocksdb_open(options, dbname, &err);
+ CheckCondition(err != NULL);
+ Free(&err);
+
+ StartPhase("open");
+ rocksdb_options_set_create_if_missing(options, 1);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", NULL);
+
+ StartPhase("put");
+ rocksdb_put(db, woptions, "foo", 3, "hello", 5, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", "hello");
+
+ StartPhase("backup_and_restore");
+ {
+ rocksdb_destroy_db(options, dbbackupname, &err);
+ CheckNoError(err);
+
+ rocksdb_backup_engine_t *be = rocksdb_backup_engine_open(options, dbbackupname, &err);
+ CheckNoError(err);
+
+ rocksdb_backup_engine_create_new_backup(be, db, &err);
+ CheckNoError(err);
+
+ // need a change to trigger a new backup
+ rocksdb_delete(db, woptions, "does-not-exist", 14, &err);
+ CheckNoError(err);
+
+ rocksdb_backup_engine_create_new_backup(be, db, &err);
+ CheckNoError(err);
+
+ const rocksdb_backup_engine_info_t* bei = rocksdb_backup_engine_get_backup_info(be);
+ CheckCondition(rocksdb_backup_engine_info_count(bei) > 1);
+ rocksdb_backup_engine_info_destroy(bei);
+
+ rocksdb_backup_engine_purge_old_backups(be, 1, &err);
+ CheckNoError(err);
+
+ bei = rocksdb_backup_engine_get_backup_info(be);
+ CheckCondition(rocksdb_backup_engine_info_count(bei) == 1);
+ rocksdb_backup_engine_info_destroy(bei);
+
+ rocksdb_delete(db, woptions, "foo", 3, &err);
+ CheckNoError(err);
+
+ rocksdb_close(db);
+
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_restore_options_t *restore_options = rocksdb_restore_options_create();
+ rocksdb_restore_options_set_keep_log_files(restore_options, 0);
+ rocksdb_backup_engine_restore_db_from_latest_backup(be, dbname, dbname, restore_options, &err);
+ CheckNoError(err);
+ rocksdb_restore_options_destroy(restore_options);
+
+ rocksdb_options_set_error_if_exists(options, 0);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_options_set_error_if_exists(options, 1);
+
+ CheckGet(db, roptions, "foo", "hello");
+
+ rocksdb_backup_engine_close(be);
+ }
+
+ StartPhase("checkpoint");
+ {
+ rocksdb_destroy_db(options, dbcheckpointname, &err);
+ CheckNoError(err);
+
+ rocksdb_checkpoint_t* checkpoint = rocksdb_checkpoint_object_create(db, &err);
+ CheckNoError(err);
+
+ rocksdb_checkpoint_create(checkpoint, dbcheckpointname, 0, &err);
+ CheckNoError(err);
+
+ // start a new database from the checkpoint
+ rocksdb_close(db);
+ rocksdb_options_set_error_if_exists(options, 0);
+ db = rocksdb_open(options, dbcheckpointname, &err);
+ CheckNoError(err);
+
+ CheckGet(db, roptions, "foo", "hello");
+
+ rocksdb_checkpoint_object_destroy(checkpoint);
+
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbcheckpointname, &err);
+ CheckNoError(err);
+
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_options_set_error_if_exists(options, 1);
+ }
+
+ StartPhase("compactall");
+ rocksdb_compact_range(db, NULL, 0, NULL, 0);
+ CheckGet(db, roptions, "foo", "hello");
+
+ StartPhase("compactrange");
+ rocksdb_compact_range(db, "a", 1, "z", 1);
+ CheckGet(db, roptions, "foo", "hello");
+
+ StartPhase("compactallopt");
+ rocksdb_compact_range_opt(db, coptions, NULL, 0, NULL, 0);
+ CheckGet(db, roptions, "foo", "hello");
+
+ StartPhase("compactrangeopt");
+ rocksdb_compact_range_opt(db, coptions, "a", 1, "z", 1);
+ CheckGet(db, roptions, "foo", "hello");
+
+ // Simple check of cache usage
+ StartPhase("cache_usage");
+ {
+ rocksdb_readoptions_set_pin_data(roptions, 1);
+ rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+ rocksdb_iter_seek(iter, "foo", 3);
+
+ size_t usage = rocksdb_cache_get_usage(cache);
+ CheckCondition(usage > 0);
+
+ size_t pin_usage = rocksdb_cache_get_pinned_usage(cache);
+ CheckCondition(pin_usage > 0);
+
+ rocksdb_iter_next(iter);
+ rocksdb_iter_destroy(iter);
+ rocksdb_readoptions_set_pin_data(roptions, 0);
+ }
+
+ StartPhase("addfile");
+ {
+ rocksdb_envoptions_t* env_opt = rocksdb_envoptions_create();
+ rocksdb_options_t* io_options = rocksdb_options_create();
+ rocksdb_sstfilewriter_t* writer =
+ rocksdb_sstfilewriter_create(env_opt, io_options);
+
+ remove(sstfilename);
+ rocksdb_sstfilewriter_open(writer, sstfilename, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "sstk1", 5, "v1", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "sstk2", 5, "v2", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "sstk3", 5, "v3", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_finish(writer, &err);
+ CheckNoError(err);
+
+ rocksdb_ingestexternalfileoptions_t* ing_opt =
+ rocksdb_ingestexternalfileoptions_create();
+ const char* file_list[1] = {sstfilename};
+ rocksdb_ingest_external_file(db, file_list, 1, ing_opt, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "sstk1", "v1");
+ CheckGet(db, roptions, "sstk2", "v2");
+ CheckGet(db, roptions, "sstk3", "v3");
+
+ remove(sstfilename);
+ rocksdb_sstfilewriter_open(writer, sstfilename, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "sstk2", 5, "v4", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "sstk22", 6, "v5", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "sstk3", 5, "v6", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_finish(writer, &err);
+ CheckNoError(err);
+
+ rocksdb_ingest_external_file(db, file_list, 1, ing_opt, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "sstk1", "v1");
+ CheckGet(db, roptions, "sstk2", "v4");
+ CheckGet(db, roptions, "sstk22", "v5");
+ CheckGet(db, roptions, "sstk3", "v6");
+
+ rocksdb_ingestexternalfileoptions_destroy(ing_opt);
+ rocksdb_sstfilewriter_destroy(writer);
+ rocksdb_options_destroy(io_options);
+ rocksdb_envoptions_destroy(env_opt);
+
+ // Delete all keys we just ingested
+ rocksdb_delete(db, woptions, "sstk1", 5, &err);
+ CheckNoError(err);
+ rocksdb_delete(db, woptions, "sstk2", 5, &err);
+ CheckNoError(err);
+ rocksdb_delete(db, woptions, "sstk22", 6, &err);
+ CheckNoError(err);
+ rocksdb_delete(db, woptions, "sstk3", 5, &err);
+ CheckNoError(err);
+ }
+
+ StartPhase("writebatch");
+ {
+ rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+ rocksdb_writebatch_put(wb, "foo", 3, "a", 1);
+ rocksdb_writebatch_clear(wb);
+ rocksdb_writebatch_put(wb, "bar", 3, "b", 1);
+ rocksdb_writebatch_put(wb, "box", 3, "c", 1);
+ rocksdb_writebatch_delete(wb, "bar", 3);
+ rocksdb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", "hello");
+ CheckGet(db, roptions, "bar", NULL);
+ CheckGet(db, roptions, "box", "c");
+ int pos = 0;
+ rocksdb_writebatch_iterate(wb, &pos, CheckPut, CheckDel);
+ CheckCondition(pos == 3);
+ rocksdb_writebatch_clear(wb);
+ rocksdb_writebatch_put(wb, "bar", 3, "b", 1);
+ rocksdb_writebatch_put(wb, "bay", 3, "d", 1);
+ rocksdb_writebatch_delete_range(wb, "bar", 3, "bay", 3);
+ rocksdb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "bar", NULL);
+ CheckGet(db, roptions, "bay", "d");
+ rocksdb_writebatch_clear(wb);
+ const char* start_list[1] = {"bay"};
+ const size_t start_sizes[1] = {3};
+ const char* end_list[1] = {"baz"};
+ const size_t end_sizes[1] = {3};
+ rocksdb_writebatch_delete_rangev(wb, 1, start_list, start_sizes, end_list,
+ end_sizes);
+ rocksdb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "bay", NULL);
+ rocksdb_writebatch_destroy(wb);
+ }
+
+ StartPhase("writebatch_vectors");
+ {
+ rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+ const char* k_list[2] = { "z", "ap" };
+ const size_t k_sizes[2] = { 1, 2 };
+ const char* v_list[3] = { "x", "y", "z" };
+ const size_t v_sizes[3] = { 1, 1, 1 };
+ rocksdb_writebatch_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes);
+ rocksdb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "zap", "xyz");
+ rocksdb_writebatch_delete(wb, "zap", 3);
+ rocksdb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "zap", NULL);
+ rocksdb_writebatch_destroy(wb);
+ }
+
+ StartPhase("writebatch_savepoint");
+ {
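+ // Two save points are set; popping removes the newer one, and rolling back
+ // to the remaining save point discards the putv, so "zap" must not be written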
+ rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+ rocksdb_writebatch_set_save_point(wb);
+ rocksdb_writebatch_set_save_point(wb);
+ const char* k_list[2] = {"z", "ap"};
+ const size_t k_sizes[2] = {1, 2};
+ const char* v_list[3] = {"x", "y", "z"};
+ const size_t v_sizes[3] = {1, 1, 1};
+ rocksdb_writebatch_pop_save_point(wb, &err);
+ CheckNoError(err);
+ rocksdb_writebatch_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes);
+ rocksdb_writebatch_rollback_to_save_point(wb, &err);
+ CheckNoError(err);
+ rocksdb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "zap", NULL);
+ rocksdb_writebatch_destroy(wb);
+ }
+
+ StartPhase("writebatch_rep");
+ {
+ rocksdb_writebatch_t* wb1 = rocksdb_writebatch_create();
+ rocksdb_writebatch_put(wb1, "baz", 3, "d", 1);
+ rocksdb_writebatch_put(wb1, "quux", 4, "e", 1);
+ rocksdb_writebatch_delete(wb1, "quux", 4);
+ size_t repsize1 = 0;
+ const char* rep = rocksdb_writebatch_data(wb1, &repsize1);
+ rocksdb_writebatch_t* wb2 = rocksdb_writebatch_create_from(rep, repsize1);
+ CheckCondition(rocksdb_writebatch_count(wb1) ==
+ rocksdb_writebatch_count(wb2));
+ size_t repsize2 = 0;
+ CheckCondition(
+ memcmp(rep, rocksdb_writebatch_data(wb2, &repsize2), repsize1) == 0);
+ rocksdb_writebatch_destroy(wb1);
+ rocksdb_writebatch_destroy(wb2);
+ }
+
+ StartPhase("writebatch_wi");
+ {
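+ // WriteBatchWithIndex: get_from_batch reads only the batch, while
+ // get_from_batch_and_db also consults the database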
+ rocksdb_writebatch_wi_t* wbi = rocksdb_writebatch_wi_create(0, 1);
+ rocksdb_writebatch_wi_put(wbi, "foo", 3, "a", 1);
+ rocksdb_writebatch_wi_clear(wbi);
+ rocksdb_writebatch_wi_put(wbi, "bar", 3, "b", 1);
+ rocksdb_writebatch_wi_put(wbi, "box", 3, "c", 1);
+ rocksdb_writebatch_wi_delete(wbi, "bar", 3);
+ int count = rocksdb_writebatch_wi_count(wbi);
+ CheckCondition(count == 3);
+ size_t size;
+ char* value;
+ value = rocksdb_writebatch_wi_get_from_batch(wbi, options, "box", 3, &size, &err);
+ CheckValue(err, "c", &value, size);
+ value = rocksdb_writebatch_wi_get_from_batch(wbi, options, "bar", 3, &size, &err);
+ CheckValue(err, NULL, &value, size);
+ value = rocksdb_writebatch_wi_get_from_batch_and_db(wbi, db, roptions, "foo", 3, &size, &err);
+ CheckValue(err, "hello", &value, size);
+ value = rocksdb_writebatch_wi_get_from_batch_and_db(wbi, db, roptions, "box", 3, &size, &err);
+ CheckValue(err, "c", &value, size);
+ rocksdb_write_writebatch_wi(db, woptions, wbi, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", "hello");
+ CheckGet(db, roptions, "bar", NULL);
+ CheckGet(db, roptions, "box", "c");
+ int pos = 0;
+ rocksdb_writebatch_wi_iterate(wbi, &pos, CheckPut, CheckDel);
+ CheckCondition(pos == 3);
+ rocksdb_writebatch_wi_clear(wbi);
+ rocksdb_writebatch_wi_destroy(wbi);
+ }
+
+ StartPhase("writebatch_wi_vectors");
+ {
+ rocksdb_writebatch_wi_t* wb = rocksdb_writebatch_wi_create(0, 1);
+ const char* k_list[2] = { "z", "ap" };
+ const size_t k_sizes[2] = { 1, 2 };
+ const char* v_list[3] = { "x", "y", "z" };
+ const size_t v_sizes[3] = { 1, 1, 1 };
+ rocksdb_writebatch_wi_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes);
+ rocksdb_write_writebatch_wi(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "zap", "xyz");
+ rocksdb_writebatch_wi_delete(wb, "zap", 3);
+ rocksdb_write_writebatch_wi(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "zap", NULL);
+ rocksdb_writebatch_wi_destroy(wb);
+ }
+
+ StartPhase("writebatch_wi_savepoint");
+ {
+ rocksdb_writebatch_wi_t* wb = rocksdb_writebatch_wi_create(0, 1);
+ rocksdb_writebatch_wi_set_save_point(wb);
+ const char* k_list[2] = {"z", "ap"};
+ const size_t k_sizes[2] = {1, 2};
+ const char* v_list[3] = {"x", "y", "z"};
+ const size_t v_sizes[3] = {1, 1, 1};
+ rocksdb_writebatch_wi_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes);
+ rocksdb_writebatch_wi_rollback_to_save_point(wb, &err);
+ CheckNoError(err);
+ rocksdb_write_writebatch_wi(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "zap", NULL);
+ rocksdb_writebatch_wi_destroy(wb);
+ }
+
+ StartPhase("iter");
+ {
+ rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_first(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_next(iter);
+ CheckIter(iter, "foo", "hello");
+ rocksdb_iter_prev(iter);
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_prev(iter);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_last(iter);
+ CheckIter(iter, "foo", "hello");
+ rocksdb_iter_seek(iter, "b", 1);
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_seek_for_prev(iter, "g", 1);
+ CheckIter(iter, "foo", "hello");
+ rocksdb_iter_seek_for_prev(iter, "box", 3);
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ rocksdb_iter_destroy(iter);
+ }
+
+ StartPhase("wbwi_iter");
+ {
+ rocksdb_iterator_t* base_iter = rocksdb_create_iterator(db, roptions);
+ rocksdb_writebatch_wi_t* wbi = rocksdb_writebatch_wi_create(0, 1);
+ rocksdb_writebatch_wi_put(wbi, "bar", 3, "b", 1);
+ rocksdb_writebatch_wi_delete(wbi, "foo", 3);
+ rocksdb_iterator_t* iter =
+ rocksdb_writebatch_wi_create_iterator_with_base(wbi, base_iter);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_first(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "bar", "b");
+ rocksdb_iter_next(iter);
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_prev(iter);
+ CheckIter(iter, "bar", "b");
+ rocksdb_iter_prev(iter);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_last(iter);
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_seek(iter, "b", 1);
+ CheckIter(iter, "bar", "b");
+ rocksdb_iter_seek_for_prev(iter, "c", 1);
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_seek_for_prev(iter, "box", 3);
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ rocksdb_iter_destroy(iter);
+ rocksdb_writebatch_wi_destroy(wbi);
+ }
+
+ StartPhase("multiget");
+ {
+ const char* keys[3] = { "box", "foo", "notfound" };
+ const size_t keys_sizes[3] = { 3, 3, 8 };
+ char* vals[3];
+ size_t vals_sizes[3];
+ char* errs[3];
+ rocksdb_multi_get(db, roptions, 3, keys, keys_sizes, vals, vals_sizes, errs);
+
+ int i;
+ for (i = 0; i < 3; i++) {
+ CheckEqual(NULL, errs[i], 0);
+ switch (i) {
+ case 0:
+ CheckEqual("c", vals[i], vals_sizes[i]);
+ break;
+ case 1:
+ CheckEqual("hello", vals[i], vals_sizes[i]);
+ break;
+ case 2:
+ CheckEqual(NULL, vals[i], vals_sizes[i]);
+ break;
+ }
+ Free(&vals[i]);
+ }
+ }
+
+ StartPhase("pin_get");
+ {
+ CheckPinGet(db, roptions, "box", "c");
+ CheckPinGet(db, roptions, "foo", "hello");
+ CheckPinGet(db, roptions, "notfound", NULL);
+ }
+
+ StartPhase("approximate_sizes");
+ {
+ int i;
+ int n = 20000;
+ char keybuf[100];
+ char valbuf[100];
+ uint64_t sizes[2];
+ const char* start[2] = { "a", "k00000000000000010000" };
+ size_t start_len[2] = { 1, 21 };
+ const char* limit[2] = { "k00000000000000010000", "z" };
+ size_t limit_len[2] = { 21, 1 };
+ rocksdb_writeoptions_set_sync(woptions, 0);
+ for (i = 0; i < n; i++) {
+ snprintf(keybuf, sizeof(keybuf), "k%020d", i);
+ snprintf(valbuf, sizeof(valbuf), "v%020d", i);
+ rocksdb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf),
+ &err);
+ CheckNoError(err);
+ }
+ rocksdb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes);
+ CheckCondition(sizes[0] > 0);
+ CheckCondition(sizes[1] > 0);
+ }
+
+ StartPhase("property");
+ {
+ char* prop = rocksdb_property_value(db, "nosuchprop");
+ CheckCondition(prop == NULL);
+ prop = rocksdb_property_value(db, "rocksdb.stats");
+ CheckCondition(prop != NULL);
+ Free(&prop);
+ }
+
+ StartPhase("snapshot");
+ {
+ const rocksdb_snapshot_t* snap;
+ snap = rocksdb_create_snapshot(db);
+ rocksdb_delete(db, woptions, "foo", 3, &err);
+ CheckNoError(err);
+ rocksdb_readoptions_set_snapshot(roptions, snap);
+ CheckGet(db, roptions, "foo", "hello");
+ rocksdb_readoptions_set_snapshot(roptions, NULL);
+ CheckGet(db, roptions, "foo", NULL);
+ rocksdb_release_snapshot(db, snap);
+ }
+
+ StartPhase("repair");
+ {
+ // If we do not compact here, then the lazy deletion of
+ // files (https://reviews.facebook.net/D6123) would leave
+ // around deleted files and the repair process will find
+ // those files and put them back into the database.
+ rocksdb_compact_range(db, NULL, 0, NULL, 0);
+ rocksdb_close(db);
+ rocksdb_options_set_create_if_missing(options, 0);
+ rocksdb_options_set_error_if_exists(options, 0);
+ rocksdb_options_set_wal_recovery_mode(options, 2);
+ rocksdb_repair_db(options, dbname, &err);
+ CheckNoError(err);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", NULL);
+ CheckGet(db, roptions, "bar", NULL);
+ CheckGet(db, roptions, "box", "c");
+ rocksdb_options_set_create_if_missing(options, 1);
+ rocksdb_options_set_error_if_exists(options, 1);
+ }
+
+ StartPhase("filter");
+ for (run = 0; run <= 2; run++) {
+ // First run uses custom filter
+ // Second run uses old block-based bloom filter
+ // Third run uses full bloom filter
+ CheckNoError(err);
+ rocksdb_filterpolicy_t* policy;
+ if (run == 0) {
+ policy = rocksdb_filterpolicy_create(NULL, FilterDestroy, FilterCreate,
+ FilterKeyMatch, NULL, FilterName);
+ } else if (run == 1) {
+ policy = rocksdb_filterpolicy_create_bloom(8);
+ } else {
+ policy = rocksdb_filterpolicy_create_bloom_full(8);
+ }
+ rocksdb_block_based_options_set_filter_policy(table_options, policy);
+
+ // Create new database
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ rocksdb_options_set_block_based_table_factory(options, table_options);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
+ CheckNoError(err);
+
+ {
+ // Add enough keys to get just one reasonably populated Bloom filter
+ const int keys_to_add = 1500;
+ int i;
+ char keybuf[100];
+ for (i = 0; i < keys_to_add; i++) {
+ snprintf(keybuf, sizeof(keybuf), "yes%020d", i);
+ rocksdb_put(db, woptions, keybuf, strlen(keybuf), "val", 3, &err);
+ CheckNoError(err);
+ }
+ }
+ rocksdb_compact_range(db, NULL, 0, NULL, 0);
+
+ fake_filter_result = 1;
+ CheckGet(db, roptions, "foo", "foovalue");
+ CheckGet(db, roptions, "bar", "barvalue");
+ if (run == 0) {
+ // Must not find value when custom filter returns false
+ fake_filter_result = 0;
+ CheckGet(db, roptions, "foo", NULL);
+ CheckGet(db, roptions, "bar", NULL);
+ fake_filter_result = 1;
+
+ CheckGet(db, roptions, "foo", "foovalue");
+ CheckGet(db, roptions, "bar", "barvalue");
+ }
+
+ {
+ // Query some keys not added to identify Bloom filter implementation
+ // from false positive queries, using perfcontext to detect Bloom
+ // filter behavior
+ rocksdb_perfcontext_t* perf = rocksdb_perfcontext_create();
+ rocksdb_perfcontext_reset(perf);
+
+ const int keys_to_query = 10000;
+ int i;
+ char keybuf[100];
+ for (i = 0; i < keys_to_query; i++) {
+ fake_filter_result = i % 2;
+ snprintf(keybuf, sizeof(keybuf), "no%020d", i);
+ CheckGet(db, roptions, keybuf, NULL);
+ }
+
+ const int hits =
+ (int)rocksdb_perfcontext_metric(perf, rocksdb_bloom_sst_hit_count);
+ if (run == 0) {
+ // The fake filter alternates true/false, so half the queries count as hits
+ CheckCondition(hits == keys_to_query / 2);
+ } else if (run == 1) {
+ // Essentially a fingerprint of the block-based Bloom schema
+ CheckCondition(hits == 241);
+ } else {
+ // Essentially a fingerprint of the full Bloom schema(s),
+ // format_version < 5, which vary for three different CACHE_LINE_SIZEs
+ CheckCondition(hits == 224 || hits == 180 || hits == 125);
+ }
+ CheckCondition(
+ (keys_to_query - hits) ==
+ (int)rocksdb_perfcontext_metric(perf, rocksdb_bloom_sst_miss_count));
+
+ rocksdb_perfcontext_destroy(perf);
+ }
+
+ // Reset the policy
+ rocksdb_block_based_options_set_filter_policy(table_options, NULL);
+ rocksdb_options_set_block_based_table_factory(options, table_options);
+ }
+
+ StartPhase("compaction_filter");
+ {
+ rocksdb_options_t* options_with_filter = rocksdb_options_create();
+ rocksdb_options_set_create_if_missing(options_with_filter, 1);
+ rocksdb_compactionfilter_t* cfilter;
+ cfilter = rocksdb_compactionfilter_create(NULL, CFilterDestroy,
+ CFilterFilter, CFilterName);
+ // Create new database
+ rocksdb_close(db);
+ rocksdb_destroy_db(options_with_filter, dbname, &err);
+ rocksdb_options_set_compaction_filter(options_with_filter, cfilter);
+ db = CheckCompaction(db, options_with_filter, roptions, woptions);
+
+ rocksdb_options_set_compaction_filter(options_with_filter, NULL);
+ rocksdb_compactionfilter_destroy(cfilter);
+ rocksdb_options_destroy(options_with_filter);
+ }
+
+ StartPhase("compaction_filter_factory");
+ {
+ rocksdb_options_t* options_with_filter_factory = rocksdb_options_create();
+ rocksdb_options_set_create_if_missing(options_with_filter_factory, 1);
+ rocksdb_compactionfilterfactory_t* factory;
+ factory = rocksdb_compactionfilterfactory_create(
+ NULL, CFilterFactoryDestroy, CFilterCreate, CFilterFactoryName);
+ // Create new database
+ rocksdb_close(db);
+ rocksdb_destroy_db(options_with_filter_factory, dbname, &err);
+ rocksdb_options_set_compaction_filter_factory(options_with_filter_factory,
+ factory);
+ db = CheckCompaction(db, options_with_filter_factory, roptions, woptions);
+
+ rocksdb_options_set_compaction_filter_factory(
+ options_with_filter_factory, NULL);
+ rocksdb_options_destroy(options_with_filter_factory);
+ }
+
+ StartPhase("merge_operator");
+ {
+ rocksdb_mergeoperator_t* merge_operator;
+ merge_operator = rocksdb_mergeoperator_create(
+ NULL, MergeOperatorDestroy, MergeOperatorFullMerge,
+ MergeOperatorPartialMerge, NULL, MergeOperatorName);
+ // Create new database
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ rocksdb_options_set_merge_operator(options, merge_operator);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", "foovalue");
+ rocksdb_merge(db, woptions, "foo", 3, "barvalue", 8, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", "fake");
+
+ // Merge of a non-existing value
+ rocksdb_merge(db, woptions, "bar", 3, "barvalue", 8, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "bar", "fake");
+
+ }
+
+ StartPhase("columnfamilies");
+ {
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_options_t* db_options = rocksdb_options_create();
+ rocksdb_options_set_create_if_missing(db_options, 1);
+ db = rocksdb_open(db_options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_column_family_handle_t* cfh;
+ cfh = rocksdb_create_column_family(db, db_options, "cf1", &err);
+ rocksdb_column_family_handle_destroy(cfh);
+ CheckNoError(err);
+ rocksdb_close(db);
+
+ size_t cflen;
+ char** column_fams = rocksdb_list_column_families(db_options, dbname, &cflen, &err);
+ CheckNoError(err);
+ CheckEqual("default", column_fams[0], 7);
+ CheckEqual("cf1", column_fams[1], 3);
+ CheckCondition(cflen == 2);
+ rocksdb_list_column_families_destroy(column_fams, cflen);
+
+ rocksdb_options_t* cf_options = rocksdb_options_create();
+
+ const char* cf_names[2] = {"default", "cf1"};
+ const rocksdb_options_t* cf_opts[2] = {cf_options, cf_options};
+ rocksdb_column_family_handle_t* handles[2];
+ db = rocksdb_open_column_families(db_options, dbname, 2, cf_names, cf_opts, handles, &err);
+ CheckNoError(err);
+
+ rocksdb_put_cf(db, woptions, handles[1], "foo", 3, "hello", 5, &err);
+ CheckNoError(err);
+
+ rocksdb_put_cf(db, woptions, handles[1], "foobar1", 7, "hello1", 6, &err);
+ CheckNoError(err);
+ rocksdb_put_cf(db, woptions, handles[1], "foobar2", 7, "hello2", 6, &err);
+ CheckNoError(err);
+ rocksdb_put_cf(db, woptions, handles[1], "foobar3", 7, "hello3", 6, &err);
+ CheckNoError(err);
+ rocksdb_put_cf(db, woptions, handles[1], "foobar4", 7, "hello4", 6, &err);
+ CheckNoError(err);
+
+ rocksdb_flushoptions_t *flush_options = rocksdb_flushoptions_create();
+ rocksdb_flushoptions_set_wait(flush_options, 1);
+ rocksdb_flush_cf(db, flush_options, handles[1], &err);
+ CheckNoError(err);
+ rocksdb_flushoptions_destroy(flush_options);
+
+ CheckGetCF(db, roptions, handles[1], "foo", "hello");
+ CheckPinGetCF(db, roptions, handles[1], "foo", "hello");
+
+ rocksdb_delete_cf(db, woptions, handles[1], "foo", 3, &err);
+ CheckNoError(err);
+
+ rocksdb_delete_range_cf(db, woptions, handles[1], "foobar2", 7, "foobar4",
+ 7, &err);
+ CheckNoError(err);
+
+ CheckGetCF(db, roptions, handles[1], "foo", NULL);
+ CheckPinGetCF(db, roptions, handles[1], "foo", NULL);
+
+ rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+ rocksdb_writebatch_put_cf(wb, handles[1], "baz", 3, "a", 1);
+ rocksdb_writebatch_clear(wb);
+ rocksdb_writebatch_put_cf(wb, handles[1], "bar", 3, "b", 1);
+ rocksdb_writebatch_put_cf(wb, handles[1], "box", 3, "c", 1);
+ rocksdb_writebatch_delete_cf(wb, handles[1], "bar", 3);
+ rocksdb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGetCF(db, roptions, handles[1], "baz", NULL);
+ CheckGetCF(db, roptions, handles[1], "bar", NULL);
+ CheckGetCF(db, roptions, handles[1], "box", "c");
+ CheckPinGetCF(db, roptions, handles[1], "baz", NULL);
+ CheckPinGetCF(db, roptions, handles[1], "bar", NULL);
+ CheckPinGetCF(db, roptions, handles[1], "box", "c");
+ rocksdb_writebatch_destroy(wb);
+
+ const char* keys[3] = { "box", "box", "barfooxx" };
+ const rocksdb_column_family_handle_t* get_handles[3] = { handles[0], handles[1], handles[1] };
+ const size_t keys_sizes[3] = { 3, 3, 8 };
+ char* vals[3];
+ size_t vals_sizes[3];
+ char* errs[3];
+ rocksdb_multi_get_cf(db, roptions, get_handles, 3, keys, keys_sizes, vals, vals_sizes, errs);
+
+ int i;
+ for (i = 0; i < 3; i++) {
+ CheckEqual(NULL, errs[i], 0);
+ switch (i) {
+ case 0:
+ CheckEqual(NULL, vals[i], vals_sizes[i]); // wrong cf
+ break;
+ case 1:
+ CheckEqual("c", vals[i], vals_sizes[i]); // bingo
+ break;
+ case 2:
+ CheckEqual(NULL, vals[i], vals_sizes[i]); // normal not found
+ break;
+ }
+ Free(&vals[i]);
+ }
+
+ rocksdb_iterator_t* iter = rocksdb_create_iterator_cf(db, roptions, handles[1]);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_first(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+
+ for (i = 0; rocksdb_iter_valid(iter) != 0; rocksdb_iter_next(iter)) {
+ i++;
+ }
+ CheckCondition(i == 3);
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ rocksdb_iter_destroy(iter);
+
+ rocksdb_column_family_handle_t* iters_cf_handles[2] = { handles[0], handles[1] };
+ rocksdb_iterator_t* iters_handles[2];
+ rocksdb_create_iterators(db, roptions, iters_cf_handles, iters_handles, 2, &err);
+ CheckNoError(err);
+
+ iter = iters_handles[0];
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_first(iter);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_destroy(iter);
+
+ iter = iters_handles[1];
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_first(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+
+ for (i = 0; rocksdb_iter_valid(iter) != 0; rocksdb_iter_next(iter)) {
+ i++;
+ }
+ CheckCondition(i == 3);
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ rocksdb_iter_destroy(iter);
+
+ rocksdb_drop_column_family(db, handles[1], &err);
+ CheckNoError(err);
+ for (i = 0; i < 2; i++) {
+ rocksdb_column_family_handle_destroy(handles[i]);
+ }
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ rocksdb_options_destroy(db_options);
+ rocksdb_options_destroy(cf_options);
+ }
+
+ StartPhase("prefix");
+ {
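+ // Prefix seek with a fixed-prefix extractor, hash-skiplist memtable and
+ // plain-table SST format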
+ // Create new database
+ rocksdb_options_set_allow_mmap_reads(options, 1);
+ rocksdb_options_set_prefix_extractor(options, rocksdb_slicetransform_create_fixed_prefix(3));
+ rocksdb_options_set_hash_skip_list_rep(options, 5000, 4, 4);
+ rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16);
+ rocksdb_options_set_allow_concurrent_memtable_write(options, 0);
+
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_put(db, woptions, "foo1", 4, "foo", 3, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "foo2", 4, "foo", 3, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "foo3", 4, "foo", 3, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "bar1", 4, "bar", 3, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "bar2", 4, "bar", 3, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "bar3", 4, "bar", 3, &err);
+ CheckNoError(err);
+
+ rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+ CheckCondition(!rocksdb_iter_valid(iter));
+
+ rocksdb_iter_seek(iter, "bar", 3);
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ CheckCondition(rocksdb_iter_valid(iter));
+
+ CheckIter(iter, "bar1", "bar");
+ rocksdb_iter_next(iter);
+ CheckIter(iter, "bar2", "bar");
+ rocksdb_iter_next(iter);
+ CheckIter(iter, "bar3", "bar");
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ rocksdb_iter_destroy(iter);
+
+ rocksdb_readoptions_set_total_order_seek(roptions, 1);
+ iter = rocksdb_create_iterator(db, roptions);
+ CheckCondition(!rocksdb_iter_valid(iter));
+
+ rocksdb_iter_seek(iter, "ba", 2);
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "bar1", "bar");
+
+ rocksdb_iter_destroy(iter);
+ rocksdb_readoptions_set_total_order_seek(roptions, 0);
+
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ }
+
+ // Check memory usage stats
+ StartPhase("approximate_memory_usage");
+ {
+ // Create database
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_memory_consumers_t* consumers;
+ consumers = rocksdb_memory_consumers_create();
+ rocksdb_memory_consumers_add_db(consumers, db);
+ rocksdb_memory_consumers_add_cache(consumers, cache);
+
+ // take memory usage report before write-read operation
+ rocksdb_memory_usage_t* mu1;
+ mu1 = rocksdb_approximate_memory_usage_create(consumers, &err);
+ CheckNoError(err);
+
+ // Put data (this should affect memtables)
+ rocksdb_put(db, woptions, "memory", 6, "test", 4, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "memory", "test");
+
+ // take memory usage report after write-read operation
+ rocksdb_memory_usage_t* mu2;
+ mu2 = rocksdb_approximate_memory_usage_create(consumers, &err);
+ CheckNoError(err);
+
+ // amount of memory used within memtables should grow
+ CheckCondition(rocksdb_approximate_memory_usage_get_mem_table_total(mu2) >=
+ rocksdb_approximate_memory_usage_get_mem_table_total(mu1));
+ CheckCondition(rocksdb_approximate_memory_usage_get_mem_table_unflushed(mu2) >=
+ rocksdb_approximate_memory_usage_get_mem_table_unflushed(mu1));
+
+ rocksdb_memory_consumers_destroy(consumers);
+ rocksdb_approximate_memory_usage_destroy(mu1);
+ rocksdb_approximate_memory_usage_destroy(mu2);
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+ }
+
+ StartPhase("cuckoo_options");
+ {
+ rocksdb_cuckoo_table_options_t* cuckoo_options;
+ cuckoo_options = rocksdb_cuckoo_options_create();
+ rocksdb_cuckoo_options_set_hash_ratio(cuckoo_options, 0.5);
+ rocksdb_cuckoo_options_set_max_search_depth(cuckoo_options, 200);
+ rocksdb_cuckoo_options_set_cuckoo_block_size(cuckoo_options, 10);
+ rocksdb_cuckoo_options_set_identity_as_first_hash(cuckoo_options, 1);
+ rocksdb_cuckoo_options_set_use_module_hash(cuckoo_options, 0);
+ rocksdb_options_set_cuckoo_table_factory(options, cuckoo_options);
+
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_cuckoo_options_destroy(cuckoo_options);
+ }
+
+ StartPhase("iterate_upper_bound");
+ {
+ // Create new empty database
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_options_set_prefix_extractor(options, NULL);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_put(db, woptions, "a", 1, "0", 1, &err); CheckNoError(err);
+ rocksdb_put(db, woptions, "foo", 3, "bar", 3, &err); CheckNoError(err);
+ rocksdb_put(db, woptions, "foo1", 4, "bar1", 4, &err); CheckNoError(err);
+ rocksdb_put(db, woptions, "g1", 2, "0", 1, &err); CheckNoError(err);
+
+ // testing basic case with no iterate_upper_bound and no prefix_extractor
+ {
+ rocksdb_readoptions_set_iterate_upper_bound(roptions, NULL, 0);
+ rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+
+ rocksdb_iter_seek(iter, "foo", 3);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "foo", "bar");
+
+ rocksdb_iter_next(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "foo1", "bar1");
+
+ rocksdb_iter_next(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "g1", "0");
+
+ rocksdb_iter_destroy(iter);
+ }
+
+ // testing iterate_upper_bound and forward iterator
+ // to make sure it stops at bound
+ {
+ // iterate_upper_bound points beyond the last expected entry
+ rocksdb_readoptions_set_iterate_upper_bound(roptions, "foo2", 4);
+
+ rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+
+ rocksdb_iter_seek(iter, "foo", 3);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "foo", "bar");
+
+ rocksdb_iter_next(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "foo1", "bar1");
+
+ rocksdb_iter_next(iter);
+ // should stop here...
+ CheckCondition(!rocksdb_iter_valid(iter));
+
+ rocksdb_iter_destroy(iter);
+ rocksdb_readoptions_set_iterate_upper_bound(roptions, NULL, 0);
+ }
+ }
+
+ StartPhase("transactions");
+ {
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+
+ // open a TransactionDB
+ txn_db_options = rocksdb_transactiondb_options_create();
+ txn_options = rocksdb_transaction_options_create();
+ rocksdb_options_set_create_if_missing(options, 1);
+ txn_db = rocksdb_transactiondb_open(options, txn_db_options, dbname, &err);
+ CheckNoError(err);
+
+ // put outside a transaction
+ rocksdb_transactiondb_put(txn_db, woptions, "foo", 3, "hello", 5, &err);
+ CheckNoError(err);
+ CheckTxnDBGet(txn_db, roptions, "foo", "hello");
+
+ // delete from outside transaction
+ rocksdb_transactiondb_delete(txn_db, woptions, "foo", 3, &err);
+ CheckNoError(err);
+ CheckTxnDBGet(txn_db, roptions, "foo", NULL);
+
+ // write batch into TransactionDB
+ rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+ rocksdb_writebatch_put(wb, "foo", 3, "a", 1);
+ rocksdb_writebatch_clear(wb);
+ rocksdb_writebatch_put(wb, "bar", 3, "b", 1);
+ rocksdb_writebatch_put(wb, "box", 3, "c", 1);
+ rocksdb_writebatch_delete(wb, "bar", 3);
+ rocksdb_transactiondb_write(txn_db, woptions, wb, &err);
+ rocksdb_writebatch_destroy(wb);
+ CheckTxnDBGet(txn_db, roptions, "box", "c");
+ CheckNoError(err);
+
+ // begin a transaction
+ txn = rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL);
+ // put
+ rocksdb_transaction_put(txn, "foo", 3, "hello", 5, &err);
+ CheckNoError(err);
+ CheckTxnGet(txn, roptions, "foo", "hello");
+ // delete
+ rocksdb_transaction_delete(txn, "foo", 3, &err);
+ CheckNoError(err);
+ CheckTxnGet(txn, roptions, "foo", NULL);
+
+ rocksdb_transaction_put(txn, "foo", 3, "hello", 5, &err);
+ CheckNoError(err);
+
+ // read from outside transaction, before commit
+ CheckTxnDBGet(txn_db, roptions, "foo", NULL);
+
+ // commit
+ rocksdb_transaction_commit(txn, &err);
+ CheckNoError(err);
+
+ // read from outside transaction, after commit
+ CheckTxnDBGet(txn_db, roptions, "foo", "hello");
+
+ // reuse old transaction
+ txn = rocksdb_transaction_begin(txn_db, woptions, txn_options, txn);
+
+ // snapshot
+ const rocksdb_snapshot_t* snapshot;
+ snapshot = rocksdb_transactiondb_create_snapshot(txn_db);
+ rocksdb_readoptions_set_snapshot(roptions, snapshot);
+
+ rocksdb_transactiondb_put(txn_db, woptions, "foo", 3, "hey", 3, &err);
+ CheckNoError(err);
+
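+ // roptions still uses the snapshot taken before the "hey" write, so the
+ // read below should see the older value "hello".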
+ CheckTxnDBGet(txn_db, roptions, "foo", "hello");
+ rocksdb_readoptions_set_snapshot(roptions, NULL);
+ rocksdb_transactiondb_release_snapshot(txn_db, snapshot);
+ CheckTxnDBGet(txn_db, roptions, "foo", "hey");
+
+ // iterate
+ rocksdb_transaction_put(txn, "bar", 3, "hi", 2, &err);
+ rocksdb_iterator_t* iter = rocksdb_transaction_create_iterator(txn, roptions);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_first(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "bar", "hi");
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ rocksdb_iter_destroy(iter);
+
+ // rollback
+ rocksdb_transaction_rollback(txn, &err);
+ CheckNoError(err);
+ CheckTxnDBGet(txn_db, roptions, "bar", NULL);
+
+ // save point
+ rocksdb_transaction_put(txn, "foo1", 4, "hi1", 3, &err);
+ rocksdb_transaction_set_savepoint(txn);
+ CheckTxnGet(txn, roptions, "foo1", "hi1");
+ rocksdb_transaction_put(txn, "foo2", 4, "hi2", 3, &err);
+ CheckTxnGet(txn, roptions, "foo2", "hi2");
+
+ // rollback to savepoint
+ rocksdb_transaction_rollback_to_savepoint(txn, &err);
+ CheckNoError(err);
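+ // "foo2" was written after the savepoint, so it is rolled back; "foo1",
+ // written before it, survives. Neither is visible outside the transaction
+ // until the commit below.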
+ CheckTxnGet(txn, roptions, "foo2", NULL);
+ CheckTxnGet(txn, roptions, "foo1", "hi1");
+ CheckTxnDBGet(txn_db, roptions, "foo1", NULL);
+ CheckTxnDBGet(txn_db, roptions, "foo2", NULL);
+ rocksdb_transaction_commit(txn, &err);
+ CheckNoError(err);
+ CheckTxnDBGet(txn_db, roptions, "foo1", "hi1");
+ CheckTxnDBGet(txn_db, roptions, "foo2", NULL);
+
+ // Column families.
+ rocksdb_column_family_handle_t* cfh;
+ cfh = rocksdb_transactiondb_create_column_family(txn_db, options,
+ "txn_db_cf", &err);
+ CheckNoError(err);
+
+ rocksdb_transactiondb_put_cf(txn_db, woptions, cfh, "cf_foo", 6, "cf_hello",
+ 8, &err);
+ CheckNoError(err);
+ CheckTxnDBGetCF(txn_db, roptions, cfh, "cf_foo", "cf_hello");
+
+ rocksdb_transactiondb_delete_cf(txn_db, woptions, cfh, "cf_foo", 6, &err);
+ CheckNoError(err);
+ CheckTxnDBGetCF(txn_db, roptions, cfh, "cf_foo", NULL);
+
+ rocksdb_column_family_handle_destroy(cfh);
+
+ // close and destroy
+ rocksdb_transaction_destroy(txn);
+ rocksdb_transactiondb_close(txn_db);
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_transaction_options_destroy(txn_options);
+ rocksdb_transactiondb_options_destroy(txn_db_options);
+ }
+
+ StartPhase("optimistic_transactions");
+ {
+ rocksdb_options_t* db_options = rocksdb_options_create();
+ rocksdb_options_set_create_if_missing(db_options, 1);
+ rocksdb_options_set_allow_concurrent_memtable_write(db_options, 1);
+ otxn_db = rocksdb_optimistictransactiondb_open(db_options, dbname, &err);
+ otxn_options = rocksdb_optimistictransaction_options_create();
+ rocksdb_transaction_t* txn1 = rocksdb_optimistictransaction_begin(
+ otxn_db, woptions, otxn_options, NULL);
+ rocksdb_transaction_t* txn2 = rocksdb_optimistictransaction_begin(
+ otxn_db, woptions, otxn_options, NULL);
+ rocksdb_transaction_put(txn1, "key", 3, "value", 5, &err);
+ CheckNoError(err);
+ rocksdb_transaction_put(txn2, "key1", 4, "value1", 6, &err);
+ CheckNoError(err);
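+ // The two transactions write disjoint keys, so neither commit should
+ // detect a conflict.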
+ CheckTxnGet(txn1, roptions, "key", "value");
+ rocksdb_transaction_commit(txn1, &err);
+ CheckNoError(err);
+ rocksdb_transaction_commit(txn2, &err);
+ CheckNoError(err);
+ rocksdb_transaction_destroy(txn1);
+ rocksdb_transaction_destroy(txn2);
+
+ // Check column family
+ db = rocksdb_optimistictransactiondb_get_base_db(otxn_db);
+ rocksdb_put(db, woptions, "key", 3, "value", 5, &err);
+ CheckNoError(err);
+ rocksdb_column_family_handle_t *cfh1, *cfh2;
+ cfh1 = rocksdb_create_column_family(db, db_options, "txn_db_cf1", &err);
+ cfh2 = rocksdb_create_column_family(db, db_options, "txn_db_cf2", &err);
+ txn = rocksdb_optimistictransaction_begin(otxn_db, woptions, otxn_options,
+ NULL);
+ rocksdb_transaction_put_cf(txn, cfh1, "key_cf1", 7, "val_cf1", 7, &err);
+ CheckNoError(err);
+ rocksdb_transaction_put_cf(txn, cfh2, "key_cf2", 7, "val_cf2", 7, &err);
+ CheckNoError(err);
+ rocksdb_transaction_commit(txn, &err);
+ CheckNoError(err);
+ txn = rocksdb_optimistictransaction_begin(otxn_db, woptions, otxn_options,
+ txn);
+ CheckGetCF(db, roptions, cfh1, "key_cf1", "val_cf1");
+ CheckTxnGetCF(txn, roptions, cfh1, "key_cf1", "val_cf1");
+
+ // Check iterator with column family
+ rocksdb_transaction_put_cf(txn, cfh1, "key1_cf", 7, "val1_cf", 7, &err);
+ CheckNoError(err);
+ rocksdb_iterator_t* iter =
+ rocksdb_transaction_create_iterator_cf(txn, roptions, cfh1);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_first(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "key1_cf", "val1_cf");
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ rocksdb_iter_destroy(iter);
+
+ rocksdb_transaction_destroy(txn);
+ rocksdb_column_family_handle_destroy(cfh1);
+ rocksdb_column_family_handle_destroy(cfh2);
+ rocksdb_optimistictransactiondb_close_base_db(db);
+ rocksdb_optimistictransactiondb_close(otxn_db);
+
+ // Check open optimistic transaction db with column families
+ size_t cf_len;
+ char** column_fams =
+ rocksdb_list_column_families(db_options, dbname, &cf_len, &err);
+ CheckNoError(err);
+ CheckEqual("default", column_fams[0], 7);
+ CheckEqual("txn_db_cf1", column_fams[1], 10);
+ CheckEqual("txn_db_cf2", column_fams[2], 10);
+ CheckCondition(cf_len == 3);
+ rocksdb_list_column_families_destroy(column_fams, cf_len);
+
+ const char* cf_names[3] = {"default", "txn_db_cf1", "txn_db_cf2"};
+ rocksdb_options_t* cf_options = rocksdb_options_create();
+ const rocksdb_options_t* cf_opts[3] = {cf_options, cf_options, cf_options};
+
+ rocksdb_options_set_error_if_exists(cf_options, 0);
+ rocksdb_column_family_handle_t* cf_handles[3];
+ otxn_db = rocksdb_optimistictransactiondb_open_column_families(
+ db_options, dbname, 3, cf_names, cf_opts, cf_handles, &err);
+ CheckNoError(err);
+ rocksdb_transaction_t* txn_cf = rocksdb_optimistictransaction_begin(
+ otxn_db, woptions, otxn_options, NULL);
+ CheckTxnGetCF(txn_cf, roptions, cf_handles[0], "key", "value");
+ CheckTxnGetCF(txn_cf, roptions, cf_handles[1], "key_cf1", "val_cf1");
+ CheckTxnGetCF(txn_cf, roptions, cf_handles[2], "key_cf2", "val_cf2");
+ rocksdb_transaction_destroy(txn_cf);
+ rocksdb_options_destroy(cf_options);
+ rocksdb_column_family_handle_destroy(cf_handles[0]);
+ rocksdb_column_family_handle_destroy(cf_handles[1]);
+ rocksdb_column_family_handle_destroy(cf_handles[2]);
+ rocksdb_optimistictransactiondb_close(otxn_db);
+ rocksdb_destroy_db(db_options, dbname, &err);
+ rocksdb_options_destroy(db_options);
+ rocksdb_optimistictransaction_options_destroy(otxn_options);
+ CheckNoError(err);
+ }
+
+ // Simple sanity check that setting memtable rep works.
+ StartPhase("memtable_reps");
+ {
+ // Create database with vector memtable.
+ rocksdb_options_set_memtable_vector_rep(options);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+
+ // Create database with hash skiplist memtable.
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+
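+ // Arguments below: bucket count (5000), skiplist height (4), and skiplist
+ // branching factor (4).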
+ rocksdb_options_set_hash_skip_list_rep(options, 5000, 4, 4);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ }
+
+ // Check that secondary instance works.
+ StartPhase("open_as_secondary");
+ {
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+
+ rocksdb_options_t* db_options = rocksdb_options_create();
+ rocksdb_options_set_create_if_missing(db_options, 1);
+ db = rocksdb_open(db_options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_t* db1;
+ rocksdb_options_t* opts = rocksdb_options_create();
+ rocksdb_options_set_max_open_files(opts, -1);
+ rocksdb_options_set_create_if_missing(opts, 1);
+ snprintf(secondary_path, sizeof(secondary_path),
+ "%s/rocksdb_c_test_secondary-%d", GetTempDir(), ((int)geteuid()));
+ db1 = rocksdb_open_as_secondary(opts, dbname, secondary_path, &err);
+ CheckNoError(err);
+
+ rocksdb_writeoptions_set_sync(woptions, 0);
+ rocksdb_writeoptions_disable_WAL(woptions, 1);
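+ // With the WAL disabled, this write only becomes visible to the secondary
+ // after the flush below produces an SST file that catch-up can pick up.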
+ rocksdb_put(db, woptions, "key0", 4, "value0", 6, &err);
+ CheckNoError(err);
+ rocksdb_flushoptions_t* flush_opts = rocksdb_flushoptions_create();
+ rocksdb_flushoptions_set_wait(flush_opts, 1);
+ rocksdb_flush(db, flush_opts, &err);
+ CheckNoError(err);
+ rocksdb_try_catch_up_with_primary(db1, &err);
+ CheckNoError(err);
+ rocksdb_readoptions_t* ropts = rocksdb_readoptions_create();
+ rocksdb_readoptions_set_verify_checksums(ropts, 1);
+ rocksdb_readoptions_set_snapshot(ropts, NULL);
+ CheckGet(db, ropts, "key0", "value0");
+ CheckGet(db1, ropts, "key0", "value0");
+
+ rocksdb_writeoptions_disable_WAL(woptions, 0);
+ rocksdb_put(db, woptions, "key1", 4, "value1", 6, &err);
+ CheckNoError(err);
+ rocksdb_try_catch_up_with_primary(db1, &err);
+ CheckNoError(err);
+ CheckGet(db1, ropts, "key0", "value0");
+ CheckGet(db1, ropts, "key1", "value1");
+
+ rocksdb_close(db1);
+ rocksdb_destroy_db(opts, secondary_path, &err);
+ CheckNoError(err);
+
+ rocksdb_options_destroy(db_options);
+ rocksdb_options_destroy(opts);
+ rocksdb_readoptions_destroy(ropts);
+ rocksdb_flushoptions_destroy(flush_opts);
+ }
+
+ // Simple sanity check that setting db_paths through options works.
+ StartPhase("open_db_paths");
+ {
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+
+ const rocksdb_dbpath_t* paths[1] = {dbpath};
+ rocksdb_options_set_db_paths(options, paths, 1);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ }
+
+ StartPhase("cleanup");
+ rocksdb_close(db);
+ rocksdb_options_destroy(options);
+ rocksdb_block_based_options_destroy(table_options);
+ rocksdb_readoptions_destroy(roptions);
+ rocksdb_writeoptions_destroy(woptions);
+ rocksdb_compactoptions_destroy(coptions);
+ rocksdb_cache_destroy(cache);
+ rocksdb_comparator_destroy(cmp);
+ rocksdb_dbpath_destroy(dbpath);
+ rocksdb_env_destroy(env);
+
+ fprintf(stderr, "PASS\n");
+ return 0;
+}
+
+#else
+
+int main() {
+ fprintf(stderr, "SKIPPED\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/column_family.cc b/src/rocksdb/db/column_family.cc
new file mode 100644
index 000000000..928a02a1f
--- /dev/null
+++ b/src/rocksdb/db/column_family.cc
@@ -0,0 +1,1523 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/column_family.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "db/compaction/compaction_picker.h"
+#include "db/compaction/compaction_picker_fifo.h"
+#include "db/compaction/compaction_picker_level.h"
+#include "db/compaction/compaction_picker_universal.h"
+#include "db/db_impl/db_impl.h"
+#include "db/internal_stats.h"
+#include "db/job_context.h"
+#include "db/range_del_aggregator.h"
+#include "db/table_properties_collector.h"
+#include "db/version_set.h"
+#include "db/write_controller.h"
+#include "file/sst_file_manager_impl.h"
+#include "memtable/hash_skiplist_rep.h"
+#include "monitoring/thread_status_util.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/merging_iterator.h"
+#include "util/autovector.h"
+#include "util/compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(
+ ColumnFamilyData* column_family_data, DBImpl* db, InstrumentedMutex* mutex)
+ : cfd_(column_family_data), db_(db), mutex_(mutex) {
+ if (cfd_ != nullptr) {
+ cfd_->Ref();
+ }
+}
+
+ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
+ if (cfd_ != nullptr) {
+#ifndef ROCKSDB_LITE
+ for (auto& listener : cfd_->ioptions()->listeners) {
+ listener->OnColumnFamilyHandleDeletionStarted(this);
+ }
+#endif // ROCKSDB_LITE
+ // Job id == 0 means that this is not our background process, but rather
+ // a user thread.
+ // We need to hold some shared pointers owned by the initial_cf_options
+ // before the final cleanup finishes.
+ ColumnFamilyOptions initial_cf_options_copy = cfd_->initial_cf_options();
+ JobContext job_context(0);
+ mutex_->Lock();
+ bool dropped = cfd_->IsDropped();
+ if (cfd_->UnrefAndTryDelete()) {
+ if (dropped) {
+ db_->FindObsoleteFiles(&job_context, false, true);
+ }
+ }
+ mutex_->Unlock();
+ if (job_context.HaveSomethingToDelete()) {
+ bool defer_purge =
+ db_->immutable_db_options().avoid_unnecessary_blocking_io;
+ db_->PurgeObsoleteFiles(job_context, defer_purge);
+ if (defer_purge) {
+ mutex_->Lock();
+ db_->SchedulePurge();
+ mutex_->Unlock();
+ }
+ }
+ job_context.Clean();
+ }
+}
+
+uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); }
+
+const std::string& ColumnFamilyHandleImpl::GetName() const {
+ return cfd()->GetName();
+}
+
+Status ColumnFamilyHandleImpl::GetDescriptor(ColumnFamilyDescriptor* desc) {
+#ifndef ROCKSDB_LITE
+ // accessing mutable cf-options requires db mutex.
+ InstrumentedMutexLock l(mutex_);
+ *desc = ColumnFamilyDescriptor(cfd()->GetName(), cfd()->GetLatestCFOptions());
+ return Status::OK();
+#else
+ (void)desc;
+ return Status::NotSupported();
+#endif // !ROCKSDB_LITE
+}
+
+const Comparator* ColumnFamilyHandleImpl::GetComparator() const {
+ return cfd()->user_comparator();
+}
+
+void GetIntTblPropCollectorFactory(
+ const ImmutableCFOptions& ioptions,
+ std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+ int_tbl_prop_collector_factories) {
+ auto& collector_factories = ioptions.table_properties_collector_factories;
+ for (size_t i = 0; i < ioptions.table_properties_collector_factories.size();
+ ++i) {
+ assert(collector_factories[i]);
+ int_tbl_prop_collector_factories->emplace_back(
+ new UserKeyTablePropertiesCollectorFactory(collector_factories[i]));
+ }
+}
+
+Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options) {
+ if (!cf_options.compression_per_level.empty()) {
+ for (size_t level = 0; level < cf_options.compression_per_level.size();
+ ++level) {
+ if (!CompressionTypeSupported(cf_options.compression_per_level[level])) {
+ return Status::InvalidArgument(
+ "Compression type " +
+ CompressionTypeToString(cf_options.compression_per_level[level]) +
+ " is not linked with the binary.");
+ }
+ }
+ } else {
+ if (!CompressionTypeSupported(cf_options.compression)) {
+ return Status::InvalidArgument(
+ "Compression type " +
+ CompressionTypeToString(cf_options.compression) +
+ " is not linked with the binary.");
+ }
+ }
+ if (cf_options.compression_opts.zstd_max_train_bytes > 0) {
+ if (!ZSTD_TrainDictionarySupported()) {
+ return Status::InvalidArgument(
+ "zstd dictionary trainer cannot be used because ZSTD 1.1.3+ "
+ "is not linked with the binary.");
+ }
+ if (cf_options.compression_opts.max_dict_bytes == 0) {
+ return Status::InvalidArgument(
+ "The dictionary size limit (`CompressionOptions::max_dict_bytes`) "
+ "should be nonzero if we're using zstd's dictionary generator.");
+ }
+ }
+ return Status::OK();
+}
+
+Status CheckConcurrentWritesSupported(const ColumnFamilyOptions& cf_options) {
+ if (cf_options.inplace_update_support) {
+ return Status::InvalidArgument(
+ "In-place memtable updates (inplace_update_support) is not compatible "
+ "with concurrent writes (allow_concurrent_memtable_write)");
+ }
+ if (!cf_options.memtable_factory->IsInsertConcurrentlySupported()) {
+ return Status::InvalidArgument(
+ "Memtable doesn't concurrent writes (allow_concurrent_memtable_write)");
+ }
+ return Status::OK();
+}
+
+Status CheckCFPathsSupported(const DBOptions& db_options,
+ const ColumnFamilyOptions& cf_options) {
+ // More than one entry in cf_paths is supported only in universal
+ // and level compaction styles. This function also checks the case
+ // in which cf_paths is not specified, which results in db_paths
+ // being used.
+ if ((cf_options.compaction_style != kCompactionStyleUniversal) &&
+ (cf_options.compaction_style != kCompactionStyleLevel)) {
+ if (cf_options.cf_paths.size() > 1) {
+ return Status::NotSupported(
+ "More than one CF paths are only supported in "
+ "universal and level compaction styles. ");
+ } else if (cf_options.cf_paths.empty() &&
+ db_options.db_paths.size() > 1) {
+ return Status::NotSupported(
+ "More than one DB paths are only supported in "
+ "universal and level compaction styles. ");
+ }
+ }
+ return Status::OK();
+}
+
+namespace {
+const uint64_t kDefaultTtl = 0xfffffffffffffffe;
+const uint64_t kDefaultPeriodicCompSecs = 0xfffffffffffffffe;
+}; // namespace
+
+ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
+ const ColumnFamilyOptions& src) {
+ ColumnFamilyOptions result = src;
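+ // Clamp write_buffer_size to at least 64 KB and at most 0xffffffff (~4 GB)
+ // on 32-bit builds, or 64 GB on 64-bit builds.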
+ size_t clamp_max = std::conditional<
+ sizeof(size_t) == 4, std::integral_constant<size_t, 0xffffffff>,
+ std::integral_constant<uint64_t, 64ull << 30>>::type::value;
+ ClipToRange(&result.write_buffer_size, ((size_t)64) << 10, clamp_max);
+ // if the user sets arena_block_size, we trust that value. Otherwise,
+ // calculate a proper value from write_buffer_size.
+ if (result.arena_block_size <= 0) {
+ result.arena_block_size = result.write_buffer_size / 8;
+
+ // Align up to 4k
+ const size_t align = 4 * 1024;
+ result.arena_block_size =
+ ((result.arena_block_size + align - 1) / align) * align;
+ }
+ result.min_write_buffer_number_to_merge =
+ std::min(result.min_write_buffer_number_to_merge,
+ result.max_write_buffer_number - 1);
+ if (result.min_write_buffer_number_to_merge < 1) {
+ result.min_write_buffer_number_to_merge = 1;
+ }
+
+ if (result.num_levels < 1) {
+ result.num_levels = 1;
+ }
+ if (result.compaction_style == kCompactionStyleLevel &&
+ result.num_levels < 2) {
+ result.num_levels = 2;
+ }
+
+ if (result.compaction_style == kCompactionStyleUniversal &&
+ db_options.allow_ingest_behind && result.num_levels < 3) {
+ result.num_levels = 3;
+ }
+
+ if (result.max_write_buffer_number < 2) {
+ result.max_write_buffer_number = 2;
+ }
+ // fall back to max_write_buffer_number_to_maintain if
+ // max_write_buffer_size_to_maintain is not set
+ if (result.max_write_buffer_size_to_maintain < 0) {
+ result.max_write_buffer_size_to_maintain =
+ result.max_write_buffer_number *
+ static_cast<int64_t>(result.write_buffer_size);
+ } else if (result.max_write_buffer_size_to_maintain == 0 &&
+ result.max_write_buffer_number_to_maintain < 0) {
+ result.max_write_buffer_number_to_maintain = result.max_write_buffer_number;
+ }
+ // bloom filter size shouldn't exceed 1/4 of memtable size.
+ if (result.memtable_prefix_bloom_size_ratio > 0.25) {
+ result.memtable_prefix_bloom_size_ratio = 0.25;
+ } else if (result.memtable_prefix_bloom_size_ratio < 0) {
+ result.memtable_prefix_bloom_size_ratio = 0;
+ }
+
+ if (!result.prefix_extractor) {
+ assert(result.memtable_factory);
+ Slice name = result.memtable_factory->Name();
+ if (name.compare("HashSkipListRepFactory") == 0 ||
+ name.compare("HashLinkListRepFactory") == 0) {
+ result.memtable_factory = std::make_shared<SkipListFactory>();
+ }
+ }
+
+ if (result.compaction_style == kCompactionStyleFIFO) {
+ result.num_levels = 1;
+ // since we delete level0 files in FIFO compaction when there are too many
+ // of them, these options don't really mean anything
+ result.level0_slowdown_writes_trigger = std::numeric_limits<int>::max();
+ result.level0_stop_writes_trigger = std::numeric_limits<int>::max();
+ }
+
+ if (result.max_bytes_for_level_multiplier <= 0) {
+ result.max_bytes_for_level_multiplier = 1;
+ }
+
+ if (result.level0_file_num_compaction_trigger == 0) {
+ ROCKS_LOG_WARN(db_options.info_log.get(),
+ "level0_file_num_compaction_trigger cannot be 0");
+ result.level0_file_num_compaction_trigger = 1;
+ }
+
+ if (result.level0_stop_writes_trigger <
+ result.level0_slowdown_writes_trigger ||
+ result.level0_slowdown_writes_trigger <
+ result.level0_file_num_compaction_trigger) {
+ ROCKS_LOG_WARN(db_options.info_log.get(),
+ "This condition must be satisfied: "
+ "level0_stop_writes_trigger(%d) >= "
+ "level0_slowdown_writes_trigger(%d) >= "
+ "level0_file_num_compaction_trigger(%d)",
+ result.level0_stop_writes_trigger,
+ result.level0_slowdown_writes_trigger,
+ result.level0_file_num_compaction_trigger);
+ if (result.level0_slowdown_writes_trigger <
+ result.level0_file_num_compaction_trigger) {
+ result.level0_slowdown_writes_trigger =
+ result.level0_file_num_compaction_trigger;
+ }
+ if (result.level0_stop_writes_trigger <
+ result.level0_slowdown_writes_trigger) {
+ result.level0_stop_writes_trigger = result.level0_slowdown_writes_trigger;
+ }
+ ROCKS_LOG_WARN(db_options.info_log.get(),
+ "Adjust the value to "
+ "level0_stop_writes_trigger(%d)"
+ "level0_slowdown_writes_trigger(%d)"
+ "level0_file_num_compaction_trigger(%d)",
+ result.level0_stop_writes_trigger,
+ result.level0_slowdown_writes_trigger,
+ result.level0_file_num_compaction_trigger);
+ }
+
+ if (result.soft_pending_compaction_bytes_limit == 0) {
+ result.soft_pending_compaction_bytes_limit =
+ result.hard_pending_compaction_bytes_limit;
+ } else if (result.hard_pending_compaction_bytes_limit > 0 &&
+ result.soft_pending_compaction_bytes_limit >
+ result.hard_pending_compaction_bytes_limit) {
+ result.soft_pending_compaction_bytes_limit =
+ result.hard_pending_compaction_bytes_limit;
+ }
+
+#ifndef ROCKSDB_LITE
+ // When the DB is stopped, it's possible that there are some .trash files
+ // that were not deleted yet. When we open the DB we will find these .trash
+ // files and schedule them to be deleted (or delete them immediately if an
+ // SstFileManager was not used).
+ auto sfm = static_cast<SstFileManagerImpl*>(db_options.sst_file_manager.get());
+ for (size_t i = 0; i < result.cf_paths.size(); i++) {
+ DeleteScheduler::CleanupDirectory(db_options.env, sfm, result.cf_paths[i].path);
+ }
+#endif
+
+ if (result.cf_paths.empty()) {
+ result.cf_paths = db_options.db_paths;
+ }
+
+ if (result.level_compaction_dynamic_level_bytes) {
+ if (result.compaction_style != kCompactionStyleLevel ||
+ result.cf_paths.size() > 1U) {
+ // 1. level_compaction_dynamic_level_bytes only makes sense for
+ // level-based compaction.
+ // 2. we don't yet know how to make this feature and multiple
+ // DB paths work together.
+ result.level_compaction_dynamic_level_bytes = false;
+ }
+ }
+
+ if (result.max_compaction_bytes == 0) {
+ result.max_compaction_bytes = result.target_file_size_base * 25;
+ }
+
+ bool is_block_based_table =
+ (result.table_factory->Name() == BlockBasedTableFactory().Name());
+
+ const uint64_t kAdjustedTtl = 30 * 24 * 60 * 60;
+ if (result.ttl == kDefaultTtl) {
+ if (is_block_based_table &&
+ result.compaction_style != kCompactionStyleFIFO) {
+ result.ttl = kAdjustedTtl;
+ } else {
+ result.ttl = 0;
+ }
+ }
+
+ const uint64_t kAdjustedPeriodicCompSecs = 30 * 24 * 60 * 60;
+
+ // Turn on periodic compactions and set them to occur once every 30 days if
+ // compaction filters are used and periodic_compaction_seconds is set to the
+ // default value.
+ if (result.compaction_style != kCompactionStyleFIFO) {
+ if ((result.compaction_filter != nullptr ||
+ result.compaction_filter_factory != nullptr) &&
+ result.periodic_compaction_seconds == kDefaultPeriodicCompSecs &&
+ is_block_based_table) {
+ result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs;
+ }
+ } else {
+ // result.compaction_style == kCompactionStyleFIFO
+ if (result.ttl == 0) {
+ if (is_block_based_table) {
+ if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs) {
+ result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs;
+ }
+ result.ttl = result.periodic_compaction_seconds;
+ }
+ } else if (result.periodic_compaction_seconds != 0) {
+ result.ttl = std::min(result.ttl, result.periodic_compaction_seconds);
+ }
+ }
+
+ // TTL compactions work similarly to Periodic Compactions in Universal
+ // compaction in most cases. So, if ttl is set, execute the periodic
+ // compaction codepath.
+ if (result.compaction_style == kCompactionStyleUniversal && result.ttl != 0) {
+ if (result.periodic_compaction_seconds != 0) {
+ result.periodic_compaction_seconds =
+ std::min(result.ttl, result.periodic_compaction_seconds);
+ } else {
+ result.periodic_compaction_seconds = result.ttl;
+ }
+ }
+
+ if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs) {
+ result.periodic_compaction_seconds = 0;
+ }
+
+ return result;
+}
+
+int SuperVersion::dummy = 0;
+void* const SuperVersion::kSVInUse = &SuperVersion::dummy;
+void* const SuperVersion::kSVObsolete = nullptr;
+
+SuperVersion::~SuperVersion() {
+ for (auto td : to_delete) {
+ delete td;
+ }
+}
+
+SuperVersion* SuperVersion::Ref() {
+ refs.fetch_add(1, std::memory_order_relaxed);
+ return this;
+}
+
+bool SuperVersion::Unref() {
+ // fetch_sub returns the previous value of refs
+ uint32_t previous_refs = refs.fetch_sub(1);
+ assert(previous_refs > 0);
+ return previous_refs == 1;
+}
+
+void SuperVersion::Cleanup() {
+ assert(refs.load(std::memory_order_relaxed) == 0);
+ imm->Unref(&to_delete);
+ MemTable* m = mem->Unref();
+ if (m != nullptr) {
+ auto* memory_usage = current->cfd()->imm()->current_memory_usage();
+ assert(*memory_usage >= m->ApproximateMemoryUsage());
+ *memory_usage -= m->ApproximateMemoryUsage();
+ to_delete.push_back(m);
+ }
+ current->Unref();
+ if (cfd->Unref()) {
+ delete cfd;
+ }
+}
+
+void SuperVersion::Init(ColumnFamilyData* new_cfd, MemTable* new_mem,
+ MemTableListVersion* new_imm, Version* new_current) {
+ cfd = new_cfd;
+ mem = new_mem;
+ imm = new_imm;
+ current = new_current;
+ cfd->Ref();
+ mem->Ref();
+ imm->Ref();
+ current->Ref();
+ refs.store(1, std::memory_order_relaxed);
+}
+
+namespace {
+void SuperVersionUnrefHandle(void* ptr) {
+ // UnrefHandle is called when a thread exits or a ThreadLocalPtr gets
+ // destroyed. When the former happens, the thread shouldn't see kSVInUse.
+ // When the latter happens, we are in ~ColumnFamilyData(), so no get should
+ // happen either.
+ SuperVersion* sv = static_cast<SuperVersion*>(ptr);
+ bool was_last_ref __attribute__((__unused__));
+ was_last_ref = sv->Unref();
+ // Thread-local SuperVersions can't outlive ColumnFamilyData::super_version_.
+ // This is important because we can't do SuperVersion cleanup here.
+ // That would require locking DB mutex, which would deadlock because
+ // SuperVersionUnrefHandle is called with locked ThreadLocalPtr mutex.
+ assert(!was_last_ref);
+}
+} // anonymous namespace
+
+ColumnFamilyData::ColumnFamilyData(
+ uint32_t id, const std::string& name, Version* _dummy_versions,
+ Cache* _table_cache, WriteBufferManager* write_buffer_manager,
+ const ColumnFamilyOptions& cf_options, const ImmutableDBOptions& db_options,
+ const FileOptions& file_options, ColumnFamilySet* column_family_set,
+ BlockCacheTracer* const block_cache_tracer)
+ : id_(id),
+ name_(name),
+ dummy_versions_(_dummy_versions),
+ current_(nullptr),
+ refs_(0),
+ initialized_(false),
+ dropped_(false),
+ internal_comparator_(cf_options.comparator),
+ initial_cf_options_(SanitizeOptions(db_options, cf_options)),
+ ioptions_(db_options, initial_cf_options_),
+ mutable_cf_options_(initial_cf_options_),
+ is_delete_range_supported_(
+ cf_options.table_factory->IsDeleteRangeSupported()),
+ write_buffer_manager_(write_buffer_manager),
+ mem_(nullptr),
+ imm_(ioptions_.min_write_buffer_number_to_merge,
+ ioptions_.max_write_buffer_number_to_maintain,
+ ioptions_.max_write_buffer_size_to_maintain),
+ super_version_(nullptr),
+ super_version_number_(0),
+ local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)),
+ next_(nullptr),
+ prev_(nullptr),
+ log_number_(0),
+ flush_reason_(FlushReason::kOthers),
+ column_family_set_(column_family_set),
+ queued_for_flush_(false),
+ queued_for_compaction_(false),
+ prev_compaction_needed_bytes_(0),
+ allow_2pc_(db_options.allow_2pc),
+ last_memtable_id_(0) {
+ Ref();
+
+ // Convert user defined table properties collector factories to internal ones.
+ GetIntTblPropCollectorFactory(ioptions_, &int_tbl_prop_collector_factories_);
+
+ // if _dummy_versions is nullptr, then this is a dummy column family.
+ if (_dummy_versions != nullptr) {
+ internal_stats_.reset(
+ new InternalStats(ioptions_.num_levels, db_options.env, this));
+ table_cache_.reset(new TableCache(ioptions_, file_options, _table_cache,
+ block_cache_tracer));
+ if (ioptions_.compaction_style == kCompactionStyleLevel) {
+ compaction_picker_.reset(
+ new LevelCompactionPicker(ioptions_, &internal_comparator_));
+#ifndef ROCKSDB_LITE
+ } else if (ioptions_.compaction_style == kCompactionStyleUniversal) {
+ compaction_picker_.reset(
+ new UniversalCompactionPicker(ioptions_, &internal_comparator_));
+ } else if (ioptions_.compaction_style == kCompactionStyleFIFO) {
+ compaction_picker_.reset(
+ new FIFOCompactionPicker(ioptions_, &internal_comparator_));
+ } else if (ioptions_.compaction_style == kCompactionStyleNone) {
+ compaction_picker_.reset(new NullCompactionPicker(
+ ioptions_, &internal_comparator_));
+ ROCKS_LOG_WARN(ioptions_.info_log,
+ "Column family %s does not use any background compaction. "
+ "Compactions can only be done via CompactFiles\n",
+ GetName().c_str());
+#endif // !ROCKSDB_LITE
+ } else {
+ ROCKS_LOG_ERROR(ioptions_.info_log,
+ "Unable to recognize the specified compaction style %d. "
+ "Column family %s will use kCompactionStyleLevel.\n",
+ ioptions_.compaction_style, GetName().c_str());
+ compaction_picker_.reset(
+ new LevelCompactionPicker(ioptions_, &internal_comparator_));
+ }
+
+ if (column_family_set_->NumberOfColumnFamilies() < 10) {
+ ROCKS_LOG_INFO(ioptions_.info_log,
+ "--------------- Options for column family [%s]:\n",
+ name.c_str());
+ initial_cf_options_.Dump(ioptions_.info_log);
+ } else {
+ ROCKS_LOG_INFO(ioptions_.info_log, "\t(skipping printing options)\n");
+ }
+ }
+
+ RecalculateWriteStallConditions(mutable_cf_options_);
+}
+
+// DB mutex held
+ColumnFamilyData::~ColumnFamilyData() {
+ assert(refs_.load(std::memory_order_relaxed) == 0);
+ // remove from linked list
+ auto prev = prev_;
+ auto next = next_;
+ prev->next_ = next;
+ next->prev_ = prev;
+
+ if (!dropped_ && column_family_set_ != nullptr) {
+ // If it's dropped, it's already removed from column family set
+ // If column_family_set_ == nullptr, this is dummy CFD and not in
+ // ColumnFamilySet
+ column_family_set_->RemoveColumnFamily(this);
+ }
+
+ if (current_ != nullptr) {
+ current_->Unref();
+ }
+
+ // It would be wrong if this ColumnFamilyData is in flush_queue_ or
+ // compaction_queue_ and we destroyed it
+ assert(!queued_for_flush_);
+ assert(!queued_for_compaction_);
+ assert(super_version_ == nullptr);
+
+ if (dummy_versions_ != nullptr) {
+ // List must be empty
+ assert(dummy_versions_->TEST_Next() == dummy_versions_);
+ bool deleted __attribute__((__unused__));
+ deleted = dummy_versions_->Unref();
+ assert(deleted);
+ }
+
+ if (mem_ != nullptr) {
+ delete mem_->Unref();
+ }
+ autovector<MemTable*> to_delete;
+ imm_.current()->Unref(&to_delete);
+ for (MemTable* m : to_delete) {
+ delete m;
+ }
+}
+
+bool ColumnFamilyData::UnrefAndTryDelete() {
+ int old_refs = refs_.fetch_sub(1);
+ assert(old_refs > 0);
+
+ if (old_refs == 1) {
+ assert(super_version_ == nullptr);
+ delete this;
+ return true;
+ }
+
+ if (old_refs == 2 && super_version_ != nullptr) {
+ // Only the super_version_ holds me
+ SuperVersion* sv = super_version_;
+ super_version_ = nullptr;
+ // Release SuperVersion reference kept in ThreadLocalPtr.
+ // This must be done outside of mutex_ since unref handler can lock mutex.
+ sv->db_mutex->Unlock();
+ local_sv_.reset();
+ sv->db_mutex->Lock();
+
+ if (sv->Unref()) {
+ // May delete this ColumnFamilyData after calling Cleanup()
+ sv->Cleanup();
+ delete sv;
+ return true;
+ }
+ }
+ return false;
+}
+
+void ColumnFamilyData::SetDropped() {
+ // can't drop default CF
+ assert(id_ != 0);
+ dropped_ = true;
+ write_controller_token_.reset();
+
+ // remove from column_family_set
+ column_family_set_->RemoveColumnFamily(this);
+}
+
+ColumnFamilyOptions ColumnFamilyData::GetLatestCFOptions() const {
+ return BuildColumnFamilyOptions(initial_cf_options_, mutable_cf_options_);
+}
+
+uint64_t ColumnFamilyData::OldestLogToKeep() {
+ auto current_log = GetLogNumber();
+
+ if (allow_2pc_) {
+ autovector<MemTable*> empty_list;
+ auto imm_prep_log =
+ imm()->PrecomputeMinLogContainingPrepSection(empty_list);
+ auto mem_prep_log = mem()->GetMinLogContainingPrepSection();
+
+ if (imm_prep_log > 0 && imm_prep_log < current_log) {
+ current_log = imm_prep_log;
+ }
+
+ if (mem_prep_log > 0 && mem_prep_log < current_log) {
+ current_log = mem_prep_log;
+ }
+ }
+
+ return current_log;
+}
+
+const double kIncSlowdownRatio = 0.8;
+const double kDecSlowdownRatio = 1 / kIncSlowdownRatio;
+const double kNearStopSlowdownRatio = 0.6;
+const double kDelayRecoverSlowdownRatio = 1.4;
+
+namespace {
+// If penalize_stop is true, we further reduce slowdown rate.
+std::unique_ptr<WriteControllerToken> SetupDelay(
+ WriteController* write_controller, uint64_t compaction_needed_bytes,
+ uint64_t prev_compaction_need_bytes, bool penalize_stop,
+ bool auto_compactions_disabled) {
+ const uint64_t kMinWriteRate = 16 * 1024u; // Minimum write rate 16KB/s.
+
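+ // Summary of the adjustments below: the current delayed write rate is
+ // multiplied by kNearStopSlowdownRatio (0.6) near a stop, by
+ // kIncSlowdownRatio (0.8) while compaction debt is not shrinking, and by
+ // kDecSlowdownRatio (1 / 0.8) once debt shrinks, clamped to
+ // [kMinWriteRate, max_delayed_write_rate].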
+ uint64_t max_write_rate = write_controller->max_delayed_write_rate();
+ uint64_t write_rate = write_controller->delayed_write_rate();
+
+ if (auto_compactions_disabled) {
+ // When auto compaction is disabled, always use the value the user gave.
+ write_rate = max_write_rate;
+ } else if (write_controller->NeedsDelay() && max_write_rate > kMinWriteRate) {
+ // If user gives rate less than kMinWriteRate, don't adjust it.
+ //
+ // If already delayed, need to adjust based on previous compaction debt.
+ // When two or more column families require delay, we always
+ // increase or reduce the write rate based on information for one single
+ // column family. It is likely to be OK, but we can improve it if this
+ // becomes a problem.
+ // Ignore compaction_needed_bytes = 0 case because compaction_needed_bytes
+ // is only available in level-based compaction
+ //
+ // If the compaction debt stays the same as previously, we also further slow
+ // down. It usually means a mem table is full. It's mainly for the case
+ // where both of flush and compaction are much slower than the speed we
+ // insert to mem tables, so we need to actively slow down before we get
+ // feedback signal from compaction and flushes to avoid the full stop
+ // because of hitting the max write buffer number.
+ //
+ // If the DB just fell into the stop condition, we need to further reduce
+ // the write rate to avoid the stop condition.
+ if (penalize_stop) {
+ // Penalize the near stop or stop condition by more aggressive slowdown.
+ // This is to provide the long term slowdown increase signal.
+ // The penalty is more than the reward of recovering to the normal
+ // condition.
+ write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) *
+ kNearStopSlowdownRatio);
+ if (write_rate < kMinWriteRate) {
+ write_rate = kMinWriteRate;
+ }
+ } else if (prev_compaction_need_bytes > 0 &&
+ prev_compaction_need_bytes <= compaction_needed_bytes) {
+ write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) *
+ kIncSlowdownRatio);
+ if (write_rate < kMinWriteRate) {
+ write_rate = kMinWriteRate;
+ }
+ } else if (prev_compaction_need_bytes > compaction_needed_bytes) {
+ // We speed up by a ratio of kDecSlowdownRatio when we have paid down
+ // compaction debt, but we'll never speed up past the write rate
+ // given by the user.
+ write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) *
+ kDecSlowdownRatio);
+ if (write_rate > max_write_rate) {
+ write_rate = max_write_rate;
+ }
+ }
+ }
+ return write_controller->GetDelayToken(write_rate);
+}
+
+int GetL0ThresholdSpeedupCompaction(int level0_file_num_compaction_trigger,
+ int level0_slowdown_writes_trigger) {
+ // SanitizeOptions() ensures it.
+ assert(level0_file_num_compaction_trigger <= level0_slowdown_writes_trigger);
+
+ if (level0_file_num_compaction_trigger < 0) {
+ return std::numeric_limits<int>::max();
+ }
+
+ const int64_t twice_level0_trigger =
+ static_cast<int64_t>(level0_file_num_compaction_trigger) * 2;
+
+ const int64_t one_fourth_trigger_slowdown =
+ static_cast<int64_t>(level0_file_num_compaction_trigger) +
+ ((level0_slowdown_writes_trigger - level0_file_num_compaction_trigger) /
+ 4);
+
+ assert(twice_level0_trigger >= 0);
+ assert(one_fourth_trigger_slowdown >= 0);
+
+ // 1/4 of the way between the L0 compaction trigger threshold and the
+ // slowdown condition, or twice the compaction trigger, whichever is
+ // smaller.
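+ // Example: with level0_file_num_compaction_trigger == 4 and
+ // level0_slowdown_writes_trigger == 20, both candidates evaluate to 8,
+ // so compaction is sped up once there are 8 L0 files.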
+ int64_t res = std::min(twice_level0_trigger, one_fourth_trigger_slowdown);
+ if (res >= port::kMaxInt32) {
+ return port::kMaxInt32;
+ } else {
+ // res fits in int
+ return static_cast<int>(res);
+ }
+}
+} // namespace
+
+std::pair<WriteStallCondition, ColumnFamilyData::WriteStallCause>
+ColumnFamilyData::GetWriteStallConditionAndCause(
+ int num_unflushed_memtables, int num_l0_files,
+ uint64_t num_compaction_needed_bytes,
+ const MutableCFOptions& mutable_cf_options) {
+ if (num_unflushed_memtables >= mutable_cf_options.max_write_buffer_number) {
+ return {WriteStallCondition::kStopped, WriteStallCause::kMemtableLimit};
+ } else if (!mutable_cf_options.disable_auto_compactions &&
+ num_l0_files >= mutable_cf_options.level0_stop_writes_trigger) {
+ return {WriteStallCondition::kStopped, WriteStallCause::kL0FileCountLimit};
+ } else if (!mutable_cf_options.disable_auto_compactions &&
+ mutable_cf_options.hard_pending_compaction_bytes_limit > 0 &&
+ num_compaction_needed_bytes >=
+ mutable_cf_options.hard_pending_compaction_bytes_limit) {
+ return {WriteStallCondition::kStopped,
+ WriteStallCause::kPendingCompactionBytes};
+ } else if (mutable_cf_options.max_write_buffer_number > 3 &&
+ num_unflushed_memtables >=
+ mutable_cf_options.max_write_buffer_number - 1) {
+ return {WriteStallCondition::kDelayed, WriteStallCause::kMemtableLimit};
+ } else if (!mutable_cf_options.disable_auto_compactions &&
+ mutable_cf_options.level0_slowdown_writes_trigger >= 0 &&
+ num_l0_files >=
+ mutable_cf_options.level0_slowdown_writes_trigger) {
+ return {WriteStallCondition::kDelayed, WriteStallCause::kL0FileCountLimit};
+ } else if (!mutable_cf_options.disable_auto_compactions &&
+ mutable_cf_options.soft_pending_compaction_bytes_limit > 0 &&
+ num_compaction_needed_bytes >=
+ mutable_cf_options.soft_pending_compaction_bytes_limit) {
+ return {WriteStallCondition::kDelayed,
+ WriteStallCause::kPendingCompactionBytes};
+ }
+ return {WriteStallCondition::kNormal, WriteStallCause::kNone};
+}
+
+WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions(
+ const MutableCFOptions& mutable_cf_options) {
+ auto write_stall_condition = WriteStallCondition::kNormal;
+ if (current_ != nullptr) {
+ auto* vstorage = current_->storage_info();
+ auto write_controller = column_family_set_->write_controller_;
+ uint64_t compaction_needed_bytes =
+ vstorage->estimated_compaction_needed_bytes();
+
+ auto write_stall_condition_and_cause = GetWriteStallConditionAndCause(
+ imm()->NumNotFlushed(), vstorage->l0_delay_trigger_count(),
+ vstorage->estimated_compaction_needed_bytes(), mutable_cf_options);
+ write_stall_condition = write_stall_condition_and_cause.first;
+ auto write_stall_cause = write_stall_condition_and_cause.second;
+
+ bool was_stopped = write_controller->IsStopped();
+ bool needed_delay = write_controller->NeedsDelay();
+
+ if (write_stall_condition == WriteStallCondition::kStopped &&
+ write_stall_cause == WriteStallCause::kMemtableLimit) {
+ write_controller_token_ = write_controller->GetStopToken();
+ internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_STOPS, 1);
+ ROCKS_LOG_WARN(
+ ioptions_.info_log,
+ "[%s] Stopping writes because we have %d immutable memtables "
+ "(waiting for flush), max_write_buffer_number is set to %d",
+ name_.c_str(), imm()->NumNotFlushed(),
+ mutable_cf_options.max_write_buffer_number);
+ } else if (write_stall_condition == WriteStallCondition::kStopped &&
+ write_stall_cause == WriteStallCause::kL0FileCountLimit) {
+ write_controller_token_ = write_controller->GetStopToken();
+ internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_STOPS, 1);
+ if (compaction_picker_->IsLevel0CompactionInProgress()) {
+ internal_stats_->AddCFStats(
+ InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_STOPS, 1);
+ }
+ ROCKS_LOG_WARN(ioptions_.info_log,
+ "[%s] Stopping writes because we have %d level-0 files",
+ name_.c_str(), vstorage->l0_delay_trigger_count());
+ } else if (write_stall_condition == WriteStallCondition::kStopped &&
+ write_stall_cause == WriteStallCause::kPendingCompactionBytes) {
+ write_controller_token_ = write_controller->GetStopToken();
+ internal_stats_->AddCFStats(
+ InternalStats::PENDING_COMPACTION_BYTES_LIMIT_STOPS, 1);
+ ROCKS_LOG_WARN(
+ ioptions_.info_log,
+ "[%s] Stopping writes because of estimated pending compaction "
+ "bytes %" PRIu64,
+ name_.c_str(), compaction_needed_bytes);
+ } else if (write_stall_condition == WriteStallCondition::kDelayed &&
+ write_stall_cause == WriteStallCause::kMemtableLimit) {
+ write_controller_token_ =
+ SetupDelay(write_controller, compaction_needed_bytes,
+ prev_compaction_needed_bytes_, was_stopped,
+ mutable_cf_options.disable_auto_compactions);
+ internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_SLOWDOWNS, 1);
+ ROCKS_LOG_WARN(
+ ioptions_.info_log,
+ "[%s] Stalling writes because we have %d immutable memtables "
+ "(waiting for flush), max_write_buffer_number is set to %d "
+ "rate %" PRIu64,
+ name_.c_str(), imm()->NumNotFlushed(),
+ mutable_cf_options.max_write_buffer_number,
+ write_controller->delayed_write_rate());
+ } else if (write_stall_condition == WriteStallCondition::kDelayed &&
+ write_stall_cause == WriteStallCause::kL0FileCountLimit) {
+ // L0 is within two files of the stop trigger.
+ bool near_stop = vstorage->l0_delay_trigger_count() >=
+ mutable_cf_options.level0_stop_writes_trigger - 2;
+ write_controller_token_ =
+ SetupDelay(write_controller, compaction_needed_bytes,
+ prev_compaction_needed_bytes_, was_stopped || near_stop,
+ mutable_cf_options.disable_auto_compactions);
+ internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+ 1);
+ if (compaction_picker_->IsLevel0CompactionInProgress()) {
+ internal_stats_->AddCFStats(
+ InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS, 1);
+ }
+ ROCKS_LOG_WARN(ioptions_.info_log,
+ "[%s] Stalling writes because we have %d level-0 files "
+ "rate %" PRIu64,
+ name_.c_str(), vstorage->l0_delay_trigger_count(),
+ write_controller->delayed_write_rate());
+ } else if (write_stall_condition == WriteStallCondition::kDelayed &&
+ write_stall_cause == WriteStallCause::kPendingCompactionBytes) {
+ // If the distance to the hard limit is less than 1/4 of the gap between
+ // the soft and hard bytes limits, we think it is near stop and speed up
+ // the slowdown.
+ bool near_stop =
+ mutable_cf_options.hard_pending_compaction_bytes_limit > 0 &&
+ (compaction_needed_bytes -
+ mutable_cf_options.soft_pending_compaction_bytes_limit) >
+ 3 * (mutable_cf_options.hard_pending_compaction_bytes_limit -
+ mutable_cf_options.soft_pending_compaction_bytes_limit) /
+ 4;
+
+ write_controller_token_ =
+ SetupDelay(write_controller, compaction_needed_bytes,
+ prev_compaction_needed_bytes_, was_stopped || near_stop,
+ mutable_cf_options.disable_auto_compactions);
+ internal_stats_->AddCFStats(
+ InternalStats::PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS, 1);
+ ROCKS_LOG_WARN(
+ ioptions_.info_log,
+ "[%s] Stalling writes because of estimated pending compaction "
+ "bytes %" PRIu64 " rate %" PRIu64,
+ name_.c_str(), vstorage->estimated_compaction_needed_bytes(),
+ write_controller->delayed_write_rate());
+ } else {
+ assert(write_stall_condition == WriteStallCondition::kNormal);
+ if (vstorage->l0_delay_trigger_count() >=
+ GetL0ThresholdSpeedupCompaction(
+ mutable_cf_options.level0_file_num_compaction_trigger,
+ mutable_cf_options.level0_slowdown_writes_trigger)) {
+ write_controller_token_ =
+ write_controller->GetCompactionPressureToken();
+ ROCKS_LOG_INFO(
+ ioptions_.info_log,
+ "[%s] Increasing compaction threads because we have %d level-0 "
+ "files ",
+ name_.c_str(), vstorage->l0_delay_trigger_count());
+ } else if (vstorage->estimated_compaction_needed_bytes() >=
+ mutable_cf_options.soft_pending_compaction_bytes_limit / 4) {
+ // Increase compaction threads if bytes needed for compaction exceeds
+ // 1/4 of threshold for slowing down.
+ // If soft pending compaction byte limit is not set, always speed up
+ // compaction.
+ write_controller_token_ =
+ write_controller->GetCompactionPressureToken();
+ if (mutable_cf_options.soft_pending_compaction_bytes_limit > 0) {
+ ROCKS_LOG_INFO(
+ ioptions_.info_log,
+ "[%s] Increasing compaction threads because of estimated pending "
+ "compaction "
+ "bytes %" PRIu64,
+ name_.c_str(), vstorage->estimated_compaction_needed_bytes());
+ }
+ } else {
+ write_controller_token_.reset();
+ }
+ // If the DB recovers from delay conditions, we reward it by reducing the
+ // slowdown by roughly double the slowdown ratio. This is to balance the
+ // long term slowdown increase signal.
+ if (needed_delay) {
+ uint64_t write_rate = write_controller->delayed_write_rate();
+ write_controller->set_delayed_write_rate(static_cast<uint64_t>(
+ static_cast<double>(write_rate) * kDelayRecoverSlowdownRatio));
+ // Set the low pri limit to be 1/4 the delayed write rate.
+ // Note we don't reset this value even after the delay condition is
+ // released. The low-pri rate will continue to apply if there is
+ // compaction pressure.
+ write_controller->low_pri_rate_limiter()->SetBytesPerSecond(write_rate /
+ 4);
+ }
+ }
+ prev_compaction_needed_bytes_ = compaction_needed_bytes;
+ }
+ return write_stall_condition;
+}
+
+const FileOptions* ColumnFamilyData::soptions() const {
+ return &(column_family_set_->file_options_);
+}
+
+void ColumnFamilyData::SetCurrent(Version* current_version) {
+ current_ = current_version;
+}
+
+uint64_t ColumnFamilyData::GetNumLiveVersions() const {
+ return VersionSet::GetNumLiveVersions(dummy_versions_);
+}
+
+uint64_t ColumnFamilyData::GetTotalSstFilesSize() const {
+ return VersionSet::GetTotalSstFilesSize(dummy_versions_);
+}
+
+uint64_t ColumnFamilyData::GetLiveSstFilesSize() const {
+ return current_->GetSstFilesSize();
+}
+
+MemTable* ColumnFamilyData::ConstructNewMemtable(
+ const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) {
+ return new MemTable(internal_comparator_, ioptions_, mutable_cf_options,
+ write_buffer_manager_, earliest_seq, id_);
+}
+
+void ColumnFamilyData::CreateNewMemtable(
+ const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) {
+ if (mem_ != nullptr) {
+ delete mem_->Unref();
+ }
+ SetMemtable(ConstructNewMemtable(mutable_cf_options, earliest_seq));
+ mem_->Ref();
+}
+
+bool ColumnFamilyData::NeedsCompaction() const {
+ return compaction_picker_->NeedsCompaction(current_->storage_info());
+}
+
+Compaction* ColumnFamilyData::PickCompaction(
+ const MutableCFOptions& mutable_options, LogBuffer* log_buffer) {
+ SequenceNumber earliest_mem_seqno =
+ std::min(mem_->GetEarliestSequenceNumber(),
+ imm_.current()->GetEarliestSequenceNumber(false));
+ auto* result = compaction_picker_->PickCompaction(
+ GetName(), mutable_options, current_->storage_info(), log_buffer,
+ earliest_mem_seqno);
+ if (result != nullptr) {
+ result->SetInputVersion(current_);
+ }
+ return result;
+}
+
+bool ColumnFamilyData::RangeOverlapWithCompaction(
+ const Slice& smallest_user_key, const Slice& largest_user_key,
+ int level) const {
+ return compaction_picker_->RangeOverlapWithCompaction(
+ smallest_user_key, largest_user_key, level);
+}
+
+Status ColumnFamilyData::RangesOverlapWithMemtables(
+ const autovector<Range>& ranges, SuperVersion* super_version,
+ bool* overlap) {
+ assert(overlap != nullptr);
+ *overlap = false;
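+ // Strategy: merge the active and immutable memtables into one iterator
+ // (plus a range-tombstone aggregator), then seek to the start of each
+ // range and check whether the first key found, or any range tombstone,
+ // falls inside [start, limit].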
+ // Create an InternalIterator over all unflushed memtables
+ Arena arena;
+ ReadOptions read_opts;
+ read_opts.total_order_seek = true;
+ MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena);
+ merge_iter_builder.AddIterator(
+ super_version->mem->NewIterator(read_opts, &arena));
+ super_version->imm->AddIterators(read_opts, &merge_iter_builder);
+ ScopedArenaIterator memtable_iter(merge_iter_builder.Finish());
+
+ auto read_seq = super_version->current->version_set()->LastSequence();
+ ReadRangeDelAggregator range_del_agg(&internal_comparator_, read_seq);
+ auto* active_range_del_iter =
+ super_version->mem->NewRangeTombstoneIterator(read_opts, read_seq);
+ range_del_agg.AddTombstones(
+ std::unique_ptr<FragmentedRangeTombstoneIterator>(active_range_del_iter));
+ super_version->imm->AddRangeTombstoneIterators(read_opts, nullptr /* arena */,
+ &range_del_agg);
+
+ Status status;
+ for (size_t i = 0; i < ranges.size() && status.ok() && !*overlap; ++i) {
+ auto* vstorage = super_version->current->storage_info();
+ auto* ucmp = vstorage->InternalComparator()->user_comparator();
+ InternalKey range_start(ranges[i].start, kMaxSequenceNumber,
+ kValueTypeForSeek);
+ memtable_iter->Seek(range_start.Encode());
+ status = memtable_iter->status();
+ ParsedInternalKey seek_result;
+ if (status.ok()) {
+ if (memtable_iter->Valid() &&
+ !ParseInternalKey(memtable_iter->key(), &seek_result)) {
+ status = Status::Corruption("DB have corrupted keys");
+ }
+ }
+ if (status.ok()) {
+ if (memtable_iter->Valid() &&
+ ucmp->Compare(seek_result.user_key, ranges[i].limit) <= 0) {
+ *overlap = true;
+ } else if (range_del_agg.IsRangeOverlapped(ranges[i].start,
+ ranges[i].limit)) {
+ *overlap = true;
+ }
+ }
+ }
+ return status;
+}
+
+const int ColumnFamilyData::kCompactAllLevels = -1;
+const int ColumnFamilyData::kCompactToBaseLevel = -2;
+
+Compaction* ColumnFamilyData::CompactRange(
+ const MutableCFOptions& mutable_cf_options, int input_level,
+ int output_level, const CompactRangeOptions& compact_range_options,
+ const InternalKey* begin, const InternalKey* end,
+ InternalKey** compaction_end, bool* conflict,
+ uint64_t max_file_num_to_ignore) {
+ auto* result = compaction_picker_->CompactRange(
+ GetName(), mutable_cf_options, current_->storage_info(), input_level,
+ output_level, compact_range_options, begin, end, compaction_end, conflict,
+ max_file_num_to_ignore);
+ if (result != nullptr) {
+ result->SetInputVersion(current_);
+ }
+ return result;
+}
+
+SuperVersion* ColumnFamilyData::GetReferencedSuperVersion(DBImpl* db) {
+ SuperVersion* sv = GetThreadLocalSuperVersion(db);
+ sv->Ref();
+ if (!ReturnThreadLocalSuperVersion(sv)) {
+ // This Unref() corresponds to the Ref() in GetThreadLocalSuperVersion()
+ // when the thread-local pointer was populated. So, the Ref() earlier in
+ // this function still prevents the returned SuperVersion* from being
+ // deleted out from under the caller.
+ sv->Unref();
+ }
+ return sv;
+}
+
+SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(DBImpl* db) {
+ // The SuperVersion is cached in thread local storage to avoid acquiring
+ // mutex when SuperVersion does not change since the last use. When a new
+ // SuperVersion is installed, the compaction or flush thread cleans up
+ // cached SuperVersion in all existing thread local storage. To avoid
+ // acquiring mutex for this operation, we use atomic Swap() on the thread
+ // local pointer to guarantee exclusive access. If the thread local pointer
+ // is being used while a new SuperVersion is installed, the cached
+ // SuperVersion can become stale. In that case, the background thread would
+ // have swapped in kSVObsolete. We re-check the value when returning the
+ // SuperVersion back to thread local, with an atomic compare and swap.
+ // The SuperVersion will need to be released if it is detected to be stale.
+ void* ptr = local_sv_->Swap(SuperVersion::kSVInUse);
+ // Invariant:
+ // (1) Scrape (always) installs kSVObsolete in ThreadLocal storage
+ // (2) the Swap above (always) installs kSVInUse, ThreadLocal storage
+ // should only keep kSVInUse before ReturnThreadLocalSuperVersion call
+ // (if no Scrape happens).
+ assert(ptr != SuperVersion::kSVInUse);
+ SuperVersion* sv = static_cast<SuperVersion*>(ptr);
+ if (sv == SuperVersion::kSVObsolete ||
+ sv->version_number != super_version_number_.load()) {
+ RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_ACQUIRES);
+ SuperVersion* sv_to_delete = nullptr;
+
+ if (sv && sv->Unref()) {
+ RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_CLEANUPS);
+ db->mutex()->Lock();
+ // NOTE: underlying resources held by superversion (sst files) might
+ // not be released until the next background job.
+ sv->Cleanup();
+ if (db->immutable_db_options().avoid_unnecessary_blocking_io) {
+ db->AddSuperVersionsToFreeQueue(sv);
+ db->SchedulePurge();
+ } else {
+ sv_to_delete = sv;
+ }
+ } else {
+ db->mutex()->Lock();
+ }
+ sv = super_version_->Ref();
+ db->mutex()->Unlock();
+
+ delete sv_to_delete;
+ }
+ assert(sv != nullptr);
+ return sv;
+}
+
+bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) {
+ assert(sv != nullptr);
+ // Put the SuperVersion back
+ void* expected = SuperVersion::kSVInUse;
+ if (local_sv_->CompareAndSwap(static_cast<void*>(sv), expected)) {
+ // When we see kSVInUse in the ThreadLocal, we are sure ThreadLocal
+ // storage has not been altered and no Scrape has happened. The
+ // SuperVersion is still current.
+ return true;
+ } else {
+ // ThreadLocal scrape happened in the process of this GetImpl call (after
+ // thread local Swap() at the beginning and before CompareAndSwap()).
+ // This means the SuperVersion it holds is obsolete.
+ assert(expected == SuperVersion::kSVObsolete);
+ }
+ return false;
+}
+
+void ColumnFamilyData::InstallSuperVersion(
+ SuperVersionContext* sv_context, InstrumentedMutex* db_mutex) {
+ db_mutex->AssertHeld();
+ return InstallSuperVersion(sv_context, db_mutex, mutable_cf_options_);
+}
+
+void ColumnFamilyData::InstallSuperVersion(
+ SuperVersionContext* sv_context, InstrumentedMutex* db_mutex,
+ const MutableCFOptions& mutable_cf_options) {
+ SuperVersion* new_superversion = sv_context->new_superversion.release();
+ new_superversion->db_mutex = db_mutex;
+ new_superversion->mutable_cf_options = mutable_cf_options;
+ new_superversion->Init(this, mem_, imm_.current(), current_);
+ SuperVersion* old_superversion = super_version_;
+ super_version_ = new_superversion;
+ ++super_version_number_;
+ super_version_->version_number = super_version_number_;
+ super_version_->write_stall_condition =
+ RecalculateWriteStallConditions(mutable_cf_options);
+
+ if (old_superversion != nullptr) {
+ // Reset SuperVersions cached in thread local storage.
+ // This should be done before old_superversion->Unref(). That's to ensure
+ // that local_sv_ never holds the last reference to SuperVersion, since
+ // it has no means to safely do SuperVersion cleanup.
+ ResetThreadLocalSuperVersions();
+
+ if (old_superversion->mutable_cf_options.write_buffer_size !=
+ mutable_cf_options.write_buffer_size) {
+ mem_->UpdateWriteBufferSize(mutable_cf_options.write_buffer_size);
+ }
+ if (old_superversion->write_stall_condition !=
+ new_superversion->write_stall_condition) {
+ sv_context->PushWriteStallNotification(
+ old_superversion->write_stall_condition,
+ new_superversion->write_stall_condition, GetName(), ioptions());
+ }
+ if (old_superversion->Unref()) {
+ old_superversion->Cleanup();
+ sv_context->superversions_to_free.push_back(old_superversion);
+ }
+ }
+}
+
+void ColumnFamilyData::ResetThreadLocalSuperVersions() {
+ autovector<void*> sv_ptrs;
+ local_sv_->Scrape(&sv_ptrs, SuperVersion::kSVObsolete);
+ for (auto ptr : sv_ptrs) {
+ assert(ptr);
+ if (ptr == SuperVersion::kSVInUse) {
+ continue;
+ }
+ auto sv = static_cast<SuperVersion*>(ptr);
+ bool was_last_ref __attribute__((__unused__));
+ was_last_ref = sv->Unref();
+ // sv couldn't have been the last reference because
+ // ResetThreadLocalSuperVersions() is called before
+ // unref'ing super_version_.
+ assert(!was_last_ref);
+ }
+}
+
+Status ColumnFamilyData::ValidateOptions(
+ const DBOptions& db_options, const ColumnFamilyOptions& cf_options) {
+ Status s;
+ s = CheckCompressionSupported(cf_options);
+ if (s.ok() && db_options.allow_concurrent_memtable_write) {
+ s = CheckConcurrentWritesSupported(cf_options);
+ }
+ if (s.ok() && db_options.unordered_write &&
+ cf_options.max_successive_merges != 0) {
+ s = Status::InvalidArgument(
+ "max_successive_merges > 0 is incompatible with unordered_write");
+ }
+ if (s.ok()) {
+ s = CheckCFPathsSupported(db_options, cf_options);
+ }
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (cf_options.ttl > 0 && cf_options.ttl != kDefaultTtl) {
+ if (cf_options.table_factory->Name() != BlockBasedTableFactory().Name()) {
+ return Status::NotSupported(
+ "TTL is only supported in Block-Based Table format. ");
+ }
+ }
+
+ if (cf_options.periodic_compaction_seconds > 0 &&
+ cf_options.periodic_compaction_seconds != kDefaultPeriodicCompSecs) {
+ if (cf_options.table_factory->Name() != BlockBasedTableFactory().Name()) {
+ return Status::NotSupported(
+ "Periodic Compaction is only supported in "
+ "Block-Based Table format. ");
+ }
+ }
+ return s;
+}
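+
+// For illustration only: one configuration the checks above would reject
+// (assuming a plain-table column family, i.e. not block-based):
+//
+//   ColumnFamilyOptions cf_opts;
+//   cf_opts.table_factory.reset(NewPlainTableFactory());
+//   cf_opts.ttl = 60 * 60 * 24;  // non-zero, non-default TTL
+//   // ValidateOptions(db_opts, cf_opts) -> Status::NotSupported, since TTL
+//   // is only supported with the block-based table format. Similarly,
+//   // unordered_write combined with max_successive_merges > 0 yields
+//   // Status::InvalidArgument.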
+
+#ifndef ROCKSDB_LITE
+Status ColumnFamilyData::SetOptions(
+ const DBOptions& db_options,
+ const std::unordered_map<std::string, std::string>& options_map) {
+ MutableCFOptions new_mutable_cf_options;
+ Status s =
+ GetMutableOptionsFromStrings(mutable_cf_options_, options_map,
+ ioptions_.info_log, &new_mutable_cf_options);
+ if (s.ok()) {
+ ColumnFamilyOptions cf_options =
+ BuildColumnFamilyOptions(initial_cf_options_, new_mutable_cf_options);
+ s = ValidateOptions(db_options, cf_options);
+ }
+ if (s.ok()) {
+ mutable_cf_options_ = new_mutable_cf_options;
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+ }
+ return s;
+}
+#endif // ROCKSDB_LITE
+
+// REQUIRES: DB mutex held
+Env::WriteLifeTimeHint ColumnFamilyData::CalculateSSTWriteHint(int level) {
+ if (initial_cf_options_.compaction_style != kCompactionStyleLevel) {
+ return Env::WLTH_NOT_SET;
+ }
+ if (level == 0) {
+ return Env::WLTH_MEDIUM;
+ }
+ int base_level = current_->storage_info()->base_level();
+
+ // L1: medium, L2: long, ...
+ if (level - base_level >= 2) {
+ return Env::WLTH_EXTREME;
+ } else if (level < base_level) {
+    // There is no restriction that prevents the level passed in from being
+    // smaller than base_level.
+ return Env::WLTH_MEDIUM;
+ }
+ return static_cast<Env::WriteLifeTimeHint>(level - base_level +
+ static_cast<int>(Env::WLTH_MEDIUM));
+}
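+
+// Worked example of the mapping above (level compaction, base_level of 1,
+// and assuming the usual enum ordering WLTH_MEDIUM < WLTH_LONG <
+// WLTH_EXTREME): L0 -> WLTH_MEDIUM, L1 -> WLTH_MEDIUM, L2 -> WLTH_LONG, and
+// L3 or deeper -> WLTH_EXTREME, since level - base_level >= 2 there.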
+
+Status ColumnFamilyData::AddDirectories(
+ std::map<std::string, std::shared_ptr<Directory>>* created_dirs) {
+ Status s;
+ assert(created_dirs != nullptr);
+ assert(data_dirs_.empty());
+ for (auto& p : ioptions_.cf_paths) {
+ auto existing_dir = created_dirs->find(p.path);
+
+ if (existing_dir == created_dirs->end()) {
+ std::unique_ptr<Directory> path_directory;
+ s = DBImpl::CreateAndNewDirectory(ioptions_.env, p.path, &path_directory);
+ if (!s.ok()) {
+ return s;
+ }
+ assert(path_directory != nullptr);
+ data_dirs_.emplace_back(path_directory.release());
+ (*created_dirs)[p.path] = data_dirs_.back();
+ } else {
+ data_dirs_.emplace_back(existing_dir->second);
+ }
+ }
+ assert(data_dirs_.size() == ioptions_.cf_paths.size());
+ return s;
+}
+
+Directory* ColumnFamilyData::GetDataDir(size_t path_id) const {
+ if (data_dirs_.empty()) {
+ return nullptr;
+ }
+
+ assert(path_id < data_dirs_.size());
+ return data_dirs_[path_id].get();
+}
+
+ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
+ const ImmutableDBOptions* db_options,
+ const FileOptions& file_options,
+ Cache* table_cache,
+ WriteBufferManager* write_buffer_manager,
+ WriteController* write_controller,
+ BlockCacheTracer* const block_cache_tracer)
+ : max_column_family_(0),
+ dummy_cfd_(new ColumnFamilyData(
+ 0, "", nullptr, nullptr, nullptr, ColumnFamilyOptions(), *db_options,
+ file_options, nullptr, block_cache_tracer)),
+ default_cfd_cache_(nullptr),
+ db_name_(dbname),
+ db_options_(db_options),
+ file_options_(file_options),
+ table_cache_(table_cache),
+ write_buffer_manager_(write_buffer_manager),
+ write_controller_(write_controller),
+ block_cache_tracer_(block_cache_tracer) {
+ // initialize linked list
+ dummy_cfd_->prev_ = dummy_cfd_;
+ dummy_cfd_->next_ = dummy_cfd_;
+}
+
+ColumnFamilySet::~ColumnFamilySet() {
+ while (column_family_data_.size() > 0) {
+ // cfd destructor will delete itself from column_family_data_
+ auto cfd = column_family_data_.begin()->second;
+ bool last_ref __attribute__((__unused__));
+ last_ref = cfd->UnrefAndTryDelete();
+ assert(last_ref);
+ }
+ bool dummy_last_ref __attribute__((__unused__));
+ dummy_last_ref = dummy_cfd_->UnrefAndTryDelete();
+ assert(dummy_last_ref);
+}
+
+ColumnFamilyData* ColumnFamilySet::GetDefault() const {
+ assert(default_cfd_cache_ != nullptr);
+ return default_cfd_cache_;
+}
+
+ColumnFamilyData* ColumnFamilySet::GetColumnFamily(uint32_t id) const {
+ auto cfd_iter = column_family_data_.find(id);
+ if (cfd_iter != column_family_data_.end()) {
+ return cfd_iter->second;
+ } else {
+ return nullptr;
+ }
+}
+
+ColumnFamilyData* ColumnFamilySet::GetColumnFamily(const std::string& name)
+ const {
+ auto cfd_iter = column_families_.find(name);
+ if (cfd_iter != column_families_.end()) {
+ auto cfd = GetColumnFamily(cfd_iter->second);
+ assert(cfd != nullptr);
+ return cfd;
+ } else {
+ return nullptr;
+ }
+}
+
+uint32_t ColumnFamilySet::GetNextColumnFamilyID() {
+ return ++max_column_family_;
+}
+
+uint32_t ColumnFamilySet::GetMaxColumnFamily() { return max_column_family_; }
+
+void ColumnFamilySet::UpdateMaxColumnFamily(uint32_t new_max_column_family) {
+ max_column_family_ = std::max(new_max_column_family, max_column_family_);
+}
+
+size_t ColumnFamilySet::NumberOfColumnFamilies() const {
+ return column_families_.size();
+}
+
+// under a DB mutex AND write thread
+ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
+ const std::string& name, uint32_t id, Version* dummy_versions,
+ const ColumnFamilyOptions& options) {
+ assert(column_families_.find(name) == column_families_.end());
+ ColumnFamilyData* new_cfd = new ColumnFamilyData(
+ id, name, dummy_versions, table_cache_, write_buffer_manager_, options,
+ *db_options_, file_options_, this, block_cache_tracer_);
+ column_families_.insert({name, id});
+ column_family_data_.insert({id, new_cfd});
+ max_column_family_ = std::max(max_column_family_, id);
+ // add to linked list
+ new_cfd->next_ = dummy_cfd_;
+ auto prev = dummy_cfd_->prev_;
+ new_cfd->prev_ = prev;
+ prev->next_ = new_cfd;
+ dummy_cfd_->prev_ = new_cfd;
+ if (id == 0) {
+ default_cfd_cache_ = new_cfd;
+ }
+ return new_cfd;
+}
+
+// REQUIRES: DB mutex held
+void ColumnFamilySet::FreeDeadColumnFamilies() {
+ autovector<ColumnFamilyData*> to_delete;
+ for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) {
+ if (cfd->refs_.load(std::memory_order_relaxed) == 0) {
+ to_delete.push_back(cfd);
+ }
+ }
+ for (auto cfd : to_delete) {
+ // this is very rare, so it's not a problem that we do it under a mutex
+ delete cfd;
+ }
+}
+
+// under a DB mutex AND from a write thread
+void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
+ auto cfd_iter = column_family_data_.find(cfd->GetID());
+ assert(cfd_iter != column_family_data_.end());
+ column_family_data_.erase(cfd_iter);
+ column_families_.erase(cfd->GetName());
+}
+
+// under a DB mutex OR from a write thread
+bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) {
+ if (column_family_id == 0) {
+ // optimization for common case
+ current_ = column_family_set_->GetDefault();
+ } else {
+ current_ = column_family_set_->GetColumnFamily(column_family_id);
+ }
+ handle_.SetCFD(current_);
+ return current_ != nullptr;
+}
+
+uint64_t ColumnFamilyMemTablesImpl::GetLogNumber() const {
+ assert(current_ != nullptr);
+ return current_->GetLogNumber();
+}
+
+MemTable* ColumnFamilyMemTablesImpl::GetMemTable() const {
+ assert(current_ != nullptr);
+ return current_->mem();
+}
+
+ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() {
+ assert(current_ != nullptr);
+ return &handle_;
+}
+
+uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) {
+ uint32_t column_family_id = 0;
+ if (column_family != nullptr) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ column_family_id = cfh->GetID();
+ }
+ return column_family_id;
+}
+
+const Comparator* GetColumnFamilyUserComparator(
+ ColumnFamilyHandle* column_family) {
+ if (column_family != nullptr) {
+ return column_family->GetComparator();
+ }
+ return nullptr;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/column_family.h b/src/rocksdb/db/column_family.h
new file mode 100644
index 000000000..fcc8ea2cf
--- /dev/null
+++ b/src/rocksdb/db/column_family.h
@@ -0,0 +1,757 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <unordered_map>
+#include <string>
+#include <vector>
+#include <atomic>
+
+#include "db/memtable_list.h"
+#include "db/table_cache.h"
+#include "db/table_properties_collector.h"
+#include "db/write_batch_internal.h"
+#include "db/write_controller.h"
+#include "options/cf_options.h"
+#include "rocksdb/compaction_job_stats.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Version;
+class VersionSet;
+class VersionStorageInfo;
+class MemTable;
+class MemTableListVersion;
+class CompactionPicker;
+class Compaction;
+class InternalKey;
+class InternalStats;
+class ColumnFamilyData;
+class DBImpl;
+class LogBuffer;
+class InstrumentedMutex;
+class InstrumentedMutexLock;
+struct SuperVersionContext;
+
+extern const double kIncSlowdownRatio;
+// This file contains a list of data structures for managing column family
+// level metadata.
+//
+// The basic relationships among classes declared here are illustrated as
+// following:
+//
+// +----------------------+ +----------------------+ +--------+
+// +---+ ColumnFamilyHandle 1 | +--+ ColumnFamilyHandle 2 | | DBImpl |
+// | +----------------------+ | +----------------------+ +----+---+
+// | +--------------------------+ |
+// | | +-----------------------------+
+// | | |
+// | | +-----------------------------v-------------------------------+
+// | | | |
+// | | | ColumnFamilySet |
+// | | | |
+// | | +-------------+--------------------------+----------------+---+
+// | | | | |
+// | +-------------------------------------+ | |
+// | | | | v
+// | +-------------v-------------+ +-----v----v---------+
+// | | | | |
+// | | ColumnFamilyData 1 | | ColumnFamilyData 2 | ......
+// | | | | |
+// +---> | | |
+// | +---------+ | |
+// | | MemTable| | |
+// | | List | | |
+// +--------+---+--+-+----+----+ +--------------------++
+// | | | |
+// | | | |
+// | | | +-----------------------+
+// | | +-----------+ |
+// v +--------+ | |
+// +--------+--------+ | | |
+// | | | | +----------v----------+
+// +---> |SuperVersion 1.a +-----------------> |
+// | +------+ | | MemTableListVersion |
+// +---+-------------+ | | | | |
+// | | | | +----+------------+---+
+// | current | | | | |
+// | +-------------+ | |mem | |
+// | | | | | |
+// +-v---v-------+ +---v--v---+ +-----v----+ +----v-----+
+// | | | | | | | |
+// | Version 1.a | | memtable | | memtable | | memtable |
+// | | | 1.a | | 1.b | | 1.c |
+// +-------------+ | | | | | |
+// +----------+ +----------+ +----------+
+//
+// DBImpl keeps a ColumnFamilySet, which references all column families by
+// pointing to the respective ColumnFamilyData object of each column family.
+// This is how DBImpl can list and operate on all the column families.
+// ColumnFamilyHandle also points to ColumnFamilyData directly, so that
+// when a user executes a query, it can directly find the memtables, Version
+// and SuperVersion of the column family, without going through
+// ColumnFamilySet.
+//
+// ColumnFamilySet points to the latest view of the LSM-tree (list of memtables
+// and SST files) indirectly, while ongoing operations may hold references
+// to a current or an out-of-date SuperVersion, which in turn points to a
+// point-in-time view of the LSM-tree. This guarantees the memtables and SST
+// files being operated on will not go away, until the SuperVersion's
+// reference count drops to 0 and it is destroyed.
+//
+// The following graph illustrates a possible referencing relationships:
+//
+// Column +--------------+ current +-----------+
+// Family +---->+ +------------------->+ |
+// Data | SuperVersion +----------+ | Version A |
+// | 3 | imm | | |
+// Iter2 +----->+ | +-------v------+ +-----------+
+// +-----+--------+ | MemtableList +----------------> Empty
+// | | Version r | +-----------+
+// | +--------------+ | |
+// +------------------+ current| Version B |
+// +--------------+ | +----->+ |
+// | | | | +-----+-----+
+// Compaction +>+ SuperVersion +-------------+ ^
+// Job | 2 +------+ | |current
+// | +----+ | | mem | +------------+
+// +--------------+ | | +---------------------> |
+// | +------------------------> MemTable a |
+// | mem | | |
+// +--------------+ | | +------------+
+// | +--------------------------+
+// Iter1 +-----> SuperVersion | | +------------+
+// | 1 +------------------------------>+ |
+// | +-+ | mem | MemTable b |
+// +--------------+ | | | |
+// | | +--------------+ +-----^------+
+// | |imm | MemtableList | |
+// | +--->+ Version s +------------+
+// | +--------------+
+// | +--------------+
+// | | MemtableList |
+// +------>+ Version t +--------> Empty
+// imm +--------------+
+//
+// In this example, even if the current LSM-tree consists of Version A and
+// memtable a, which is also referenced by SuperVersion3, two older
+// SuperVersions, SuperVersion2 and SuperVersion1, still exist and are
+// referenced by a
+// compaction job and an old iterator Iter1, respectively. SuperVersion2
+// contains Version B, memtable a and memtable b; SuperVersion1 contains
+// Version B and memtable b (mutable). As a result, Version B and memtable b
+// are prevented from being destroyed or deleted.
+
+// ColumnFamilyHandleImpl is the class that clients use to access different
+// column families. It has a non-trivial destructor, which gets called when
+// the client is done using the column family.
+class ColumnFamilyHandleImpl : public ColumnFamilyHandle {
+ public:
+ // create while holding the mutex
+ ColumnFamilyHandleImpl(
+ ColumnFamilyData* cfd, DBImpl* db, InstrumentedMutex* mutex);
+ // destroy without mutex
+ virtual ~ColumnFamilyHandleImpl();
+ virtual ColumnFamilyData* cfd() const { return cfd_; }
+
+ virtual uint32_t GetID() const override;
+ virtual const std::string& GetName() const override;
+ virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) override;
+ virtual const Comparator* GetComparator() const override;
+
+ private:
+ ColumnFamilyData* cfd_;
+ DBImpl* db_;
+ InstrumentedMutex* mutex_;
+};
+
+// Does not ref-count ColumnFamilyData
+// We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter
+// calls DBImpl methods. When this happens, MemTableInserter needs access to
+// a ColumnFamilyHandle (same as the client would need). In that case, we feed
+// MemTableInserter a dummy ColumnFamilyHandle and enable it to call DBImpl
+// methods.
+class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl {
+ public:
+ ColumnFamilyHandleInternal()
+ : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), internal_cfd_(nullptr) {}
+
+ void SetCFD(ColumnFamilyData* _cfd) { internal_cfd_ = _cfd; }
+ virtual ColumnFamilyData* cfd() const override { return internal_cfd_; }
+
+ private:
+ ColumnFamilyData* internal_cfd_;
+};
+
+// holds references to memtable, all immutable memtables and version
+struct SuperVersion {
+ // Accessing members of this class is not thread-safe and requires external
+  // synchronization (i.e. DB mutex held or on the write thread).
+ ColumnFamilyData* cfd;
+ MemTable* mem;
+ MemTableListVersion* imm;
+ Version* current;
+ MutableCFOptions mutable_cf_options;
+ // Version number of the current SuperVersion
+ uint64_t version_number;
+ WriteStallCondition write_stall_condition;
+
+ InstrumentedMutex* db_mutex;
+
+ // should be called outside the mutex
+ SuperVersion() = default;
+ ~SuperVersion();
+ SuperVersion* Ref();
+ // If Unref() returns true, Cleanup() should be called with mutex held
+ // before deleting this SuperVersion.
+ bool Unref();
+
+ // call these two methods with db mutex held
+  // Cleanup unrefs mem, imm and current. Also, it stores all memtables
+  // that need to be deleted in the to_delete vector. Unref'ing those
+  // objects needs to be done while holding the mutex.
+ void Cleanup();
+ void Init(ColumnFamilyData* new_cfd, MemTable* new_mem,
+ MemTableListVersion* new_imm, Version* new_current);
+
+ // The value of dummy is not actually used. kSVInUse takes its address as a
+ // mark in the thread local storage to indicate the SuperVersion is in use
+  // by a thread. This way, the value of kSVInUse is guaranteed not to
+  // conflict with any SuperVersion object address and is portable across
+  // platforms.
+ static int dummy;
+ static void* const kSVInUse;
+ static void* const kSVObsolete;
+
+ private:
+ std::atomic<uint32_t> refs;
+ // We need to_delete because during Cleanup(), imm->Unref() returns
+ // all memtables that we need to free through this vector. We then
+ // delete all those memtables outside of mutex, during destruction
+ autovector<MemTable*> to_delete;
+};
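+
+// A minimal reader-side usage sketch (illustrative only; helper names on
+// DBImpl such as ReturnAndCleanupSuperVersion() are assumptions here, not
+// part of this header):
+//
+//   SuperVersion* sv = cfd->GetReferencedSuperVersion(db);  // already Ref()'d
+//   // ... read through sv->mem, sv->imm and sv->current ...
+//   db->ReturnAndCleanupSuperVersion(cfd, sv);
+//
+// If Unref() were called directly and returned true, Cleanup() would have to
+// run with the DB mutex held before deleting the SuperVersion, and the
+// memtables collected in to_delete would be freed outside of the mutex.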
+
+extern Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options);
+
+extern Status CheckConcurrentWritesSupported(
+ const ColumnFamilyOptions& cf_options);
+
+extern Status CheckCFPathsSupported(const DBOptions& db_options,
+ const ColumnFamilyOptions& cf_options);
+
+extern ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
+ const ColumnFamilyOptions& src);
+// Wrap user-defined table properties collector factories from `cf_options`
+// into internal ones in int_tbl_prop_collector_factories. Add a system internal
+// one too.
+extern void GetIntTblPropCollectorFactory(
+ const ImmutableCFOptions& ioptions,
+ std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+ int_tbl_prop_collector_factories);
+
+class ColumnFamilySet;
+
+// This class keeps all the data that a column family needs.
+// Most methods require DB mutex held, unless otherwise noted
+class ColumnFamilyData {
+ public:
+ ~ColumnFamilyData();
+
+ // thread-safe
+ uint32_t GetID() const { return id_; }
+ // thread-safe
+ const std::string& GetName() const { return name_; }
+
+ // Ref() can only be called from a context where the caller can guarantee
+ // that ColumnFamilyData is alive (while holding a non-zero ref already,
+ // holding a DB mutex, or as the leader in a write batch group).
+ void Ref() { refs_.fetch_add(1); }
+
+ // Unref decreases the reference count, but does not handle deletion
+ // when the count goes to 0. If this method returns true then the
+ // caller should delete the instance immediately, or later, by calling
+ // FreeDeadColumnFamilies(). Unref() can only be called while holding
+ // a DB mutex, or during single-threaded recovery.
+ bool Unref() {
+ int old_refs = refs_.fetch_sub(1);
+ assert(old_refs > 0);
+ return old_refs == 1;
+ }
+
+  // UnrefAndTryDelete() decreases the reference count and frees the object if
+  // needed. It returns true if the object was freed, false otherwise.
+  // UnrefAndTryDelete() can only be called while holding a DB mutex, or
+  // during single-threaded recovery.
+ bool UnrefAndTryDelete();
+
+  // SetDropped() can only be called under the following conditions:
+ // 1) Holding a DB mutex,
+ // 2) from single-threaded write thread, AND
+ // 3) from single-threaded VersionSet::LogAndApply()
+ // After dropping column family no other operation on that column family
+ // will be executed. All the files and memory will be, however, kept around
+ // until client drops the column family handle. That way, client can still
+ // access data from dropped column family.
+  // A column family can be dropped and still be alive. In that state:
+  // *) Compaction and flush are not executed on the dropped column family.
+  // *) Clients can continue reading from the column family. Writes will fail
+  //    unless WriteOptions::ignore_missing_column_families is true.
+ // When the dropped column family is unreferenced, then we:
+ // *) Remove column family from the linked list maintained by ColumnFamilySet
+ // *) delete all memory associated with that column family
+ // *) delete all the files associated with that column family
+ void SetDropped();
+ bool IsDropped() const { return dropped_.load(std::memory_order_relaxed); }
+
+ // thread-safe
+ int NumberLevels() const { return ioptions_.num_levels; }
+
+ void SetLogNumber(uint64_t log_number) { log_number_ = log_number; }
+ uint64_t GetLogNumber() const { return log_number_; }
+
+ void SetFlushReason(FlushReason flush_reason) {
+ flush_reason_ = flush_reason;
+ }
+ FlushReason GetFlushReason() const { return flush_reason_; }
+ // thread-safe
+ const FileOptions* soptions() const;
+ const ImmutableCFOptions* ioptions() const { return &ioptions_; }
+ // REQUIRES: DB mutex held
+  // This returns the MutableCFOptions used by the current SuperVersion.
+  // You should use this API to reference MutableCFOptions most of the time.
+ const MutableCFOptions* GetCurrentMutableCFOptions() const {
+ return &(super_version_->mutable_cf_options);
+ }
+ // REQUIRES: DB mutex held
+  // This returns the latest MutableCFOptions, which may not be in effect yet.
+ const MutableCFOptions* GetLatestMutableCFOptions() const {
+ return &mutable_cf_options_;
+ }
+
+ // REQUIRES: DB mutex held
+  // Build ColumnFamilyOptions with immutable options and the latest mutable
+ // options.
+ ColumnFamilyOptions GetLatestCFOptions() const;
+
+ bool is_delete_range_supported() { return is_delete_range_supported_; }
+
+ // Validate CF options against DB options
+ static Status ValidateOptions(const DBOptions& db_options,
+ const ColumnFamilyOptions& cf_options);
+#ifndef ROCKSDB_LITE
+ // REQUIRES: DB mutex held
+ Status SetOptions(
+ const DBOptions& db_options,
+ const std::unordered_map<std::string, std::string>& options_map);
+#endif // ROCKSDB_LITE
+
+ InternalStats* internal_stats() { return internal_stats_.get(); }
+
+ MemTableList* imm() { return &imm_; }
+ MemTable* mem() { return mem_; }
+ Version* current() { return current_; }
+ Version* dummy_versions() { return dummy_versions_; }
+ void SetCurrent(Version* _current);
+  uint64_t GetNumLiveVersions() const;    // REQUIRES: DB mutex held
+  uint64_t GetTotalSstFilesSize() const;  // REQUIRES: DB mutex held
+  uint64_t GetLiveSstFilesSize() const;   // REQUIRES: DB mutex held
+ void SetMemtable(MemTable* new_mem) {
+ uint64_t memtable_id = last_memtable_id_.fetch_add(1) + 1;
+ new_mem->SetID(memtable_id);
+ mem_ = new_mem;
+ }
+
+ // calculate the oldest log needed for the durability of this column family
+ uint64_t OldestLogToKeep();
+
+ // See Memtable constructor for explanation of earliest_seq param.
+ MemTable* ConstructNewMemtable(const MutableCFOptions& mutable_cf_options,
+ SequenceNumber earliest_seq);
+ void CreateNewMemtable(const MutableCFOptions& mutable_cf_options,
+ SequenceNumber earliest_seq);
+
+ TableCache* table_cache() const { return table_cache_.get(); }
+
+ // See documentation in compaction_picker.h
+ // REQUIRES: DB mutex held
+ bool NeedsCompaction() const;
+ // REQUIRES: DB mutex held
+ Compaction* PickCompaction(const MutableCFOptions& mutable_options,
+ LogBuffer* log_buffer);
+
+ // Check if the passed range overlap with any running compactions.
+ // REQUIRES: DB mutex held
+ bool RangeOverlapWithCompaction(const Slice& smallest_user_key,
+ const Slice& largest_user_key,
+ int level) const;
+
+ // Check if the passed ranges overlap with any unflushed memtables
+ // (immutable or mutable).
+ //
+ // @param super_version A referenced SuperVersion that will be held for the
+ // duration of this function.
+ //
+ // Thread-safe
+ Status RangesOverlapWithMemtables(const autovector<Range>& ranges,
+ SuperVersion* super_version, bool* overlap);
+
+  // A flag to indicate that a manual compaction should compact all levels
+  // together instead of a specific level.
+  static const int kCompactAllLevels;
+  // A flag to indicate that a manual compaction's output is the base level.
+ static const int kCompactToBaseLevel;
+ // REQUIRES: DB mutex held
+ Compaction* CompactRange(const MutableCFOptions& mutable_cf_options,
+ int input_level, int output_level,
+ const CompactRangeOptions& compact_range_options,
+ const InternalKey* begin, const InternalKey* end,
+ InternalKey** compaction_end, bool* manual_conflict,
+ uint64_t max_file_num_to_ignore);
+
+ CompactionPicker* compaction_picker() { return compaction_picker_.get(); }
+ // thread-safe
+ const Comparator* user_comparator() const {
+ return internal_comparator_.user_comparator();
+ }
+ // thread-safe
+ const InternalKeyComparator& internal_comparator() const {
+ return internal_comparator_;
+ }
+
+ const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+ int_tbl_prop_collector_factories() const {
+ return &int_tbl_prop_collector_factories_;
+ }
+
+ SuperVersion* GetSuperVersion() { return super_version_; }
+ // thread-safe
+  // Return an already-referenced SuperVersion to be used safely.
+ SuperVersion* GetReferencedSuperVersion(DBImpl* db);
+ // thread-safe
+ // Get SuperVersion stored in thread local storage. If it does not exist,
+ // get a reference from a current SuperVersion.
+ SuperVersion* GetThreadLocalSuperVersion(DBImpl* db);
+  // Try to return SuperVersion back to thread local storage. Return true on
+ // success and false on failure. It fails when the thread local storage
+ // contains anything other than SuperVersion::kSVInUse flag.
+ bool ReturnThreadLocalSuperVersion(SuperVersion* sv);
+ // thread-safe
+ uint64_t GetSuperVersionNumber() const {
+ return super_version_number_.load();
+ }
+  // Installs the SuperVersion pre-allocated in *sv_context. If the previous
+  // SuperVersion's reference count drops to zero, it is cleaned up and pushed
+  // to sv_context->superversions_to_free so it can be deleted outside of the
+  // mutex. Taking the new SuperVersion through sv_context lets clients
+  // allocate it outside of the mutex.
+ // IMPORTANT: Only call this from DBImpl::InstallSuperVersion()
+ void InstallSuperVersion(SuperVersionContext* sv_context,
+ InstrumentedMutex* db_mutex,
+ const MutableCFOptions& mutable_cf_options);
+ void InstallSuperVersion(SuperVersionContext* sv_context,
+ InstrumentedMutex* db_mutex);
+
+ void ResetThreadLocalSuperVersions();
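+
+  // A minimal caller-side sketch of the install flow (illustrative only; the
+  // SuperVersionContext constructor argument and Clean() shown here are
+  // assumptions, not declared in this header):
+  //
+  //   SuperVersionContext sv_context(/*create_superversion=*/true);
+  //   db_mutex->Lock();
+  //   cfd->InstallSuperVersion(&sv_context, db_mutex);
+  //   db_mutex->Unlock();
+  //   sv_context.Clean();  // frees obsolete SuperVersions outside the mutex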
+
+ // Protected by DB mutex
+ void set_queued_for_flush(bool value) { queued_for_flush_ = value; }
+ void set_queued_for_compaction(bool value) { queued_for_compaction_ = value; }
+ bool queued_for_flush() { return queued_for_flush_; }
+ bool queued_for_compaction() { return queued_for_compaction_; }
+
+ enum class WriteStallCause {
+ kNone,
+ kMemtableLimit,
+ kL0FileCountLimit,
+ kPendingCompactionBytes,
+ };
+ static std::pair<WriteStallCondition, WriteStallCause>
+ GetWriteStallConditionAndCause(int num_unflushed_memtables, int num_l0_files,
+ uint64_t num_compaction_needed_bytes,
+ const MutableCFOptions& mutable_cf_options);
+
+  // Recalculate some write stall conditions, which change only during
+  // compaction, when adding a new memtable, and/or when the compaction score
+  // is recalculated. These values are used by DBImpl::MakeRoomForWrite to
+  // decide whether it needs to stall writes.
+ WriteStallCondition RecalculateWriteStallConditions(
+ const MutableCFOptions& mutable_cf_options);
+
+ void set_initialized() { initialized_.store(true); }
+
+ bool initialized() const { return initialized_.load(); }
+
+ const ColumnFamilyOptions& initial_cf_options() {
+ return initial_cf_options_;
+ }
+
+ Env::WriteLifeTimeHint CalculateSSTWriteHint(int level);
+
+  // created_dirs remembers the directories already created, so that we don't
+  // need to repeat the same directory-creation operation.
+ Status AddDirectories(
+ std::map<std::string, std::shared_ptr<Directory>>* created_dirs);
+
+ Directory* GetDataDir(size_t path_id) const;
+
+ ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); }
+
+ private:
+ friend class ColumnFamilySet;
+ ColumnFamilyData(uint32_t id, const std::string& name,
+ Version* dummy_versions, Cache* table_cache,
+ WriteBufferManager* write_buffer_manager,
+ const ColumnFamilyOptions& options,
+ const ImmutableDBOptions& db_options,
+ const FileOptions& file_options,
+ ColumnFamilySet* column_family_set,
+ BlockCacheTracer* const block_cache_tracer);
+
+ uint32_t id_;
+ const std::string name_;
+ Version* dummy_versions_; // Head of circular doubly-linked list of versions.
+ Version* current_; // == dummy_versions->prev_
+
+ std::atomic<int> refs_; // outstanding references to ColumnFamilyData
+ std::atomic<bool> initialized_;
+ std::atomic<bool> dropped_; // true if client dropped it
+
+ const InternalKeyComparator internal_comparator_;
+ std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
+ int_tbl_prop_collector_factories_;
+
+ const ColumnFamilyOptions initial_cf_options_;
+ const ImmutableCFOptions ioptions_;
+ MutableCFOptions mutable_cf_options_;
+
+ const bool is_delete_range_supported_;
+
+ std::unique_ptr<TableCache> table_cache_;
+
+ std::unique_ptr<InternalStats> internal_stats_;
+
+ WriteBufferManager* write_buffer_manager_;
+
+ MemTable* mem_;
+ MemTableList imm_;
+ SuperVersion* super_version_;
+
+ // An ordinal representing the current SuperVersion. Updated by
+ // InstallSuperVersion(), i.e. incremented every time super_version_
+ // changes.
+ std::atomic<uint64_t> super_version_number_;
+
+ // Thread's local copy of SuperVersion pointer
+ // This needs to be destructed before mutex_
+ std::unique_ptr<ThreadLocalPtr> local_sv_;
+
+ // pointers for a circular linked list. we use it to support iterations over
+ // all column families that are alive (note: dropped column families can also
+ // be alive as long as client holds a reference)
+ ColumnFamilyData* next_;
+ ColumnFamilyData* prev_;
+
+ // This is the earliest log file number that contains data from this
+ // Column Family. All earlier log files must be ignored and not
+ // recovered from
+ uint64_t log_number_;
+
+ std::atomic<FlushReason> flush_reason_;
+
+ // An object that keeps all the compaction stats
+ // and picks the next compaction
+ std::unique_ptr<CompactionPicker> compaction_picker_;
+
+ ColumnFamilySet* column_family_set_;
+
+ std::unique_ptr<WriteControllerToken> write_controller_token_;
+
+ // If true --> this ColumnFamily is currently present in DBImpl::flush_queue_
+ bool queued_for_flush_;
+
+ // If true --> this ColumnFamily is currently present in
+ // DBImpl::compaction_queue_
+ bool queued_for_compaction_;
+
+ uint64_t prev_compaction_needed_bytes_;
+
+ // if the database was opened with 2pc enabled
+ bool allow_2pc_;
+
+ // Memtable id to track flush.
+ std::atomic<uint64_t> last_memtable_id_;
+
+ // Directories corresponding to cf_paths.
+ std::vector<std::shared_ptr<Directory>> data_dirs_;
+};
+
+// ColumnFamilySet has interesting thread-safety requirements
+// * CreateColumnFamily() or RemoveColumnFamily() -- need to be protected by DB
+// mutex AND executed in the write thread.
+// CreateColumnFamily() should ONLY be called from VersionSet::LogAndApply() AND
+// single-threaded write thread. It is also called during Recovery and in
+// DumpManifest().
+// RemoveColumnFamily() is only called from SetDropped(). DB mutex needs to be
+// held and it needs to be executed from the write thread. SetDropped() also
+// guarantees that it will be called only from single-threaded LogAndApply(),
+// but this condition is not that important.
+// * Iteration -- hold DB mutex. You may release it in the body of the
+// iteration, but if you do, reference the column family before releasing the
+// mutex and unreference it after you reacquire the mutex, since the column
+// family might get dropped while the DB mutex is released
+// * GetDefault() -- thread safe
+// * GetColumnFamily() -- either inside of DB mutex or from a write thread
+// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
+// NumberOfColumnFamilies -- inside of DB mutex
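+//
+// A minimal sketch of the "reference before releasing the mutex" rule above
+// (illustrative only):
+//
+//   // DB mutex held; cfd obtained from the set or from iteration
+//   cfd->Ref();                // pin it before releasing the mutex
+//   mutex->Unlock();
+//   // ... work on cfd without the DB mutex ...
+//   mutex->Lock();
+//   cfd->UnrefAndTryDelete();  // may free cfd if it was dropped meanwhile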
+class ColumnFamilySet {
+ public:
+ // ColumnFamilySet supports iteration
+ class iterator {
+ public:
+ explicit iterator(ColumnFamilyData* cfd)
+ : current_(cfd) {}
+ iterator& operator++() {
+ // dropped column families might still be included in this iteration
+ // (we're only removing them when client drops the last reference to the
+ // column family).
+ // dummy is never dead, so this will never be infinite
+ do {
+ current_ = current_->next_;
+ } while (current_->refs_.load(std::memory_order_relaxed) == 0);
+ return *this;
+ }
+ bool operator!=(const iterator& other) {
+ return this->current_ != other.current_;
+ }
+ ColumnFamilyData* operator*() { return current_; }
+
+ private:
+ ColumnFamilyData* current_;
+ };
+
+ ColumnFamilySet(const std::string& dbname,
+ const ImmutableDBOptions* db_options,
+ const FileOptions& file_options, Cache* table_cache,
+ WriteBufferManager* write_buffer_manager,
+ WriteController* write_controller,
+ BlockCacheTracer* const block_cache_tracer);
+ ~ColumnFamilySet();
+
+ ColumnFamilyData* GetDefault() const;
+ // GetColumnFamily() calls return nullptr if column family is not found
+ ColumnFamilyData* GetColumnFamily(uint32_t id) const;
+ ColumnFamilyData* GetColumnFamily(const std::string& name) const;
+ // this call will return the next available column family ID. it guarantees
+ // that there is no column family with id greater than or equal to the
+ // returned value in the current running instance or anytime in RocksDB
+ // instance history.
+ uint32_t GetNextColumnFamilyID();
+ uint32_t GetMaxColumnFamily();
+ void UpdateMaxColumnFamily(uint32_t new_max_column_family);
+ size_t NumberOfColumnFamilies() const;
+
+ ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id,
+ Version* dummy_version,
+ const ColumnFamilyOptions& options);
+
+ iterator begin() { return iterator(dummy_cfd_->next_); }
+ iterator end() { return iterator(dummy_cfd_); }
+
+ // REQUIRES: DB mutex held
+ // Don't call while iterating over ColumnFamilySet
+ void FreeDeadColumnFamilies();
+
+ Cache* get_table_cache() { return table_cache_; }
+
+ private:
+ friend class ColumnFamilyData;
+ // helper function that gets called from cfd destructor
+ // REQUIRES: DB mutex held
+ void RemoveColumnFamily(ColumnFamilyData* cfd);
+
+ // column_families_ and column_family_data_ need to be protected:
+ // * when mutating both conditions have to be satisfied:
+ // 1. DB mutex locked
+ // 2. thread currently in single-threaded write thread
+ // * when reading, at least one condition needs to be satisfied:
+ // 1. DB mutex locked
+ // 2. accessed from a single-threaded write thread
+ std::unordered_map<std::string, uint32_t> column_families_;
+ std::unordered_map<uint32_t, ColumnFamilyData*> column_family_data_;
+
+ uint32_t max_column_family_;
+ ColumnFamilyData* dummy_cfd_;
+  // We don't hold the refcount here, since the default column family always
+  // exists. We are also not responsible for cleaning up default_cfd_cache_.
+  // This is just a cache that makes the common case (accessing the default
+  // column family) faster.
+ ColumnFamilyData* default_cfd_cache_;
+
+ const std::string db_name_;
+ const ImmutableDBOptions* const db_options_;
+ const FileOptions file_options_;
+ Cache* table_cache_;
+ WriteBufferManager* write_buffer_manager_;
+ WriteController* write_controller_;
+ BlockCacheTracer* const block_cache_tracer_;
+};
+
+// We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
+// memtables of different column families (specified by ID in the write batch)
+class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
+ public:
+ explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set)
+ : column_family_set_(column_family_set), current_(nullptr) {}
+
+ // Constructs a ColumnFamilyMemTablesImpl equivalent to one constructed
+ // with the arguments used to construct *orig.
+ explicit ColumnFamilyMemTablesImpl(ColumnFamilyMemTablesImpl* orig)
+ : column_family_set_(orig->column_family_set_), current_(nullptr) {}
+
+ // sets current_ to ColumnFamilyData with column_family_id
+ // returns false if column family doesn't exist
+  // REQUIRES: calls to this function on DBImpl::column_family_memtables_
+  // must be made under a DB mutex OR from a write thread
+ bool Seek(uint32_t column_family_id) override;
+
+ // Returns log number of the selected column family
+ // REQUIRES: under a DB mutex OR from a write thread
+ uint64_t GetLogNumber() const override;
+
+ // REQUIRES: Seek() called first
+  // REQUIRES: calls to this function on DBImpl::column_family_memtables_
+  // must be made under a DB mutex OR from a write thread
+ virtual MemTable* GetMemTable() const override;
+
+ // Returns column family handle for the selected column family
+  // REQUIRES: calls to this function on DBImpl::column_family_memtables_
+  // must be made under a DB mutex OR from a write thread
+ virtual ColumnFamilyHandle* GetColumnFamilyHandle() override;
+
+ // Cannot be called while another thread is calling Seek().
+  // REQUIRES: calls to this function on DBImpl::column_family_memtables_
+  // must be made under a DB mutex OR from a write thread
+ virtual ColumnFamilyData* current() override { return current_; }
+
+ private:
+ ColumnFamilySet* column_family_set_;
+ ColumnFamilyData* current_;
+ ColumnFamilyHandleInternal handle_;
+};
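+
+// A minimal usage sketch (illustrative only): roughly how a write batch
+// insertion routes a record to the right memtable by column family ID.
+//
+//   ColumnFamilyMemTablesImpl cf_mems(column_family_set);
+//   if (cf_mems.Seek(column_family_id)) {
+//     MemTable* mem = cf_mems.GetMemTable();
+//     // ... insert the record into mem; cf_mems.GetLogNumber() tells which
+//     // log file the column family requires for recovery ...
+//   } else {
+//     // unknown column family ID in the batch
+//   }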
+
+extern uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family);
+
+extern const Comparator* GetColumnFamilyUserComparator(
+ ColumnFamilyHandle* column_family);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/column_family_test.cc b/src/rocksdb/db/column_family_test.cc
new file mode 100644
index 000000000..24ff4e08b
--- /dev/null
+++ b/src/rocksdb/db/column_family_test.cc
@@ -0,0 +1,3387 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include <vector>
+#include <string>
+#include <thread>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "memtable/hash_skiplist_rep.h"
+#include "options/options_parser.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "test_util/fault_injection_test_env.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static const int kValueSize = 1000;
+
+namespace {
+std::string RandomString(Random* rnd, int len) {
+ std::string r;
+ test::RandomString(rnd, len, &r);
+ return r;
+}
+} // anonymous namespace
+
+// counts how many operations were performed
+class EnvCounter : public EnvWrapper {
+ public:
+ explicit EnvCounter(Env* base)
+ : EnvWrapper(base), num_new_writable_file_(0) {}
+ int GetNumberOfNewWritableFileCalls() {
+ return num_new_writable_file_;
+ }
+ Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r,
+ const EnvOptions& soptions) override {
+ ++num_new_writable_file_;
+ return EnvWrapper::NewWritableFile(f, r, soptions);
+ }
+
+ private:
+ std::atomic<int> num_new_writable_file_;
+};
+
+class ColumnFamilyTestBase : public testing::Test {
+ public:
+ explicit ColumnFamilyTestBase(uint32_t format) : rnd_(139), format_(format) {
+ Env* base_env = Env::Default();
+#ifndef ROCKSDB_LITE
+ const char* test_env_uri = getenv("TEST_ENV_URI");
+ if (test_env_uri) {
+ Env* test_env = nullptr;
+ Status s = Env::LoadEnv(test_env_uri, &test_env, &env_guard_);
+ base_env = test_env;
+ EXPECT_OK(s);
+ EXPECT_NE(Env::Default(), base_env);
+ }
+#endif // !ROCKSDB_LITE
+ EXPECT_NE(nullptr, base_env);
+ env_ = new EnvCounter(base_env);
+ dbname_ = test::PerThreadDBPath("column_family_test");
+ db_options_.create_if_missing = true;
+ db_options_.fail_if_options_file_error = true;
+ db_options_.env = env_;
+ DestroyDB(dbname_, Options(db_options_, column_family_options_));
+ }
+
+ ~ColumnFamilyTestBase() override {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (auto h : handles_) {
+ ColumnFamilyDescriptor cfdescriptor;
+ h->GetDescriptor(&cfdescriptor);
+ column_families.push_back(cfdescriptor);
+ }
+ Close();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ Destroy(column_families);
+ delete env_;
+ }
+
+ BlockBasedTableOptions GetBlockBasedTableOptions() {
+ BlockBasedTableOptions options;
+ options.format_version = format_;
+ return options;
+ }
+
+ // Return the value to associate with the specified key
+ Slice Value(int k, std::string* storage) {
+ if (k == 0) {
+ // Ugh. Random seed of 0 used to produce no entropy. This code
+ // preserves the implementation that was in place when all of the
+ // magic values in this file were picked.
+ *storage = std::string(kValueSize, ' ');
+ return Slice(*storage);
+ } else {
+ Random r(k);
+ return test::RandomString(&r, kValueSize, storage);
+ }
+ }
+
+ void Build(int base, int n, int flush_every = 0) {
+ std::string key_space, value_space;
+ WriteBatch batch;
+
+ for (int i = 0; i < n; i++) {
+ if (flush_every != 0 && i != 0 && i % flush_every == 0) {
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_FlushMemTable();
+ }
+
+ int keyi = base + i;
+ Slice key(DBTestBase::Key(keyi));
+
+ batch.Clear();
+ batch.Put(handles_[0], key, Value(keyi, &value_space));
+ batch.Put(handles_[1], key, Value(keyi, &value_space));
+ batch.Put(handles_[2], key, Value(keyi, &value_space));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+ }
+ }
+
+ void CheckMissed() {
+ uint64_t next_expected = 0;
+ uint64_t missed = 0;
+ int bad_keys = 0;
+ int bad_values = 0;
+ int correct = 0;
+ std::string value_space;
+ for (int cf = 0; cf < 3; cf++) {
+ next_expected = 0;
+ Iterator* iter = db_->NewIterator(ReadOptions(false, true), handles_[cf]);
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ uint64_t key;
+ Slice in(iter->key());
+ in.remove_prefix(3);
+ if (!ConsumeDecimalNumber(&in, &key) || !in.empty() ||
+ key < next_expected) {
+ bad_keys++;
+ continue;
+ }
+ missed += (key - next_expected);
+ next_expected = key + 1;
+ if (iter->value() != Value(static_cast<int>(key), &value_space)) {
+ bad_values++;
+ } else {
+ correct++;
+ }
+ }
+ delete iter;
+ }
+
+ ASSERT_EQ(0, bad_keys);
+ ASSERT_EQ(0, bad_values);
+ ASSERT_EQ(0, missed);
+ (void)correct;
+ }
+
+ void Close() {
+ for (auto h : handles_) {
+ if (h) {
+ db_->DestroyColumnFamilyHandle(h);
+ }
+ }
+ handles_.clear();
+ names_.clear();
+ delete db_;
+ db_ = nullptr;
+ }
+
+ Status TryOpen(std::vector<std::string> cf,
+ std::vector<ColumnFamilyOptions> options = {}) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ names_.clear();
+ for (size_t i = 0; i < cf.size(); ++i) {
+ column_families.push_back(ColumnFamilyDescriptor(
+ cf[i], options.size() == 0 ? column_family_options_ : options[i]));
+ names_.push_back(cf[i]);
+ }
+ return DB::Open(db_options_, dbname_, column_families, &handles_, &db_);
+ }
+
+ Status OpenReadOnly(std::vector<std::string> cf,
+ std::vector<ColumnFamilyOptions> options = {}) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ names_.clear();
+ for (size_t i = 0; i < cf.size(); ++i) {
+ column_families.push_back(ColumnFamilyDescriptor(
+ cf[i], options.size() == 0 ? column_family_options_ : options[i]));
+ names_.push_back(cf[i]);
+ }
+ return DB::OpenForReadOnly(db_options_, dbname_, column_families, &handles_,
+ &db_);
+ }
+
+#ifndef ROCKSDB_LITE // ReadOnlyDB is not supported
+ void AssertOpenReadOnly(std::vector<std::string> cf,
+ std::vector<ColumnFamilyOptions> options = {}) {
+ ASSERT_OK(OpenReadOnly(cf, options));
+ }
+#endif // !ROCKSDB_LITE
+
+
+ void Open(std::vector<std::string> cf,
+ std::vector<ColumnFamilyOptions> options = {}) {
+ ASSERT_OK(TryOpen(cf, options));
+ }
+
+ void Open() {
+ Open({"default"});
+ }
+
+ DBImpl* dbfull() { return reinterpret_cast<DBImpl*>(db_); }
+
+ int GetProperty(int cf, std::string property) {
+ std::string value;
+ EXPECT_TRUE(dbfull()->GetProperty(handles_[cf], property, &value));
+#ifndef CYGWIN
+ return std::stoi(value);
+#else
+ return std::strtol(value.c_str(), 0 /* off */, 10 /* base */);
+#endif
+ }
+
+ bool IsDbWriteStopped() {
+#ifndef ROCKSDB_LITE
+ uint64_t v;
+ EXPECT_TRUE(dbfull()->GetIntProperty("rocksdb.is-write-stopped", &v));
+ return (v == 1);
+#else
+ return dbfull()->TEST_write_controler().IsStopped();
+#endif // !ROCKSDB_LITE
+ }
+
+ uint64_t GetDbDelayedWriteRate() {
+#ifndef ROCKSDB_LITE
+ uint64_t v;
+ EXPECT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.actual-delayed-write-rate", &v));
+ return v;
+#else
+ if (!dbfull()->TEST_write_controler().NeedsDelay()) {
+ return 0;
+ }
+ return dbfull()->TEST_write_controler().delayed_write_rate();
+#endif // !ROCKSDB_LITE
+ }
+
+ void Destroy(const std::vector<ColumnFamilyDescriptor>& column_families =
+ std::vector<ColumnFamilyDescriptor>()) {
+ Close();
+ ASSERT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_),
+ column_families));
+ }
+
+ void CreateColumnFamilies(
+ const std::vector<std::string>& cfs,
+ const std::vector<ColumnFamilyOptions> options = {}) {
+ int cfi = static_cast<int>(handles_.size());
+ handles_.resize(cfi + cfs.size());
+ names_.resize(cfi + cfs.size());
+ for (size_t i = 0; i < cfs.size(); ++i) {
+ const auto& current_cf_opt =
+ options.size() == 0 ? column_family_options_ : options[i];
+ ASSERT_OK(
+ db_->CreateColumnFamily(current_cf_opt, cfs[i], &handles_[cfi]));
+ names_[cfi] = cfs[i];
+
+#ifndef ROCKSDB_LITE // RocksDBLite does not support GetDescriptor
+ // Verify the CF options of the returned CF handle.
+ ColumnFamilyDescriptor desc;
+ ASSERT_OK(handles_[cfi]->GetDescriptor(&desc));
+ RocksDBOptionsParser::VerifyCFOptions(desc.options, current_cf_opt);
+#endif // !ROCKSDB_LITE
+ cfi++;
+ }
+ }
+
+ void Reopen(const std::vector<ColumnFamilyOptions> options = {}) {
+ std::vector<std::string> names;
+ for (auto name : names_) {
+ if (name != "") {
+ names.push_back(name);
+ }
+ }
+ Close();
+ assert(options.size() == 0 || names.size() == options.size());
+ Open(names, options);
+ }
+
+ void CreateColumnFamiliesAndReopen(const std::vector<std::string>& cfs) {
+ CreateColumnFamilies(cfs);
+ Reopen();
+ }
+
+ void DropColumnFamilies(const std::vector<int>& cfs) {
+ for (auto cf : cfs) {
+ ASSERT_OK(db_->DropColumnFamily(handles_[cf]));
+ db_->DestroyColumnFamilyHandle(handles_[cf]);
+ handles_[cf] = nullptr;
+ names_[cf] = "";
+ }
+ }
+
+ void PutRandomData(int cf, int num, int key_value_size, bool save = false) {
+ if (cf >= static_cast<int>(keys_.size())) {
+ keys_.resize(cf + 1);
+ }
+ for (int i = 0; i < num; ++i) {
+ // 10 bytes for key, rest is value
+ if (!save) {
+ ASSERT_OK(Put(cf, test::RandomKey(&rnd_, 11),
+ RandomString(&rnd_, key_value_size - 10)));
+ } else {
+ std::string key = test::RandomKey(&rnd_, 11);
+ keys_[cf].insert(key);
+ ASSERT_OK(Put(cf, key, RandomString(&rnd_, key_value_size - 10)));
+ }
+ }
+ db_->FlushWAL(false);
+ }
+
+#ifndef ROCKSDB_LITE // TEST functions in DB are not supported in lite
+ void WaitForFlush(int cf) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
+ }
+
+ void WaitForCompaction() {
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+
+ uint64_t MaxTotalInMemoryState() {
+ return dbfull()->TEST_MaxTotalInMemoryState();
+ }
+
+ void AssertMaxTotalInMemoryState(uint64_t value) {
+ ASSERT_EQ(value, MaxTotalInMemoryState());
+ }
+#endif // !ROCKSDB_LITE
+
+ Status Put(int cf, const std::string& key, const std::string& value) {
+ return db_->Put(WriteOptions(), handles_[cf], Slice(key), Slice(value));
+ }
+ Status Merge(int cf, const std::string& key, const std::string& value) {
+ return db_->Merge(WriteOptions(), handles_[cf], Slice(key), Slice(value));
+ }
+ Status Flush(int cf) {
+ return db_->Flush(FlushOptions(), handles_[cf]);
+ }
+
+ std::string Get(int cf, const std::string& key) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ std::string result;
+ Status s = db_->Get(options, handles_[cf], Slice(key), &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+
+ void CompactAll(int cf) {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[cf], nullptr,
+ nullptr));
+ }
+
+ void Compact(int cf, const Slice& start, const Slice& limit) {
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), handles_[cf], &start, &limit));
+ }
+
+ int NumTableFilesAtLevel(int level, int cf) {
+ return GetProperty(cf,
+ "rocksdb.num-files-at-level" + ToString(level));
+ }
+
+#ifndef ROCKSDB_LITE
+ // Return spread of files per level
+ std::string FilesPerLevel(int cf) {
+ std::string result;
+ int last_non_zero_offset = 0;
+ for (int level = 0; level < dbfull()->NumberLevels(handles_[cf]); level++) {
+ int f = NumTableFilesAtLevel(level, cf);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+ result += buf;
+ if (f > 0) {
+ last_non_zero_offset = static_cast<int>(result.size());
+ }
+ }
+ result.resize(last_non_zero_offset);
+ return result;
+ }
+#endif
+
+ void AssertFilesPerLevel(const std::string& value, int cf) {
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ(value, FilesPerLevel(cf));
+#else
+ (void) value;
+ (void) cf;
+#endif
+ }
+
+#ifndef ROCKSDB_LITE // GetLiveFilesMetaData is not supported
+ int CountLiveFiles() {
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ return static_cast<int>(metadata.size());
+ }
+#endif // !ROCKSDB_LITE
+
+ void AssertCountLiveFiles(int expected_value) {
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ(expected_value, CountLiveFiles());
+#else
+ (void) expected_value;
+#endif
+ }
+
+ // Do n memtable flushes, each of which produces an sstable
+ // covering the range [small,large].
+ void MakeTables(int cf, int n, const std::string& small,
+ const std::string& large) {
+ for (int i = 0; i < n; i++) {
+ ASSERT_OK(Put(cf, small, "begin"));
+ ASSERT_OK(Put(cf, large, "end"));
+ ASSERT_OK(db_->Flush(FlushOptions(), handles_[cf]));
+ }
+ }
+
+#ifndef ROCKSDB_LITE // GetSortedWalFiles is not supported
+ int CountLiveLogFiles() {
+ int micros_wait_for_log_deletion = 20000;
+ env_->SleepForMicroseconds(micros_wait_for_log_deletion);
+ int ret = 0;
+ VectorLogPtr wal_files;
+ Status s;
+    // GetSortedWalFiles is a flaky function -- it gets all the wal_dir
+    // children files and then later checks for their existence. If some of the
+    // log files don't exist anymore, it reports an error. It does all of this
+    // without the DB mutex held, so if a background process deletes a log file
+    // while the function is being executed, it returns an error. We retry the
+    // function 10 times to avoid the error failing the test.
+ for (int retries = 0; retries < 10; ++retries) {
+ wal_files.clear();
+ s = db_->GetSortedWalFiles(wal_files);
+ if (s.ok()) {
+ break;
+ }
+ }
+ EXPECT_OK(s);
+ for (const auto& wal : wal_files) {
+ if (wal->Type() == kAliveLogFile) {
+ ++ret;
+ }
+ }
+    return ret;
+ }
+#endif // !ROCKSDB_LITE
+
+ void AssertCountLiveLogFiles(int value) {
+#ifndef ROCKSDB_LITE // GetSortedWalFiles is not supported
+ ASSERT_EQ(value, CountLiveLogFiles());
+#else
+ (void) value;
+#endif // !ROCKSDB_LITE
+ }
+
+ void AssertNumberOfImmutableMemtables(std::vector<int> num_per_cf) {
+ assert(num_per_cf.size() == handles_.size());
+
+#ifndef ROCKSDB_LITE // GetProperty is not supported in lite
+ for (size_t i = 0; i < num_per_cf.size(); ++i) {
+ ASSERT_EQ(num_per_cf[i], GetProperty(static_cast<int>(i),
+ "rocksdb.num-immutable-mem-table"));
+ }
+#endif // !ROCKSDB_LITE
+ }
+
+ void CopyFile(const std::string& source, const std::string& destination,
+ uint64_t size = 0) {
+ const EnvOptions soptions;
+ std::unique_ptr<SequentialFile> srcfile;
+ ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions));
+ std::unique_ptr<WritableFile> destfile;
+ ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions));
+
+ if (size == 0) {
+ // default argument means copy everything
+ ASSERT_OK(env_->GetFileSize(source, &size));
+ }
+
+ char buffer[4096];
+ Slice slice;
+ while (size > 0) {
+ uint64_t one = std::min(uint64_t(sizeof(buffer)), size);
+ ASSERT_OK(srcfile->Read(one, &slice, buffer));
+ ASSERT_OK(destfile->Append(slice));
+ size -= slice.size();
+ }
+ ASSERT_OK(destfile->Close());
+ }
+
+ int GetSstFileCount(std::string path) {
+ std::vector<std::string> files;
+ DBTestBase::GetSstFiles(env_, path, &files);
+ return static_cast<int>(files.size());
+ }
+
+ void RecalculateWriteStallConditions(ColumnFamilyData* cfd,
+ const MutableCFOptions& mutable_cf_options) {
+ // add lock to avoid race condition between
+ // `RecalculateWriteStallConditions` which writes to CFStats and
+ // background `DBImpl::DumpStats()` threads which read CFStats
+ dbfull()->TEST_LockMutex();
+ cfd->RecalculateWriteStallConditions(mutable_cf_options);
+    dbfull()->TEST_UnlockMutex();
+ }
+
+ std::vector<ColumnFamilyHandle*> handles_;
+ std::vector<std::string> names_;
+ std::vector<std::set<std::string>> keys_;
+ ColumnFamilyOptions column_family_options_;
+ DBOptions db_options_;
+ std::string dbname_;
+ DB* db_ = nullptr;
+ EnvCounter* env_;
+ std::shared_ptr<Env> env_guard_;
+ Random rnd_;
+ uint32_t format_;
+};
+
+class ColumnFamilyTest
+ : public ColumnFamilyTestBase,
+ virtual public ::testing::WithParamInterface<uint32_t> {
+ public:
+ ColumnFamilyTest() : ColumnFamilyTestBase(GetParam()) {}
+};
+
+INSTANTIATE_TEST_CASE_P(FormatDef, ColumnFamilyTest,
+ testing::Values(test::kDefaultFormatVersion));
+INSTANTIATE_TEST_CASE_P(FormatLatest, ColumnFamilyTest,
+ testing::Values(test::kLatestFormatVersion));
+
+TEST_P(ColumnFamilyTest, DontReuseColumnFamilyID) {
+ for (int iter = 0; iter < 3; ++iter) {
+ Open();
+ CreateColumnFamilies({"one", "two", "three"});
+ for (size_t i = 0; i < handles_.size(); ++i) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+ ASSERT_EQ(i, cfh->GetID());
+ }
+ if (iter == 1) {
+ Reopen();
+ }
+ DropColumnFamilies({3});
+ Reopen();
+ if (iter == 2) {
+ // this tests if max_column_family is correctly persisted with
+ // WriteSnapshot()
+ Reopen();
+ }
+ CreateColumnFamilies({"three2"});
+ // ID 3 that was used for dropped column family "three" should not be
+ // reused
+ auto cfh3 = reinterpret_cast<ColumnFamilyHandleImpl*>(handles_[3]);
+ ASSERT_EQ(4U, cfh3->GetID());
+ Close();
+ Destroy();
+ }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(ColumnFamilyTest, CreateCFRaceWithGetAggProperty) {
+ Open();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WriteOptionsFile:1",
+ "ColumnFamilyTest.CreateCFRaceWithGetAggProperty:1"},
+ {"ColumnFamilyTest.CreateCFRaceWithGetAggProperty:2",
+ "DBImpl::WriteOptionsFile:2"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread thread(
+ [&] { CreateColumnFamilies({"one"}); });
+
+ TEST_SYNC_POINT("ColumnFamilyTest.CreateCFRaceWithGetAggProperty:1");
+ uint64_t pv;
+ db_->GetAggregatedIntProperty(DB::Properties::kEstimateTableReadersMem, &pv);
+ TEST_SYNC_POINT("ColumnFamilyTest.CreateCFRaceWithGetAggProperty:2");
+
+ thread.join();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif // !ROCKSDB_LITE
+
+class FlushEmptyCFTestWithParam
+ : public ColumnFamilyTestBase,
+ virtual public testing::WithParamInterface<std::tuple<uint32_t, bool>> {
+ public:
+ FlushEmptyCFTestWithParam()
+ : ColumnFamilyTestBase(std::get<0>(GetParam())),
+ allow_2pc_(std::get<1>(GetParam())) {}
+
+ // Required if inheriting from testing::WithParamInterface<>
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ bool allow_2pc_;
+};
+
+TEST_P(FlushEmptyCFTestWithParam, FlushEmptyCFTest) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(env_));
+ db_options_.env = fault_env.get();
+ db_options_.allow_2pc = allow_2pc_;
+ Open();
+ CreateColumnFamilies({"one", "two"});
+ // Generate log file A.
+ ASSERT_OK(Put(1, "foo", "v1")); // seqID 1
+
+ Reopen();
+ // Log file A is not dropped after reopening because default column family's
+ // min log number is 0.
+ // It flushes to SST file X
+ ASSERT_OK(Put(1, "foo", "v1")); // seqID 2
+ ASSERT_OK(Put(1, "bar", "v2")); // seqID 3
+ // Current log file is file B now. While flushing, a new log file C is created
+  // and is set to current. Both CFs' min log number is set to file C in memory, so
+ // after flushing file B is deleted. At the same time, the min log number of
+ // default CF is not written to manifest. Log file A still remains.
+ // Flushed to SST file Y.
+ Flush(1);
+ Flush(0);
+ ASSERT_OK(Put(1, "bar", "v3")); // seqID 4
+ ASSERT_OK(Put(1, "foo", "v4")); // seqID 5
+ db_->FlushWAL(false);
+
+ // Preserve file system state up to here to simulate a crash condition.
+ fault_env->SetFilesystemActive(false);
+ std::vector<std::string> names;
+ for (auto name : names_) {
+ if (name != "") {
+ names.push_back(name);
+ }
+ }
+
+ Close();
+ fault_env->ResetState();
+
+ // Before opening, there are four files:
+ // Log file A contains seqID 1
+ // Log file C contains seqID 4, 5
+ // SST file X contains seqID 1
+ // SST file Y contains seqID 2, 3
+ // Min log number:
+ // default CF: 0
+ // CF one, two: C
+ // When opening the DB, all the seqID should be preserved.
+ Open(names, {});
+ ASSERT_EQ("v4", Get(1, "foo"));
+ ASSERT_EQ("v3", Get(1, "bar"));
+ Close();
+
+ db_options_.env = env_;
+}
+
+TEST_P(FlushEmptyCFTestWithParam, FlushEmptyCFTest2) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(env_));
+ db_options_.env = fault_env.get();
+ db_options_.allow_2pc = allow_2pc_;
+ Open();
+ CreateColumnFamilies({"one", "two"});
+ // Generate log file A.
+ ASSERT_OK(Put(1, "foo", "v1")); // seqID 1
+
+ Reopen();
+ // Log file A is not dropped after reopening because default column family's
+ // min log number is 0.
+ // It flushes to SST file X
+ ASSERT_OK(Put(1, "foo", "v1")); // seqID 2
+ ASSERT_OK(Put(1, "bar", "v2")); // seqID 3
+  // Current log file is file B now. While flushing, a new log file C is
+  // created and set to current. Both CFs' min log numbers are set to file C,
+  // so file B is deleted after the flush. Log file A still remains.
+ // Flushed to SST file Y.
+ Flush(1);
+ ASSERT_OK(Put(0, "bar", "v2")); // seqID 4
+ ASSERT_OK(Put(2, "bar", "v2")); // seqID 5
+ ASSERT_OK(Put(1, "bar", "v3")); // seqID 6
+  // Flush all column families. This forces all CFs' min log numbers to the
+  // current log, and that is written to the manifest file. Log file C is then
+  // deleted.
+ Flush(0);
+ Flush(1);
+ Flush(2);
+ // Write to log file D
+ ASSERT_OK(Put(1, "bar", "v4")); // seqID 7
+ ASSERT_OK(Put(1, "bar", "v5")); // seqID 8
+ db_->FlushWAL(false);
+ // Preserve file system state up to here to simulate a crash condition.
+ fault_env->SetFilesystemActive(false);
+ std::vector<std::string> names;
+ for (auto name : names_) {
+ if (name != "") {
+ names.push_back(name);
+ }
+ }
+
+ Close();
+ fault_env->ResetState();
+ // Before opening, there are two logfiles:
+ // Log file A contains seqID 1
+ // Log file D contains seqID 7, 8
+ // Min log number:
+ // default CF: D
+ // CF one, two: D
+ // When opening the DB, log file D should be replayed using the seqID
+ // specified in the file.
+ Open(names, {});
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v5", Get(1, "bar"));
+ Close();
+
+ db_options_.env = env_;
+}
+
+INSTANTIATE_TEST_CASE_P(
+ FormatDef, FlushEmptyCFTestWithParam,
+ testing::Values(std::make_tuple(test::kDefaultFormatVersion, true),
+ std::make_tuple(test::kDefaultFormatVersion, false)));
+INSTANTIATE_TEST_CASE_P(
+ FormatLatest, FlushEmptyCFTestWithParam,
+ testing::Values(std::make_tuple(test::kLatestFormatVersion, true),
+ std::make_tuple(test::kLatestFormatVersion, false)));
+
+TEST_P(ColumnFamilyTest, AddDrop) {
+ Open();
+ CreateColumnFamilies({"one", "two", "three"});
+ ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(2, "fodor"));
+ DropColumnFamilies({2});
+ ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+ CreateColumnFamilies({"four"});
+ ASSERT_EQ("NOT_FOUND", Get(3, "fodor"));
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_EQ("mirko", Get(1, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(3, "fodor"));
+ Close();
+ ASSERT_TRUE(TryOpen({"default"}).IsInvalidArgument());
+ Open({"default", "one", "three", "four"});
+ DropColumnFamilies({1});
+ Reopen();
+ Close();
+
+ std::vector<std::string> families;
+ ASSERT_OK(DB::ListColumnFamilies(db_options_, dbname_, &families));
+ std::sort(families.begin(), families.end());
+ ASSERT_TRUE(families ==
+ std::vector<std::string>({"default", "four", "three"}));
+}
+
+TEST_P(ColumnFamilyTest, BulkAddDrop) {
+ constexpr int kNumCF = 1000;
+ ColumnFamilyOptions cf_options;
+ WriteOptions write_options;
+ Open();
+ std::vector<std::string> cf_names;
+ std::vector<ColumnFamilyHandle*> cf_handles;
+ for (int i = 1; i <= kNumCF; i++) {
+ cf_names.push_back("cf1-" + ToString(i));
+ }
+ ASSERT_OK(db_->CreateColumnFamilies(cf_options, cf_names, &cf_handles));
+ for (int i = 1; i <= kNumCF; i++) {
+ ASSERT_OK(db_->Put(write_options, cf_handles[i - 1], "foo", "bar"));
+ }
+ ASSERT_OK(db_->DropColumnFamilies(cf_handles));
+ std::vector<ColumnFamilyDescriptor> cf_descriptors;
+ for (auto* handle : cf_handles) {
+ delete handle;
+ }
+ cf_handles.clear();
+ for (int i = 1; i <= kNumCF; i++) {
+ cf_descriptors.emplace_back("cf2-" + ToString(i), ColumnFamilyOptions());
+ }
+ ASSERT_OK(db_->CreateColumnFamilies(cf_descriptors, &cf_handles));
+ for (int i = 1; i <= kNumCF; i++) {
+ ASSERT_OK(db_->Put(write_options, cf_handles[i - 1], "foo", "bar"));
+ }
+ ASSERT_OK(db_->DropColumnFamilies(cf_handles));
+ for (auto* handle : cf_handles) {
+ delete handle;
+ }
+ Close();
+ std::vector<std::string> families;
+ ASSERT_OK(DB::ListColumnFamilies(db_options_, dbname_, &families));
+ std::sort(families.begin(), families.end());
+ ASSERT_TRUE(families == std::vector<std::string>({"default"}));
+}
+
+TEST_P(ColumnFamilyTest, DropTest) {
+  // first iteration - don't reopen DB before dropping
+ // second iteration - reopen DB before dropping
+ for (int iter = 0; iter < 2; ++iter) {
+ Open({"default"});
+ CreateColumnFamiliesAndReopen({"pikachu"});
+ for (int i = 0; i < 100; ++i) {
+ ASSERT_OK(Put(1, ToString(i), "bar" + ToString(i)));
+ }
+ ASSERT_OK(Flush(1));
+
+ if (iter == 1) {
+ Reopen();
+ }
+ ASSERT_EQ("bar1", Get(1, "1"));
+
+ AssertCountLiveFiles(1);
+ DropColumnFamilies({1});
+ // make sure that all files are deleted when we drop the column family
+ AssertCountLiveFiles(0);
+ Destroy();
+ }
+}
+
+TEST_P(ColumnFamilyTest, WriteBatchFailure) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two"});
+ WriteBatch batch;
+ batch.Put(handles_[0], Slice("existing"), Slice("column-family"));
+ batch.Put(handles_[1], Slice("non-existing"), Slice("column-family"));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+ DropColumnFamilies({1});
+ WriteOptions woptions_ignore_missing_cf;
+ woptions_ignore_missing_cf.ignore_missing_column_families = true;
+ batch.Put(handles_[0], Slice("still here"), Slice("column-family"));
+ ASSERT_OK(db_->Write(woptions_ignore_missing_cf, &batch));
+ ASSERT_EQ("column-family", Get(0, "still here"));
+ Status s = db_->Write(WriteOptions(), &batch);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ Close();
+}
+
+TEST_P(ColumnFamilyTest, ReadWrite) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two"});
+ ASSERT_OK(Put(0, "foo", "v1"));
+ ASSERT_OK(Put(0, "bar", "v2"));
+ ASSERT_OK(Put(1, "mirko", "v3"));
+ ASSERT_OK(Put(0, "foo", "v2"));
+ ASSERT_OK(Put(2, "fodor", "v5"));
+
+ for (int iter = 0; iter <= 3; ++iter) {
+ ASSERT_EQ("v2", Get(0, "foo"));
+ ASSERT_EQ("v2", Get(0, "bar"));
+ ASSERT_EQ("v3", Get(1, "mirko"));
+ ASSERT_EQ("v5", Get(2, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
+ if (iter <= 1) {
+ Reopen();
+ }
+ }
+ Close();
+}
+
+TEST_P(ColumnFamilyTest, IgnoreRecoveredLog) {
+ std::string backup_logs = dbname_ + "/backup_logs";
+
+ // delete old files in backup_logs directory
+ ASSERT_OK(env_->CreateDirIfMissing(dbname_));
+ ASSERT_OK(env_->CreateDirIfMissing(backup_logs));
+ std::vector<std::string> old_files;
+ env_->GetChildren(backup_logs, &old_files);
+ for (auto& file : old_files) {
+ if (file != "." && file != "..") {
+ env_->DeleteFile(backup_logs + "/" + file);
+ }
+ }
+
+ column_family_options_.merge_operator =
+ MergeOperators::CreateUInt64AddOperator();
+ db_options_.wal_dir = dbname_ + "/logs";
+ Destroy();
+ Open();
+ CreateColumnFamilies({"cf1", "cf2"});
+
+ // fill up the DB
+ std::string one, two, three;
+ PutFixed64(&one, 1);
+ PutFixed64(&two, 2);
+ PutFixed64(&three, 3);
+ ASSERT_OK(Merge(0, "foo", one));
+ ASSERT_OK(Merge(1, "mirko", one));
+ ASSERT_OK(Merge(0, "foo", one));
+ ASSERT_OK(Merge(2, "bla", one));
+ ASSERT_OK(Merge(2, "fodor", one));
+ ASSERT_OK(Merge(0, "bar", one));
+ ASSERT_OK(Merge(2, "bla", one));
+ ASSERT_OK(Merge(1, "mirko", two));
+ ASSERT_OK(Merge(1, "franjo", one));
+
+ // copy the logs to backup
+ std::vector<std::string> logs;
+ env_->GetChildren(db_options_.wal_dir, &logs);
+ for (auto& log : logs) {
+ if (log != ".." && log != ".") {
+ CopyFile(db_options_.wal_dir + "/" + log, backup_logs + "/" + log);
+ }
+ }
+
+ // recover the DB
+ Close();
+
+  // 1. check consistency
+  // 2. copy the logs from the backup back to the WAL dir. If recovery ran
+  // again on the same log files, it would lead to incorrect results due to
+  // applying the merge operator twice
+  // 3. check consistency
+ for (int iter = 0; iter < 2; ++iter) {
+ // assert consistency
+ Open({"default", "cf1", "cf2"});
+ ASSERT_EQ(two, Get(0, "foo"));
+ ASSERT_EQ(one, Get(0, "bar"));
+ ASSERT_EQ(three, Get(1, "mirko"));
+ ASSERT_EQ(one, Get(1, "franjo"));
+ ASSERT_EQ(one, Get(2, "fodor"));
+ ASSERT_EQ(two, Get(2, "bla"));
+ Close();
+
+ if (iter == 0) {
+ // copy the logs from backup back to wal dir
+ for (auto& log : logs) {
+ if (log != ".." && log != ".") {
+ CopyFile(backup_logs + "/" + log, db_options_.wal_dir + "/" + log);
+ }
+ }
+ }
+ }
+}
+
+#ifndef ROCKSDB_LITE // TEST functions used are not supported
+TEST_P(ColumnFamilyTest, FlushTest) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two"});
+ ASSERT_OK(Put(0, "foo", "v1"));
+ ASSERT_OK(Put(0, "bar", "v2"));
+ ASSERT_OK(Put(1, "mirko", "v3"));
+ ASSERT_OK(Put(0, "foo", "v2"));
+ ASSERT_OK(Put(2, "fodor", "v5"));
+
+ for (int j = 0; j < 2; j++) {
+ ReadOptions ro;
+ std::vector<Iterator*> iterators;
+ // Hold super version.
+ if (j == 0) {
+ ASSERT_OK(db_->NewIterators(ro, handles_, &iterators));
+ }
+
+ for (int i = 0; i < 3; ++i) {
+ uint64_t max_total_in_memory_state =
+ MaxTotalInMemoryState();
+ Flush(i);
+ AssertMaxTotalInMemoryState(max_total_in_memory_state);
+ }
+ ASSERT_OK(Put(1, "foofoo", "bar"));
+ ASSERT_OK(Put(0, "foofoo", "bar"));
+
+ for (auto* it : iterators) {
+ delete it;
+ }
+ }
+ Reopen();
+
+ for (int iter = 0; iter <= 2; ++iter) {
+ ASSERT_EQ("v2", Get(0, "foo"));
+ ASSERT_EQ("v2", Get(0, "bar"));
+ ASSERT_EQ("v3", Get(1, "mirko"));
+ ASSERT_EQ("v5", Get(2, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
+ if (iter <= 1) {
+ Reopen();
+ }
+ }
+ Close();
+}
+
+// Makes sure that obsolete log files get deleted
+TEST_P(ColumnFamilyTest, LogDeletionTest) {
+ db_options_.max_total_wal_size = std::numeric_limits<uint64_t>::max();
+ column_family_options_.arena_block_size = 4 * 1024;
+ column_family_options_.write_buffer_size = 128000; // 128KB
+ Open();
+ CreateColumnFamilies({"one", "two", "three", "four"});
+  // Each bracket is one log file. If a number is in (), it means
+  // we don't need it anymore (it's been flushed).
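+  // A log file can only be deleted once every column family with data in it
+  // has flushed that data; while any CF listed in a bracket is still
+  // unflushed, the file stays live.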
+ // []
+ AssertCountLiveLogFiles(0);
+ PutRandomData(0, 1, 128);
+ // [0]
+ PutRandomData(1, 1, 128);
+ // [0, 1]
+ PutRandomData(1, 1000, 128);
+ WaitForFlush(1);
+ // [0, (1)] [1]
+ AssertCountLiveLogFiles(2);
+ PutRandomData(0, 1, 128);
+ // [0, (1)] [0, 1]
+ AssertCountLiveLogFiles(2);
+ PutRandomData(2, 1, 128);
+ // [0, (1)] [0, 1, 2]
+ PutRandomData(2, 1000, 128);
+ WaitForFlush(2);
+ // [0, (1)] [0, 1, (2)] [2]
+ AssertCountLiveLogFiles(3);
+ PutRandomData(2, 1000, 128);
+ WaitForFlush(2);
+ // [0, (1)] [0, 1, (2)] [(2)] [2]
+ AssertCountLiveLogFiles(4);
+ PutRandomData(3, 1, 128);
+ // [0, (1)] [0, 1, (2)] [(2)] [2, 3]
+ PutRandomData(1, 1, 128);
+ // [0, (1)] [0, 1, (2)] [(2)] [1, 2, 3]
+ AssertCountLiveLogFiles(4);
+ PutRandomData(1, 1000, 128);
+ WaitForFlush(1);
+ // [0, (1)] [0, (1), (2)] [(2)] [(1), 2, 3] [1]
+ AssertCountLiveLogFiles(5);
+ PutRandomData(0, 1000, 128);
+ WaitForFlush(0);
+ // [(0), (1)] [(0), (1), (2)] [(2)] [(1), 2, 3] [1, (0)] [0]
+ // delete obsolete logs -->
+ // [(1), 2, 3] [1, (0)] [0]
+ AssertCountLiveLogFiles(3);
+ PutRandomData(0, 1000, 128);
+ WaitForFlush(0);
+ // [(1), 2, 3] [1, (0)], [(0)] [0]
+ AssertCountLiveLogFiles(4);
+ PutRandomData(1, 1000, 128);
+ WaitForFlush(1);
+ // [(1), 2, 3] [(1), (0)] [(0)] [0, (1)] [1]
+ AssertCountLiveLogFiles(5);
+ PutRandomData(2, 1000, 128);
+ WaitForFlush(2);
+ // [(1), (2), 3] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2]
+ AssertCountLiveLogFiles(6);
+ PutRandomData(3, 1000, 128);
+ WaitForFlush(3);
+ // [(1), (2), (3)] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2, (3)] [3]
+ // delete obsolete logs -->
+ // [0, (1)] [1, (2)], [2, (3)] [3]
+ AssertCountLiveLogFiles(4);
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+TEST_P(ColumnFamilyTest, CrashAfterFlush) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(env_));
+ db_options_.env = fault_env.get();
+ Open();
+ CreateColumnFamilies({"one"});
+
+ WriteBatch batch;
+ batch.Put(handles_[0], Slice("foo"), Slice("bar"));
+ batch.Put(handles_[1], Slice("foo"), Slice("bar"));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+ Flush(0);
+ fault_env->SetFilesystemActive(false);
+
+ std::vector<std::string> names;
+ for (auto name : names_) {
+ if (name != "") {
+ names.push_back(name);
+ }
+ }
+ Close();
+ fault_env->DropUnsyncedFileData();
+ fault_env->ResetState();
+ Open(names, {});
+
+ // Write batch should be atomic.
+ ASSERT_EQ(Get(0, "foo"), Get(1, "foo"));
+
+ Close();
+ db_options_.env = env_;
+}
+
+TEST_P(ColumnFamilyTest, OpenNonexistentColumnFamily) {
+ ASSERT_OK(TryOpen({"default"}));
+ Close();
+ ASSERT_TRUE(TryOpen({"default", "dne"}).IsInvalidArgument());
+}
+
+#ifndef ROCKSDB_LITE // WaitForFlush() is not supported
+// Makes sure that obsolete log files get deleted
+TEST_P(ColumnFamilyTest, DifferentWriteBufferSizes) {
+ // disable flushing stale column families
+ db_options_.max_total_wal_size = std::numeric_limits<uint64_t>::max();
+ Open();
+ CreateColumnFamilies({"one", "two", "three"});
+ ColumnFamilyOptions default_cf, one, two, three;
+  // Set up options. All column families have max_write_buffer_number set to 10.
+  // "default" -> 100KB memtable, start flushing immediately
+  // "one" -> 200KB memtable, start flushing with two immutable memtables
+  // "two" -> 1MB memtable, start flushing with three immutable memtables
+  // "three" -> 90KB memtable, start flushing with four immutable memtables
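+  // (min_write_buffer_number_to_merge is what delays the flush until the
+  // listed number of immutable memtables has accumulated, which the
+  // AssertNumberOfImmutableMemtables() calls below track.)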
+ default_cf.write_buffer_size = 100000;
+ default_cf.arena_block_size = 4 * 4096;
+ default_cf.max_write_buffer_number = 10;
+ default_cf.min_write_buffer_number_to_merge = 1;
+ default_cf.max_write_buffer_size_to_maintain = 0;
+ one.write_buffer_size = 200000;
+ one.arena_block_size = 4 * 4096;
+ one.max_write_buffer_number = 10;
+ one.min_write_buffer_number_to_merge = 2;
+ one.max_write_buffer_size_to_maintain =
+ static_cast<int>(one.write_buffer_size);
+ two.write_buffer_size = 1000000;
+ two.arena_block_size = 4 * 4096;
+ two.max_write_buffer_number = 10;
+ two.min_write_buffer_number_to_merge = 3;
+ two.max_write_buffer_size_to_maintain =
+ static_cast<int>(two.write_buffer_size);
+ three.write_buffer_size = 4096 * 22;
+ three.arena_block_size = 4096;
+ three.max_write_buffer_number = 10;
+ three.min_write_buffer_number_to_merge = 4;
+ three.max_write_buffer_size_to_maintain =
+ static_cast<int>(three.write_buffer_size);
+
+ Reopen({default_cf, one, two, three});
+
+ int micros_wait_for_flush = 10000;
+ PutRandomData(0, 100, 1000);
+ WaitForFlush(0);
+ AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+ AssertCountLiveLogFiles(1);
+ PutRandomData(1, 200, 1000);
+ env_->SleepForMicroseconds(micros_wait_for_flush);
+ AssertNumberOfImmutableMemtables({0, 1, 0, 0});
+ AssertCountLiveLogFiles(2);
+ PutRandomData(2, 1000, 1000);
+ env_->SleepForMicroseconds(micros_wait_for_flush);
+ AssertNumberOfImmutableMemtables({0, 1, 1, 0});
+ AssertCountLiveLogFiles(3);
+ PutRandomData(2, 1000, 1000);
+ env_->SleepForMicroseconds(micros_wait_for_flush);
+ AssertNumberOfImmutableMemtables({0, 1, 2, 0});
+ AssertCountLiveLogFiles(4);
+ PutRandomData(3, 93, 990);
+ env_->SleepForMicroseconds(micros_wait_for_flush);
+ AssertNumberOfImmutableMemtables({0, 1, 2, 1});
+ AssertCountLiveLogFiles(5);
+ PutRandomData(3, 88, 990);
+ env_->SleepForMicroseconds(micros_wait_for_flush);
+ AssertNumberOfImmutableMemtables({0, 1, 2, 2});
+ AssertCountLiveLogFiles(6);
+ PutRandomData(3, 88, 990);
+ env_->SleepForMicroseconds(micros_wait_for_flush);
+ AssertNumberOfImmutableMemtables({0, 1, 2, 3});
+ AssertCountLiveLogFiles(7);
+ PutRandomData(0, 100, 1000);
+ WaitForFlush(0);
+ AssertNumberOfImmutableMemtables({0, 1, 2, 3});
+ AssertCountLiveLogFiles(8);
+ PutRandomData(2, 100, 10000);
+ WaitForFlush(2);
+ AssertNumberOfImmutableMemtables({0, 1, 0, 3});
+ AssertCountLiveLogFiles(9);
+ PutRandomData(3, 88, 990);
+ WaitForFlush(3);
+ AssertNumberOfImmutableMemtables({0, 1, 0, 0});
+ AssertCountLiveLogFiles(10);
+ PutRandomData(3, 88, 990);
+ env_->SleepForMicroseconds(micros_wait_for_flush);
+ AssertNumberOfImmutableMemtables({0, 1, 0, 1});
+ AssertCountLiveLogFiles(11);
+ PutRandomData(1, 200, 1000);
+ WaitForFlush(1);
+ AssertNumberOfImmutableMemtables({0, 0, 0, 1});
+ AssertCountLiveLogFiles(5);
+ PutRandomData(3, 88 * 3, 990);
+ WaitForFlush(3);
+ PutRandomData(3, 88 * 4, 990);
+ WaitForFlush(3);
+ AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+ AssertCountLiveLogFiles(12);
+ PutRandomData(0, 100, 1000);
+ WaitForFlush(0);
+ AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+ AssertCountLiveLogFiles(12);
+ PutRandomData(2, 3 * 1000, 1000);
+ WaitForFlush(2);
+ AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+ AssertCountLiveLogFiles(12);
+  PutRandomData(1, 2 * 200, 1000);
+ WaitForFlush(1);
+ AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+ AssertCountLiveLogFiles(7);
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+// The test is commented out because we want to verify that a snapshot is not
+// created for memtables that don't support it, but there isn't a memtable
+// that doesn't support snapshots right now. If we add one later, we can
+// re-enable the test.
+//
+// #ifndef ROCKSDB_LITE // Cuckoo is not supported in lite
+// TEST_P(ColumnFamilyTest, MemtableNotSupportSnapshot) {
+// db_options_.allow_concurrent_memtable_write = false;
+// Open();
+// auto* s1 = dbfull()->GetSnapshot();
+// ASSERT_TRUE(s1 != nullptr);
+// dbfull()->ReleaseSnapshot(s1);
+
+// // Add a column family that doesn't support snapshot
+// ColumnFamilyOptions first;
+// first.memtable_factory.reset(new DummyMemtableNotSupportingSnapshot());
+// CreateColumnFamilies({"first"}, {first});
+// auto* s2 = dbfull()->GetSnapshot();
+// ASSERT_TRUE(s2 == nullptr);
+
+// // Add a column family that supports snapshot. Snapshot stays not
+// supported. ColumnFamilyOptions second; CreateColumnFamilies({"second"},
+// {second}); auto* s3 = dbfull()->GetSnapshot(); ASSERT_TRUE(s3 == nullptr);
+// Close();
+// }
+// #endif // !ROCKSDB_LITE
+
+class TestComparator : public Comparator {
+ int Compare(const ROCKSDB_NAMESPACE::Slice& /*a*/,
+ const ROCKSDB_NAMESPACE::Slice& /*b*/) const override {
+ return 0;
+ }
+ const char* Name() const override { return "Test"; }
+ void FindShortestSeparator(
+ std::string* /*start*/,
+ const ROCKSDB_NAMESPACE::Slice& /*limit*/) const override {}
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+static TestComparator third_comparator;
+static TestComparator fourth_comparator;
+
+// Test that we can retrieve the comparator from a created CF
+TEST_P(ColumnFamilyTest, GetComparator) {
+ Open();
+ // Add a column family with no comparator specified
+ CreateColumnFamilies({"first"});
+ const Comparator* comp = handles_[0]->GetComparator();
+ ASSERT_EQ(comp, BytewiseComparator());
+
+ // Add three column families - one with no comparator and two
+ // with comparators specified
+ ColumnFamilyOptions second, third, fourth;
+ second.comparator = &third_comparator;
+ third.comparator = &fourth_comparator;
+ CreateColumnFamilies({"second", "third", "fourth"}, {second, third, fourth});
+ ASSERT_EQ(handles_[1]->GetComparator(), BytewiseComparator());
+ ASSERT_EQ(handles_[2]->GetComparator(), &third_comparator);
+ ASSERT_EQ(handles_[3]->GetComparator(), &fourth_comparator);
+ Close();
+}
+
+TEST_P(ColumnFamilyTest, DifferentMergeOperators) {
+ Open();
+ CreateColumnFamilies({"first", "second"});
+ ColumnFamilyOptions default_cf, first, second;
+ first.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ second.merge_operator = MergeOperators::CreateStringAppendOperator();
+ Reopen({default_cf, first, second});
+
+ std::string one, two, three;
+ PutFixed64(&one, 1);
+ PutFixed64(&two, 2);
+ PutFixed64(&three, 3);
+
+ ASSERT_OK(Put(0, "foo", two));
+ ASSERT_OK(Put(0, "foo", one));
+ ASSERT_TRUE(Merge(0, "foo", two).IsNotSupported());
+ ASSERT_EQ(Get(0, "foo"), one);
+
+ ASSERT_OK(Put(1, "foo", two));
+ ASSERT_OK(Put(1, "foo", one));
+ ASSERT_OK(Merge(1, "foo", two));
+ ASSERT_EQ(Get(1, "foo"), three);
+
+ ASSERT_OK(Put(2, "foo", two));
+ ASSERT_OK(Put(2, "foo", one));
+ ASSERT_OK(Merge(2, "foo", two));
+ ASSERT_EQ(Get(2, "foo"), one + "," + two);
+ Close();
+}
+
+#ifndef ROCKSDB_LITE // WaitForFlush() is not supported
+TEST_P(ColumnFamilyTest, DifferentCompactionStyles) {
+ Open();
+ CreateColumnFamilies({"one", "two"});
+ ColumnFamilyOptions default_cf, one, two;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = static_cast<uint64_t>(1) << 60;
+
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleUniversal;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 4 files
+ one.level0_file_num_compaction_trigger = 4;
+ one.write_buffer_size = 120000;
+
+ two.compaction_style = kCompactionStyleLevel;
+ two.num_levels = 4;
+ two.level0_file_num_compaction_trigger = 3;
+ two.write_buffer_size = 100000;
+
+ Reopen({default_cf, one, two});
+
+ // SETUP column family "one" -- universal style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 1; ++i) {
+ PutRandomData(1, 10, 12000);
+ PutRandomData(1, 1, 10);
+ WaitForFlush(1);
+ AssertFilesPerLevel(ToString(i + 1), 1);
+ }
+
+ // SETUP column family "two" -- level style with 4 levels
+ for (int i = 0; i < two.level0_file_num_compaction_trigger - 1; ++i) {
+ PutRandomData(2, 10, 12000);
+ PutRandomData(2, 1, 10);
+ WaitForFlush(2);
+ AssertFilesPerLevel(ToString(i + 1), 2);
+ }
+
+ // TRIGGER compaction "one"
+ PutRandomData(1, 10, 12000);
+ PutRandomData(1, 1, 10);
+
+ // TRIGGER compaction "two"
+ PutRandomData(2, 10, 12000);
+ PutRandomData(2, 1, 10);
+
+ // WAIT for compactions
+ WaitForCompaction();
+
+ // VERIFY compaction "one"
+ AssertFilesPerLevel("1", 1);
+
+ // VERIFY compaction "two"
+ AssertFilesPerLevel("0,1", 2);
+ CompactAll(2);
+ AssertFilesPerLevel("0,1", 2);
+
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+// Sync points not supported in RocksDB Lite
+
+TEST_P(ColumnFamilyTest, MultipleManualCompactions) {
+ Open();
+ CreateColumnFamilies({"one", "two"});
+ ColumnFamilyOptions default_cf, one, two;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+ db_options_.max_background_compactions = 3;
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleUniversal;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 4 files
+ one.level0_file_num_compaction_trigger = 4;
+ one.write_buffer_size = 120000;
+
+ two.compaction_style = kCompactionStyleLevel;
+ two.num_levels = 4;
+ two.level0_file_num_compaction_trigger = 3;
+ two.write_buffer_size = 100000;
+
+ Reopen({default_cf, one, two});
+
+ // SETUP column family "one" -- universal style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(ToString(i + 1), 1);
+ }
+ bool cf_1_1 = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ColumnFamilyTest::MultiManual:4", "ColumnFamilyTest::MultiManual:1"},
+ {"ColumnFamilyTest::MultiManual:2", "ColumnFamilyTest::MultiManual:5"},
+ {"ColumnFamilyTest::MultiManual:2", "ColumnFamilyTest::MultiManual:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (cf_1_1) {
+ TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:4");
+ cf_1_1 = false;
+ TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:3");
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
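+  // Expected interleaving: the manual compaction on "one" (first thread
+  // below) reaches AfterRun, signals :4 and stalls at :3; that lets the
+  // second thread run its manual compaction on "two" to completion, and its
+  // :2 then releases both the stalled compaction and the main thread, so the
+  // two non-exclusive manual compactions overlap.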
+ std::vector<port::Thread> threads;
+ threads.emplace_back([&] {
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ });
+
+ // SETUP column family "two" -- level style with 4 levels
+ for (int i = 0; i < two.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(2, 10, 12000);
+ PutRandomData(2, 1, 10);
+ WaitForFlush(2);
+ AssertFilesPerLevel(ToString(i + 1), 2);
+ }
+ threads.emplace_back([&] {
+ TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:1");
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[2], nullptr, nullptr));
+ TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:2");
+ });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:5");
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ // VERIFY compaction "one"
+ AssertFilesPerLevel("1", 1);
+
+ // VERIFY compaction "two"
+ AssertFilesPerLevel("0,1", 2);
+ CompactAll(2);
+ AssertFilesPerLevel("0,1", 2);
+ // Compare against saved keys
+ std::set<std::string>::iterator key_iter = keys_[1].begin();
+ while (key_iter != keys_[1].end()) {
+ ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+ key_iter++;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ Close();
+}
+
+TEST_P(ColumnFamilyTest, AutomaticAndManualCompactions) {
+ Open();
+ CreateColumnFamilies({"one", "two"});
+ ColumnFamilyOptions default_cf, one, two;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+ db_options_.max_background_compactions = 3;
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleUniversal;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 4 files
+ one.level0_file_num_compaction_trigger = 4;
+ one.write_buffer_size = 120000;
+
+ two.compaction_style = kCompactionStyleLevel;
+ two.num_levels = 4;
+ two.level0_file_num_compaction_trigger = 3;
+ two.write_buffer_size = 100000;
+
+ Reopen({default_cf, one, two});
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ bool cf_1_1 = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ColumnFamilyTest::AutoManual:4", "ColumnFamilyTest::AutoManual:1"},
+ {"ColumnFamilyTest::AutoManual:2", "ColumnFamilyTest::AutoManual:5"},
+ {"ColumnFamilyTest::AutoManual:2", "ColumnFamilyTest::AutoManual:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (cf_1_1) {
+ cf_1_1 = false;
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:4");
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:3");
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ // SETUP column family "one" -- universal style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(ToString(i + 1), 1);
+ }
+
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:1");
+
+ // SETUP column family "two" -- level style with 4 levels
+ for (int i = 0; i < two.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(2, 10, 12000);
+ PutRandomData(2, 1, 10);
+ WaitForFlush(2);
+ AssertFilesPerLevel(ToString(i + 1), 2);
+ }
+ ROCKSDB_NAMESPACE::port::Thread threads([&] {
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[2], nullptr, nullptr));
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:2");
+ });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:5");
+ threads.join();
+
+ // WAIT for compactions
+ WaitForCompaction();
+
+ // VERIFY compaction "one"
+ AssertFilesPerLevel("1", 1);
+
+ // VERIFY compaction "two"
+ AssertFilesPerLevel("0,1", 2);
+ CompactAll(2);
+ AssertFilesPerLevel("0,1", 2);
+ // Compare against saved keys
+ std::set<std::string>::iterator key_iter = keys_[1].begin();
+ while (key_iter != keys_[1].end()) {
+ ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+ key_iter++;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(ColumnFamilyTest, ManualAndAutomaticCompactions) {
+ Open();
+ CreateColumnFamilies({"one", "two"});
+ ColumnFamilyOptions default_cf, one, two;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+ db_options_.max_background_compactions = 3;
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleUniversal;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 4 files
+ one.level0_file_num_compaction_trigger = 4;
+ one.write_buffer_size = 120000;
+
+ two.compaction_style = kCompactionStyleLevel;
+ two.num_levels = 4;
+ two.level0_file_num_compaction_trigger = 3;
+ two.write_buffer_size = 100000;
+
+ Reopen({default_cf, one, two});
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ // SETUP column family "one" -- universal style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(ToString(i + 1), 1);
+ }
+ bool cf_1_1 = true;
+ bool cf_1_2 = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:1"},
+ {"ColumnFamilyTest::ManualAuto:5", "ColumnFamilyTest::ManualAuto:2"},
+ {"ColumnFamilyTest::ManualAuto:2", "ColumnFamilyTest::ManualAuto:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (cf_1_1) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:4");
+ cf_1_1 = false;
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:3");
+ } else if (cf_1_2) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:2");
+ cf_1_2 = false;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::port::Thread threads([&] {
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:1");
+
+ // SETUP column family "two" -- level style with 4 levels
+ for (int i = 0; i < two.level0_file_num_compaction_trigger; ++i) {
+ PutRandomData(2, 10, 12000);
+ PutRandomData(2, 1, 10);
+ WaitForFlush(2);
+ AssertFilesPerLevel(ToString(i + 1), 2);
+ }
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:5");
+ threads.join();
+
+ // WAIT for compactions
+ WaitForCompaction();
+
+ // VERIFY compaction "one"
+ AssertFilesPerLevel("1", 1);
+
+ // VERIFY compaction "two"
+ AssertFilesPerLevel("0,1", 2);
+ CompactAll(2);
+ AssertFilesPerLevel("0,1", 2);
+ // Compare against saved keys
+ std::set<std::string>::iterator key_iter = keys_[1].begin();
+ while (key_iter != keys_[1].end()) {
+ ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+ key_iter++;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(ColumnFamilyTest, SameCFManualManualCompactions) {
+ Open();
+ CreateColumnFamilies({"one"});
+ ColumnFamilyOptions default_cf, one;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+ db_options_.max_background_compactions = 3;
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleUniversal;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 4 files
+ one.level0_file_num_compaction_trigger = 4;
+ one.write_buffer_size = 120000;
+
+ Reopen({default_cf, one});
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ // SETUP column family "one" -- universal style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(ToString(i + 1), 1);
+ }
+ bool cf_1_1 = true;
+ bool cf_1_2 = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ColumnFamilyTest::ManualManual:4", "ColumnFamilyTest::ManualManual:2"},
+ {"ColumnFamilyTest::ManualManual:4", "ColumnFamilyTest::ManualManual:5"},
+ {"ColumnFamilyTest::ManualManual:1", "ColumnFamilyTest::ManualManual:2"},
+ {"ColumnFamilyTest::ManualManual:1",
+ "ColumnFamilyTest::ManualManual:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (cf_1_1) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:4");
+ cf_1_1 = false;
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:3");
+ } else if (cf_1_2) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:2");
+ cf_1_2 = false;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::port::Thread threads([&] {
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = true;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:5");
+
+ WaitForFlush(1);
+
+ // Add more L0 files and force another manual compaction
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(ToString(one.level0_file_num_compaction_trigger + i),
+ 1);
+ }
+
+ ROCKSDB_NAMESPACE::port::Thread threads1([&] {
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:1");
+
+ threads.join();
+ threads1.join();
+ WaitForCompaction();
+ // VERIFY compaction "one"
+ ASSERT_LE(NumTableFilesAtLevel(0, 1), 2);
+
+ // Compare against saved keys
+ std::set<std::string>::iterator key_iter = keys_[1].begin();
+ while (key_iter != keys_[1].end()) {
+ ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+ key_iter++;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(ColumnFamilyTest, SameCFManualAutomaticCompactions) {
+ Open();
+ CreateColumnFamilies({"one"});
+ ColumnFamilyOptions default_cf, one;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+ db_options_.max_background_compactions = 3;
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleUniversal;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 4 files
+ one.level0_file_num_compaction_trigger = 4;
+ one.write_buffer_size = 120000;
+
+ Reopen({default_cf, one});
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ // SETUP column family "one" -- universal style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(ToString(i + 1), 1);
+ }
+ bool cf_1_1 = true;
+ bool cf_1_2 = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:2"},
+ {"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:5"},
+ {"ColumnFamilyTest::ManualAuto:1", "ColumnFamilyTest::ManualAuto:2"},
+ {"ColumnFamilyTest::ManualAuto:1", "ColumnFamilyTest::ManualAuto:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (cf_1_1) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:4");
+ cf_1_1 = false;
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:3");
+ } else if (cf_1_2) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:2");
+ cf_1_2 = false;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::port::Thread threads([&] {
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:5");
+
+ WaitForFlush(1);
+
+ // Add more L0 files and force automatic compaction
+ for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(ToString(one.level0_file_num_compaction_trigger + i),
+ 1);
+ }
+
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:1");
+
+ threads.join();
+ WaitForCompaction();
+ // VERIFY compaction "one"
+ ASSERT_LE(NumTableFilesAtLevel(0, 1), 2);
+
+ // Compare against saved keys
+ std::set<std::string>::iterator key_iter = keys_[1].begin();
+ while (key_iter != keys_[1].end()) {
+ ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+ key_iter++;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(ColumnFamilyTest, SameCFManualAutomaticCompactionsLevel) {
+ Open();
+ CreateColumnFamilies({"one"});
+ ColumnFamilyOptions default_cf, one;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+ db_options_.max_background_compactions = 3;
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleLevel;
+
+ one.num_levels = 1;
+  // trigger compaction if there are >= 3 files
+ one.level0_file_num_compaction_trigger = 3;
+ one.write_buffer_size = 120000;
+
+ Reopen({default_cf, one});
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ // SETUP column family "one" -- level style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(ToString(i + 1), 1);
+ }
+ bool cf_1_1 = true;
+ bool cf_1_2 = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:2"},
+ {"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:5"},
+ {"ColumnFamilyTest::ManualAuto:3", "ColumnFamilyTest::ManualAuto:2"},
+ {"LevelCompactionPicker::PickCompactionBySize:0",
+ "ColumnFamilyTest::ManualAuto:3"},
+ {"ColumnFamilyTest::ManualAuto:1", "ColumnFamilyTest::ManualAuto:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (cf_1_1) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:4");
+ cf_1_1 = false;
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:3");
+ } else if (cf_1_2) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:2");
+ cf_1_2 = false;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::port::Thread threads([&] {
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:5");
+
+ // Add more L0 files and force automatic compaction
+ for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(ToString(one.level0_file_num_compaction_trigger + i),
+ 1);
+ }
+
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:1");
+
+ threads.join();
+ WaitForCompaction();
+ // VERIFY compaction "one"
+ AssertFilesPerLevel("0,1", 1);
+
+ // Compare against saved keys
+ std::set<std::string>::iterator key_iter = keys_[1].begin();
+ while (key_iter != keys_[1].end()) {
+ ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+ key_iter++;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// In this test, we generate enough files to trigger an automatic compaction.
+// The automatic compaction waits in NonTrivial:AfterRun.
+// We generate more files and then trigger a manual compaction.
+// The manual compaction will wait because the automatic compaction holds files
+// it needs. Once the conflict is hit, the automatic compaction resumes and
+// finishes, and then the manual compaction runs and ends.
+TEST_P(ColumnFamilyTest, SameCFAutomaticManualCompactions) {
+ Open();
+ CreateColumnFamilies({"one"});
+ ColumnFamilyOptions default_cf, one;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+ db_options_.max_background_compactions = 3;
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleUniversal;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 4 files
+ one.level0_file_num_compaction_trigger = 4;
+ one.write_buffer_size = 120000;
+
+ Reopen({default_cf, one});
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ bool cf_1_1 = true;
+ bool cf_1_2 = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ColumnFamilyTest::AutoManual:4", "ColumnFamilyTest::AutoManual:2"},
+ {"ColumnFamilyTest::AutoManual:4", "ColumnFamilyTest::AutoManual:5"},
+ {"CompactionPicker::CompactRange:Conflict",
+ "ColumnFamilyTest::AutoManual:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (cf_1_1) {
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:4");
+ cf_1_1 = false;
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:3");
+ } else if (cf_1_2) {
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:2");
+ cf_1_2 = false;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
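+  // The "CompactionPicker::CompactRange:Conflict" dependency means the stalled
+  // automatic compaction is only released once the manual CompactRange() below
+  // detects that its input files are already being compacted.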
+
+ // SETUP column family "one" -- universal style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(ToString(i + 1), 1);
+ }
+
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:5");
+
+ // Add another L0 file and force automatic compaction
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ }
+
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:1");
+
+ WaitForCompaction();
+ // VERIFY compaction "one"
+ AssertFilesPerLevel("1", 1);
+ // Compare against saved keys
+ std::set<std::string>::iterator key_iter = keys_[1].begin();
+ while (key_iter != keys_[1].end()) {
+ ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+ key_iter++;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE // Tailing iterator not supported
+namespace {
+std::string IterStatus(Iterator* iter) {
+ std::string result;
+ if (iter->Valid()) {
+ result = iter->key().ToString() + "->" + iter->value().ToString();
+ } else {
+ result = "(invalid)";
+ }
+ return result;
+}
+} // anonymous namespace
+
+TEST_P(ColumnFamilyTest, NewIteratorsTest) {
+ // iter == 0 -- no tailing
+  // iter == 1 -- tailing
+ for (int iter = 0; iter < 2; ++iter) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two"});
+ ASSERT_OK(Put(0, "a", "b"));
+ ASSERT_OK(Put(1, "b", "a"));
+ ASSERT_OK(Put(2, "c", "m"));
+ ASSERT_OK(Put(2, "v", "t"));
+ std::vector<Iterator*> iterators;
+ ReadOptions options;
+ options.tailing = (iter == 1);
+ ASSERT_OK(db_->NewIterators(options, handles_, &iterators));
+
+ for (auto it : iterators) {
+ it->SeekToFirst();
+ }
+ ASSERT_EQ(IterStatus(iterators[0]), "a->b");
+ ASSERT_EQ(IterStatus(iterators[1]), "b->a");
+ ASSERT_EQ(IterStatus(iterators[2]), "c->m");
+
+ ASSERT_OK(Put(1, "x", "x"));
+
+ for (auto it : iterators) {
+ it->Next();
+ }
+
+ ASSERT_EQ(IterStatus(iterators[0]), "(invalid)");
+ if (iter == 0) {
+ // no tailing
+ ASSERT_EQ(IterStatus(iterators[1]), "(invalid)");
+ } else {
+ // tailing
+ ASSERT_EQ(IterStatus(iterators[1]), "x->x");
+ }
+ ASSERT_EQ(IterStatus(iterators[2]), "v->t");
+
+ for (auto it : iterators) {
+ delete it;
+ }
+ Destroy();
+ }
+}
+#endif // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE // ReadOnlyDB is not supported
+TEST_P(ColumnFamilyTest, ReadOnlyDBTest) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});
+ ASSERT_OK(Put(0, "a", "b"));
+ ASSERT_OK(Put(1, "foo", "bla"));
+ ASSERT_OK(Put(2, "foo", "blabla"));
+ ASSERT_OK(Put(3, "foo", "blablabla"));
+ ASSERT_OK(Put(4, "foo", "blablablabla"));
+
+ DropColumnFamilies({2});
+ Close();
+ // open only a subset of column families
+ AssertOpenReadOnly({"default", "one", "four"});
+ ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
+ ASSERT_EQ("bla", Get(1, "foo"));
+ ASSERT_EQ("blablablabla", Get(2, "foo"));
+
+  // test NewIterators()
+ {
+ std::vector<Iterator*> iterators;
+ ASSERT_OK(db_->NewIterators(ReadOptions(), handles_, &iterators));
+ for (auto it : iterators) {
+ it->SeekToFirst();
+ }
+ ASSERT_EQ(IterStatus(iterators[0]), "a->b");
+ ASSERT_EQ(IterStatus(iterators[1]), "foo->bla");
+ ASSERT_EQ(IterStatus(iterators[2]), "foo->blablablabla");
+ for (auto it : iterators) {
+ it->Next();
+ }
+ ASSERT_EQ(IterStatus(iterators[0]), "(invalid)");
+ ASSERT_EQ(IterStatus(iterators[1]), "(invalid)");
+ ASSERT_EQ(IterStatus(iterators[2]), "(invalid)");
+
+ for (auto it : iterators) {
+ delete it;
+ }
+ }
+
+ Close();
+ // can't open dropped column family
+ Status s = OpenReadOnly({"default", "one", "two"});
+ ASSERT_TRUE(!s.ok());
+
+ // Can't open without specifying default column family
+ s = OpenReadOnly({"one", "four"});
+ ASSERT_TRUE(!s.ok());
+}
+#endif // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE // WaitForFlush() is not supported in lite
+TEST_P(ColumnFamilyTest, DontRollEmptyLogs) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});
+
+ for (size_t i = 0; i < handles_.size(); ++i) {
+ PutRandomData(static_cast<int>(i), 10, 100);
+ }
+ int num_writable_file_start = env_->GetNumberOfNewWritableFileCalls();
+ // this will trigger the flushes
+ for (int i = 0; i <= 4; ++i) {
+ ASSERT_OK(Flush(i));
+ }
+
+ for (int i = 0; i < 4; ++i) {
+ WaitForFlush(i);
+ }
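+  // Each flush writes one SST file per column family, but the WAL should be
+  // rolled only once: after the first flush switches to a fresh log, the
+  // remaining flushes find that log empty and reuse it, which is why we expect
+  // handles_.size() + 1 new writable files below.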
+ int total_new_writable_files =
+ env_->GetNumberOfNewWritableFileCalls() - num_writable_file_start;
+ ASSERT_EQ(static_cast<size_t>(total_new_writable_files), handles_.size() + 1);
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE // WaitForCompaction() is not supported in lite
+TEST_P(ColumnFamilyTest, FlushStaleColumnFamilies) {
+ Open();
+ CreateColumnFamilies({"one", "two"});
+ ColumnFamilyOptions default_cf, one, two;
+ default_cf.write_buffer_size = 100000; // small write buffer size
+ default_cf.arena_block_size = 4096;
+ default_cf.disable_auto_compactions = true;
+ one.disable_auto_compactions = true;
+ two.disable_auto_compactions = true;
+ db_options_.max_total_wal_size = 210000;
+
+ Reopen({default_cf, one, two});
+
+ PutRandomData(2, 1, 10); // 10 bytes
+ for (int i = 0; i < 2; ++i) {
+ PutRandomData(0, 100, 1000); // flush
+ WaitForFlush(0);
+
+ AssertCountLiveFiles(i + 1);
+ }
+ // third flush. now, CF [two] should be detected as stale and flushed
+ // column family 1 should not be flushed since it's empty
+ PutRandomData(0, 100, 1000); // flush
+ WaitForFlush(0);
+ WaitForFlush(2);
+  // 3 files for the default column family, 1 file for column family [two],
+  // zero files for column family [one], because it's empty
+ AssertCountLiveFiles(4);
+
+ Flush(0);
+ ASSERT_EQ(0, dbfull()->TEST_total_log_size());
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+TEST_P(ColumnFamilyTest, CreateMissingColumnFamilies) {
+ Status s = TryOpen({"one", "two"});
+ ASSERT_TRUE(!s.ok());
+ db_options_.create_missing_column_families = true;
+ s = TryOpen({"default", "one", "two"});
+ ASSERT_TRUE(s.ok());
+ Close();
+}
+
+TEST_P(ColumnFamilyTest, SanitizeOptions) {
+ DBOptions db_options;
+ for (int s = kCompactionStyleLevel; s <= kCompactionStyleUniversal; ++s) {
+ for (int l = 0; l <= 2; l++) {
+ for (int i = 1; i <= 3; i++) {
+ for (int j = 1; j <= 3; j++) {
+ for (int k = 1; k <= 3; k++) {
+ ColumnFamilyOptions original;
+ original.compaction_style = static_cast<CompactionStyle>(s);
+ original.num_levels = l;
+ original.level0_stop_writes_trigger = i;
+ original.level0_slowdown_writes_trigger = j;
+ original.level0_file_num_compaction_trigger = k;
+ original.write_buffer_size =
+ l * 4 * 1024 * 1024 + i * 1024 * 1024 + j * 1024 + k;
+
+ ColumnFamilyOptions result =
+ SanitizeOptions(ImmutableDBOptions(db_options), original);
+ ASSERT_TRUE(result.level0_stop_writes_trigger >=
+ result.level0_slowdown_writes_trigger);
+ ASSERT_TRUE(result.level0_slowdown_writes_trigger >=
+ result.level0_file_num_compaction_trigger);
+ ASSERT_TRUE(result.level0_file_num_compaction_trigger ==
+ original.level0_file_num_compaction_trigger);
+ if (s == kCompactionStyleLevel) {
+ ASSERT_GE(result.num_levels, 2);
+ } else {
+ ASSERT_GE(result.num_levels, 1);
+ if (original.num_levels >= 1) {
+ ASSERT_EQ(result.num_levels, original.num_levels);
+ }
+ }
+
+            // Make sure SanitizeOptions() sets arena_block_size to 1/8 of
+            // write_buffer_size, rounded up to a multiple of 4KB.
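+            // E.g. l=1, i=1, j=1, k=1 gives write_buffer_size = 5243905;
+            // 5243905 / 8 = 655488, which rounds up to the next multiple of
+            // 4096: 659456 = 512KB + 128KB + 4KB, matching the formula below.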
+ size_t expected_arena_block_size =
+ l * 4 * 1024 * 1024 / 8 + i * 1024 * 1024 / 8;
+ if (j + k != 0) {
+ // not a multiple of 4k, round up 4k
+ expected_arena_block_size += 4 * 1024;
+ }
+ ASSERT_EQ(expected_arena_block_size, result.arena_block_size);
+ }
+ }
+ }
+ }
+ }
+}
+
+TEST_P(ColumnFamilyTest, ReadDroppedColumnFamily) {
+ // iter 0 -- drop CF, don't reopen
+ // iter 1 -- delete CF, reopen
+ for (int iter = 0; iter < 2; ++iter) {
+ db_options_.create_missing_column_families = true;
+ db_options_.max_open_files = 20;
+ // delete obsolete files always
+ db_options_.delete_obsolete_files_period_micros = 0;
+ Open({"default", "one", "two"});
+ ColumnFamilyOptions options;
+ options.level0_file_num_compaction_trigger = 100;
+ options.level0_slowdown_writes_trigger = 200;
+ options.level0_stop_writes_trigger = 200;
+ options.write_buffer_size = 100000; // small write buffer size
+ Reopen({options, options, options});
+
+ // 1MB should create ~10 files for each CF
+ int kKeysNum = 10000;
+ PutRandomData(0, kKeysNum, 100);
+ PutRandomData(1, kKeysNum, 100);
+ PutRandomData(2, kKeysNum, 100);
+
+ {
+ std::unique_ptr<Iterator> iterator(
+ db_->NewIterator(ReadOptions(), handles_[2]));
+ iterator->SeekToFirst();
+
+ if (iter == 0) {
+ // Drop CF two
+ ASSERT_OK(db_->DropColumnFamily(handles_[2]));
+ } else {
+ // delete CF two
+ db_->DestroyColumnFamilyHandle(handles_[2]);
+ handles_[2] = nullptr;
+ }
+ // Make sure iterator created can still be used.
+ int count = 0;
+ for (; iterator->Valid(); iterator->Next()) {
+ ASSERT_OK(iterator->status());
+ ++count;
+ }
+ ASSERT_OK(iterator->status());
+ ASSERT_EQ(count, kKeysNum);
+ }
+
+ // Add bunch more data to other CFs
+ PutRandomData(0, kKeysNum, 100);
+ PutRandomData(1, kKeysNum, 100);
+
+ if (iter == 1) {
+ Reopen();
+ }
+
+ // Since we didn't delete CF handle, RocksDB's contract guarantees that
+ // we're still able to read dropped CF
+ for (int i = 0; i < 3; ++i) {
+ std::unique_ptr<Iterator> iterator(
+ db_->NewIterator(ReadOptions(), handles_[i]));
+ int count = 0;
+ for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) {
+ ASSERT_OK(iterator->status());
+ ++count;
+ }
+ ASSERT_OK(iterator->status());
+ ASSERT_EQ(count, kKeysNum * ((i == 2) ? 1 : 2));
+ }
+
+ Close();
+ Destroy();
+ }
+}
+
+TEST_P(ColumnFamilyTest, LiveIteratorWithDroppedColumnFamily) {
+ db_options_.create_missing_column_families = true;
+ db_options_.max_open_files = 20;
+ // delete obsolete files always
+ db_options_.delete_obsolete_files_period_micros = 0;
+ Open({"default", "one", "two"});
+ ColumnFamilyOptions options;
+ options.level0_file_num_compaction_trigger = 100;
+ options.level0_slowdown_writes_trigger = 200;
+ options.level0_stop_writes_trigger = 200;
+ options.write_buffer_size = 100000; // small write buffer size
+ Reopen({options, options, options});
+
+ // 1MB should create ~10 files for each CF
+ int kKeysNum = 10000;
+ PutRandomData(1, kKeysNum, 100);
+
+ {
+ std::unique_ptr<Iterator> iterator(
+ db_->NewIterator(ReadOptions(), handles_[1]));
+ iterator->SeekToFirst();
+
+ DropColumnFamilies({1});
+
+ // Make sure the iterator created before the drop can still be used.
+ int count = 0;
+ for (; iterator->Valid(); iterator->Next()) {
+ ASSERT_OK(iterator->status());
+ ++count;
+ }
+ ASSERT_OK(iterator->status());
+ ASSERT_EQ(count, kKeysNum);
+ }
+
+ Reopen();
+ Close();
+ Destroy();
+}
+
+TEST_P(ColumnFamilyTest, FlushAndDropRaceCondition) {
+ db_options_.create_missing_column_families = true;
+ Open({"default", "one"});
+ ColumnFamilyOptions options;
+ options.level0_file_num_compaction_trigger = 100;
+ options.level0_slowdown_writes_trigger = 200;
+ options.level0_stop_writes_trigger = 200;
+ options.max_write_buffer_number = 20;
+ options.write_buffer_size = 100000; // small write buffer size
+ Reopen({options, options});
+
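+ // Interleave the column family drop with the flush job: the flush cannot
+ // write its L0 table or install its results until the drop reaches the
+ // corresponding LogAndApply sync points, and the drop cannot finish until
+ // the flush has installed its results.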
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"VersionSet::LogAndApply::ColumnFamilyDrop:0",
+ "FlushJob::WriteLevel0Table"},
+ {"VersionSet::LogAndApply::ColumnFamilyDrop:1",
+ "FlushJob::InstallResults"},
+ {"FlushJob::InstallResults",
+ "VersionSet::LogAndApply::ColumnFamilyDrop:2"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ test::SleepingBackgroundTask sleeping_task;
+
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+ Env::Priority::HIGH);
+
+ // 1MB should create ~10 files for each CF
+ int kKeysNum = 10000;
+ PutRandomData(1, kKeysNum, 100);
+
+ std::vector<port::Thread> threads;
+ threads.emplace_back([&] { ASSERT_OK(db_->DropColumnFamily(handles_[1])); });
+
+ sleeping_task.WakeUp();
+ sleeping_task.WaitUntilDone();
+ sleeping_task.Reset();
+ // Now we sleep again. This is just so we're certain that the flush job
+ // finished.
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+ Env::Priority::HIGH);
+ sleeping_task.WakeUp();
+ sleeping_task.WaitUntilDone();
+
+ {
+ // Since we didn't delete the CF handle, RocksDB's contract guarantees that
+ // we're still able to read the dropped CF
+ std::unique_ptr<Iterator> iterator(
+ db_->NewIterator(ReadOptions(), handles_[1]));
+ int count = 0;
+ for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) {
+ ASSERT_OK(iterator->status());
+ ++count;
+ }
+ ASSERT_OK(iterator->status());
+ ASSERT_EQ(count, kKeysNum);
+ }
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ Close();
+ Destroy();
+}
+
+#ifndef ROCKSDB_LITE
+ // Skipped because persisting options is not supported in ROCKSDB_LITE
+namespace {
+std::atomic<int> test_stage(0);
+std::atomic<bool> ordered_by_writethread(false);
+const int kMainThreadStartPersistingOptionsFile = 1;
+const int kChildThreadFinishDroppingColumnFamily = 2;
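+// Runs in a child thread: wait until the main thread has started persisting
+// the options file (or until write-thread ordering has been detected), then
+// drop the given column family, delete its comparator, and advance the test
+// stage.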
+void DropSingleColumnFamily(ColumnFamilyTest* cf_test, int cf_id,
+ std::vector<Comparator*>* comparators) {
+ while (test_stage < kMainThreadStartPersistingOptionsFile &&
+ !ordered_by_writethread) {
+ Env::Default()->SleepForMicroseconds(100);
+ }
+ cf_test->DropColumnFamilies({cf_id});
+ if ((*comparators)[cf_id]) {
+ delete (*comparators)[cf_id];
+ (*comparators)[cf_id] = nullptr;
+ }
+ test_stage = kChildThreadFinishDroppingColumnFamily;
+}
+} // namespace
+
+TEST_P(ColumnFamilyTest, CreateAndDropRace) {
+ const int kCfCount = 5;
+ std::vector<ColumnFamilyOptions> cf_opts;
+ std::vector<Comparator*> comparators;
+ for (int i = 0; i < kCfCount; ++i) {
+ cf_opts.emplace_back();
+ comparators.push_back(new test::SimpleSuffixReverseComparator());
+ cf_opts.back().comparator = comparators.back();
+ }
+ db_options_.create_if_missing = true;
+ db_options_.create_missing_column_families = true;
+
+ auto main_thread_id = std::this_thread::get_id();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "PersistRocksDBOptions:start", [&](void* /*arg*/) {
+ auto current_thread_id = std::this_thread::get_id();
+ // If it's the main thread hitting this sync-point, then it
+ // will be blocked until some other thread updates the test_stage.
+ if (main_thread_id == current_thread_id) {
+ test_stage = kMainThreadStartPersistingOptionsFile;
+ while (test_stage < kChildThreadFinishDroppingColumnFamily &&
+ !ordered_by_writethread) {
+ Env::Default()->SleepForMicroseconds(100);
+ }
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::EnterUnbatched:Wait", [&](void* /*arg*/) {
+ // This means a thread doing DropColumnFamily() is waiting for
+ // another thread to finish persisting the options.
+ // In that case, we set ordered_by_writethread to unblock the main thread.
+ ordered_by_writethread = true;
+ });
+
+ // Create a database with four column families
+ Open({"default", "one", "two", "three"},
+ {cf_opts[0], cf_opts[1], cf_opts[2], cf_opts[3]});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Start a thread that will drop column family 1 ("one")
+ // and delete its comparator
+ ROCKSDB_NAMESPACE::port::Thread drop_cf_thread(DropSingleColumnFamily, this,
+ 1, &comparators);
+
+ DropColumnFamilies({2});
+
+ drop_cf_thread.join();
+ Close();
+ Destroy();
+ for (auto* comparator : comparators) {
+ if (comparator) {
+ delete comparator;
+ }
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif // !ROCKSDB_LITE
+
+TEST_P(ColumnFamilyTest, WriteStallSingleColumnFamily) {
+ const uint64_t kBaseRate = 800000u;
+ db_options_.delayed_write_rate = kBaseRate;
+ db_options_.max_background_compactions = 6;
+
+ Open({"default"});
+ ColumnFamilyData* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
+
+ VersionStorageInfo* vstorage = cfd->current()->storage_info();
+
+ MutableCFOptions mutable_cf_options(column_family_options_);
+
+ mutable_cf_options.level0_slowdown_writes_trigger = 20;
+ mutable_cf_options.level0_stop_writes_trigger = 10000;
+ mutable_cf_options.soft_pending_compaction_bytes_limit = 200;
+ mutable_cf_options.hard_pending_compaction_bytes_limit = 2000;
+ mutable_cf_options.disable_auto_compactions = false;
+
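+ // The assertions below check how the write controller reacts as the
+ // estimated pending compaction bytes move around the soft (200) and hard
+ // (2000) limits: moving further past the soft limit ratchets the delayed
+ // write rate down by a factor of 1.25 per step, and crossing the hard limit
+ // stops writes entirely.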
+ vstorage->TEST_set_estimated_compaction_needed_bytes(50);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(201);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(400);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(500);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(450);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(205);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(202);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(201);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(198);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(399);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(599);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(2001);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(3001);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(390);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(100);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ vstorage->set_l0_delay_trigger_count(100);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(101);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->set_l0_delay_trigger_count(0);
+ vstorage->TEST_set_estimated_compaction_needed_bytes(300);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->set_l0_delay_trigger_count(101);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25 / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(200);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->set_l0_delay_trigger_count(0);
+ vstorage->TEST_set_estimated_compaction_needed_bytes(0);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ mutable_cf_options.disable_auto_compactions = true;
+ dbfull()->TEST_write_controler().set_delayed_write_rate(kBaseRate);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ vstorage->set_l0_delay_trigger_count(50);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(0, GetDbDelayedWriteRate());
+ ASSERT_EQ(kBaseRate, dbfull()->TEST_write_controler().delayed_write_rate());
+
+ vstorage->set_l0_delay_trigger_count(60);
+ vstorage->TEST_set_estimated_compaction_needed_bytes(300);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(0, GetDbDelayedWriteRate());
+ ASSERT_EQ(kBaseRate, dbfull()->TEST_write_controler().delayed_write_rate());
+
+ mutable_cf_options.disable_auto_compactions = false;
+ vstorage->set_l0_delay_trigger_count(70);
+ vstorage->TEST_set_estimated_compaction_needed_bytes(500);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+ vstorage->set_l0_delay_trigger_count(71);
+ vstorage->TEST_set_estimated_compaction_needed_bytes(501);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+}
+
+TEST_P(ColumnFamilyTest, CompactionSpeedupSingleColumnFamily) {
+ db_options_.max_background_compactions = 6;
+ Open({"default"});
+ ColumnFamilyData* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
+
+ VersionStorageInfo* vstorage = cfd->current()->storage_info();
+
+ MutableCFOptions mutable_cf_options(column_family_options_);
+
+ // Speed up threshold = min(4 * 2, 4 + (36 - 4)/4) = 8
+ mutable_cf_options.level0_file_num_compaction_trigger = 4;
+ mutable_cf_options.level0_slowdown_writes_trigger = 36;
+ mutable_cf_options.level0_stop_writes_trigger = 50;
+ // Speedup threshold = 200 / 4 = 50
+ mutable_cf_options.soft_pending_compaction_bytes_limit = 200;
+ mutable_cf_options.hard_pending_compaction_bytes_limit = 2000;
+
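+ // Below both speedup thresholds only one background compaction is allowed;
+ // once either the pending-bytes or the L0-file-count threshold is crossed,
+ // all six configured background compactions are allowed.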
+ vstorage->TEST_set_estimated_compaction_needed_bytes(40);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(50);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(300);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(45);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(7);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(9);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(6);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ // Speed up threshold = min(4 * 2, 4 + (12 - 4)/4) = 6
+ mutable_cf_options.level0_file_num_compaction_trigger = 4;
+ mutable_cf_options.level0_slowdown_writes_trigger = 16;
+ mutable_cf_options.level0_stop_writes_trigger = 30;
+
+ vstorage->set_l0_delay_trigger_count(5);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(7);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(3);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+}
+
+TEST_P(ColumnFamilyTest, WriteStallTwoColumnFamilies) {
+ const uint64_t kBaseRate = 810000u;
+ db_options_.delayed_write_rate = kBaseRate;
+ Open();
+ CreateColumnFamilies({"one"});
+ ColumnFamilyData* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
+ VersionStorageInfo* vstorage = cfd->current()->storage_info();
+
+ ColumnFamilyData* cfd1 =
+ static_cast<ColumnFamilyHandleImpl*>(handles_[1])->cfd();
+ VersionStorageInfo* vstorage1 = cfd1->current()->storage_info();
+
+ MutableCFOptions mutable_cf_options(column_family_options_);
+ mutable_cf_options.level0_slowdown_writes_trigger = 20;
+ mutable_cf_options.level0_stop_writes_trigger = 10000;
+ mutable_cf_options.soft_pending_compaction_bytes_limit = 200;
+ mutable_cf_options.hard_pending_compaction_bytes_limit = 2000;
+
+ MutableCFOptions mutable_cf_options1 = mutable_cf_options;
+ mutable_cf_options1.soft_pending_compaction_bytes_limit = 500;
+
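+ // With two column families, the DB-wide delay condition reflects the most
+ // constrained column family: the delay stays in effect as long as either CF
+ // is past its soft pending compaction limit.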
+ vstorage->TEST_set_estimated_compaction_needed_bytes(50);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(201);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(600);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(70);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(800);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(300);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(700);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(500);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(600);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+}
+
+TEST_P(ColumnFamilyTest, CompactionSpeedupTwoColumnFamilies) {
+ db_options_.max_background_compactions = 6;
+ column_family_options_.soft_pending_compaction_bytes_limit = 200;
+ column_family_options_.hard_pending_compaction_bytes_limit = 2000;
+ Open();
+ CreateColumnFamilies({"one"});
+ ColumnFamilyData* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
+ VersionStorageInfo* vstorage = cfd->current()->storage_info();
+
+ ColumnFamilyData* cfd1 =
+ static_cast<ColumnFamilyHandleImpl*>(handles_[1])->cfd();
+ VersionStorageInfo* vstorage1 = cfd1->current()->storage_info();
+
+ MutableCFOptions mutable_cf_options(column_family_options_);
+ // Speed up threshold = min(4 * 2, 4 + (36 - 4)/4) = 8
+ mutable_cf_options.level0_file_num_compaction_trigger = 4;
+ mutable_cf_options.level0_slowdown_writes_trigger = 36;
+ mutable_cf_options.level0_stop_writes_trigger = 30;
+ // Speedup threshold = 200 / 4 = 50
+ mutable_cf_options.soft_pending_compaction_bytes_limit = 200;
+ mutable_cf_options.hard_pending_compaction_bytes_limit = 2000;
+
+ MutableCFOptions mutable_cf_options1 = mutable_cf_options;
+ mutable_cf_options1.level0_slowdown_writes_trigger = 16;
+
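+ // Compaction speedup is a DB-wide decision: if any column family is past
+ // its speedup threshold, six background compactions are allowed; once every
+ // CF is back below its thresholds, the count drops back to one.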
+ vstorage->TEST_set_estimated_compaction_needed_bytes(40);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(60);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(30);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(70);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(20);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(3);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(9);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage1->set_l0_delay_trigger_count(2);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(0);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+}
+
+TEST_P(ColumnFamilyTest, CreateAndDestoryOptions) {
+ std::unique_ptr<ColumnFamilyOptions> cfo(new ColumnFamilyOptions());
+ ColumnFamilyHandle* cfh;
+ Open();
+ ASSERT_OK(db_->CreateColumnFamily(*(cfo.get()), "yoyo", &cfh));
+ cfo.reset();
+ ASSERT_OK(db_->Put(WriteOptions(), cfh, "foo", "bar"));
+ ASSERT_OK(db_->Flush(FlushOptions(), cfh));
+ ASSERT_OK(db_->DropColumnFamily(cfh));
+ ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh));
+}
+
+TEST_P(ColumnFamilyTest, CreateDropAndDestroy) {
+ ColumnFamilyHandle* cfh;
+ Open();
+ ASSERT_OK(db_->CreateColumnFamily(ColumnFamilyOptions(), "yoyo", &cfh));
+ ASSERT_OK(db_->Put(WriteOptions(), cfh, "foo", "bar"));
+ ASSERT_OK(db_->Flush(FlushOptions(), cfh));
+ ASSERT_OK(db_->DropColumnFamily(cfh));
+ ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(ColumnFamilyTest, CreateDropAndDestroyWithoutFileDeletion) {
+ ColumnFamilyHandle* cfh;
+ Open();
+ ASSERT_OK(db_->CreateColumnFamily(ColumnFamilyOptions(), "yoyo", &cfh));
+ ASSERT_OK(db_->Put(WriteOptions(), cfh, "foo", "bar"));
+ ASSERT_OK(db_->Flush(FlushOptions(), cfh));
+ ASSERT_OK(db_->DisableFileDeletions());
+ ASSERT_OK(db_->DropColumnFamily(cfh));
+ ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh));
+}
+
+TEST_P(ColumnFamilyTest, FlushCloseWALFiles) {
+ SpecialEnv env(Env::Default());
+ db_options_.env = &env;
+ db_options_.max_background_flushes = 1;
+ column_family_options_.memtable_factory.reset(new SpecialSkipListFactory(2));
+ Open();
+ CreateColumnFamilies({"one"});
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_OK(Put(0, "fodor", "mirko"));
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+
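+ // TEST_SYNC_POINT("FlushCloseWALFiles:0") below will not return until the
+ // background flush job has finished, so the open WAL file count can be
+ // checked deterministically afterwards.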
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::BGWorkFlush:done", "FlushCloseWALFiles:0"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Block flush jobs from running
+ test::SleepingBackgroundTask sleeping_task;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+ Env::Priority::HIGH);
+
+ WriteOptions wo;
+ wo.sync = true;
+ ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko"));
+
+ ASSERT_EQ(2, env.num_open_wal_file_.load());
+
+ sleeping_task.WakeUp();
+ sleeping_task.WaitUntilDone();
+ TEST_SYNC_POINT("FlushCloseWALFiles:0");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(1, env.num_open_wal_file_.load());
+
+ Reopen();
+ ASSERT_EQ("mirko", Get(0, "fodor"));
+ ASSERT_EQ("mirko", Get(1, "fodor"));
+ db_options_.env = env_;
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE // WaitForFlush() is not supported
+TEST_P(ColumnFamilyTest, IteratorCloseWALFile1) {
+ SpecialEnv env(Env::Default());
+ db_options_.env = &env;
+ db_options_.max_background_flushes = 1;
+ column_family_options_.memtable_factory.reset(new SpecialSkipListFactory(2));
+ Open();
+ CreateColumnFamilies({"one"});
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ // Create an iterator holding the current super version.
+ Iterator* it = db_->NewIterator(ReadOptions(), handles_[1]);
+ // A flush will make `it` hold the last reference of its super version.
+ Flush(1);
+
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_OK(Put(0, "fodor", "mirko"));
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+
+ // Flush jobs will close previous WAL files after finishing. By blocking
+ // flush jobs from running, we trigger a condition where the iterator
+ // destructor should close the WAL files.
+ test::SleepingBackgroundTask sleeping_task;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+ Env::Priority::HIGH);
+
+ WriteOptions wo;
+ wo.sync = true;
+ ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko"));
+
+ ASSERT_EQ(2, env.num_open_wal_file_.load());
+ // Deleting the iterator releases its super version, triggering the closing
+ // of all obsolete files.
+ delete it;
+ ASSERT_EQ(1, env.num_open_wal_file_.load());
+
+ sleeping_task.WakeUp();
+ sleeping_task.WaitUntilDone();
+ WaitForFlush(1);
+
+ Reopen();
+ ASSERT_EQ("mirko", Get(0, "fodor"));
+ ASSERT_EQ("mirko", Get(1, "fodor"));
+ db_options_.env = env_;
+ Close();
+}
+
+TEST_P(ColumnFamilyTest, IteratorCloseWALFile2) {
+ SpecialEnv env(Env::Default());
+ // Allow both the flush job and the purge job to be scheduled.
+ env.SetBackgroundThreads(2, Env::HIGH);
+ db_options_.env = &env;
+ db_options_.max_background_flushes = 1;
+ column_family_options_.memtable_factory.reset(new SpecialSkipListFactory(2));
+ Open();
+ CreateColumnFamilies({"one"});
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ // Create an iterator holding the current super version.
+ ReadOptions ro;
+ ro.background_purge_on_iterator_cleanup = true;
+ Iterator* it = db_->NewIterator(ro, handles_[1]);
+ // A flush will make `it` hold the last reference of its super version.
+ Flush(1);
+
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_OK(Put(0, "fodor", "mirko"));
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+
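+ // Ordering enforced by the dependencies: the background purge job cannot
+ // start until the test reaches sync point 0, the test waits at sync point 1
+ // until the purge job has finished, and the background flush cannot start
+ // until the test reaches sync point 2.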
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"ColumnFamilyTest::IteratorCloseWALFile2:0",
+ "DBImpl::BGWorkPurge:start"},
+ {"ColumnFamilyTest::IteratorCloseWALFile2:2",
+ "DBImpl::BackgroundCallFlush:start"},
+ {"DBImpl::BGWorkPurge:end", "ColumnFamilyTest::IteratorCloseWALFile2:1"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions wo;
+ wo.sync = true;
+ ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko"));
+
+ ASSERT_EQ(2, env.num_open_wal_file_.load());
+ // Deleting the iterator releases its super version, but with
+ // background_purge_on_iterator_cleanup the cleanup is deferred to a
+ // background purge job, so the old WAL file stays open for now.
+ delete it;
+ ASSERT_EQ(2, env.num_open_wal_file_.load());
+
+ TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:0");
+ TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:1");
+ ASSERT_EQ(1, env.num_open_wal_file_.load());
+ TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:2");
+ WaitForFlush(1);
+ ASSERT_EQ(1, env.num_open_wal_file_.load());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ Reopen();
+ ASSERT_EQ("mirko", Get(0, "fodor"));
+ ASSERT_EQ("mirko", Get(1, "fodor"));
+ db_options_.env = env_;
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE // TEST functions are not supported in lite
+TEST_P(ColumnFamilyTest, ForwardIteratorCloseWALFile) {
+ SpecialEnv env(Env::Default());
+ // Allow both the flush job and the purge job to be scheduled.
+ env.SetBackgroundThreads(2, Env::HIGH);
+ db_options_.env = &env;
+ db_options_.max_background_flushes = 1;
+ column_family_options_.memtable_factory.reset(new SpecialSkipListFactory(3));
+ column_family_options_.level0_file_num_compaction_trigger = 2;
+ Open();
+ CreateColumnFamilies({"one"});
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_OK(Put(1, "fodar2", "mirko"));
+ Flush(1);
+
+ // Create an iterator holding the current super version, as well as
+ // the SST file just flushed.
+ ReadOptions ro;
+ ro.tailing = true;
+ ro.background_purge_on_iterator_cleanup = true;
+ Iterator* it = db_->NewIterator(ro, handles_[1]);
+ // A flush will make `it` hold the last reference of its super version.
+
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_OK(Put(1, "fodar2", "mirko"));
+ Flush(1);
+
+ WaitForCompaction();
+
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_OK(Put(0, "fodor", "mirko"));
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+
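+ // This test reuses the IteratorCloseWALFile2 sync-point names to impose the
+ // same ordering: purge starts only after point 0, point 1 waits for the
+ // purge to finish, and the background flush starts only after point 2.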
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"ColumnFamilyTest::IteratorCloseWALFile2:0",
+ "DBImpl::BGWorkPurge:start"},
+ {"ColumnFamilyTest::IteratorCloseWALFile2:2",
+ "DBImpl::BackgroundCallFlush:start"},
+ {"DBImpl::BGWorkPurge:end", "ColumnFamilyTest::IteratorCloseWALFile2:1"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions wo;
+ wo.sync = true;
+ ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko"));
+
+ env.delete_count_.store(0);
+ ASSERT_EQ(2, env.num_open_wal_file_.load());
+ // Seeking the tailing iterator refreshes its super version; with background
+ // purge enabled the old super version is handed to a background job, so no
+ // WAL file is closed and no file is deleted yet.
+ it->Seek("");
+ ASSERT_EQ(2, env.num_open_wal_file_.load());
+ ASSERT_EQ(0, env.delete_count_.load());
+
+ TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:0");
+ TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:1");
+ ASSERT_EQ(1, env.num_open_wal_file_.load());
+ ASSERT_EQ(1, env.delete_count_.load());
+ TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:2");
+ WaitForFlush(1);
+ ASSERT_EQ(1, env.num_open_wal_file_.load());
+ ASSERT_EQ(1, env.delete_count_.load());
+
+ delete it;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ Reopen();
+ ASSERT_EQ("mirko", Get(0, "fodor"));
+ ASSERT_EQ("mirko", Get(1, "fodor"));
+ db_options_.env = env_;
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+ // Disabled on Windows because SyncWAL requires env->IsSyncThreadSafe()
+ // to return true, which is not the case in unbuffered mode.
+#ifndef OS_WIN
+TEST_P(ColumnFamilyTest, LogSyncConflictFlush) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two"});
+
+ Put(0, "", "");
+ Put(1, "foo", "bar");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::SyncWAL:BeforeMarkLogsSynced:1",
+ "ColumnFamilyTest::LogSyncConflictFlush:1"},
+ {"ColumnFamilyTest::LogSyncConflictFlush:2",
+ "DBImpl::SyncWAL:BeforeMarkLogsSynced:2"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread thread([&] { db_->SyncWAL(); });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::LogSyncConflictFlush:1");
+ Flush(1);
+ Put(1, "foo", "bar");
+ Flush(1);
+
+ TEST_SYNC_POINT("ColumnFamilyTest::LogSyncConflictFlush:2");
+
+ thread.join();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ Close();
+}
+#endif
+
+ // This test is placed here because the Column Family test infrastructure is
+ // used to ensure a roll of WAL files.
+ // The basic idea is to test that WAL truncation is detected and not ignored.
+TEST_P(ColumnFamilyTest, DISABLED_LogTruncationTest) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two"});
+
+ Build(0, 100);
+
+ // Flush the 0th column family to force a roll of the WAL
+ Flush(0);
+
+ // Add some more entries
+ Build(100, 100);
+
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+
+ // collect wal files
+ std::vector<std::string> logfs;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ uint64_t number;
+ FileType type;
+ if (!(ParseFileName(filenames[i], &number, &type))) continue;
+
+ if (type != kLogFile) continue;
+
+ logfs.push_back(filenames[i]);
+ }
+
+ std::sort(logfs.begin(), logfs.end());
+ ASSERT_GE(logfs.size(), 2);
+
+ // Take the second-to-last WAL file and truncate it
+ std::string fpath = dbname_ + "/" + logfs[logfs.size() - 2];
+ std::vector<std::string> names_save = names_;
+
+ uint64_t fsize;
+ ASSERT_OK(env_->GetFileSize(fpath, &fsize));
+ ASSERT_GT(fsize, 0);
+
+ Close();
+
+ std::string backup_logs = dbname_ + "/backup_logs";
+ std::string t_fpath = backup_logs + "/" + logfs[logfs.size() - 2];
+
+ ASSERT_OK(env_->CreateDirIfMissing(backup_logs));
+ // Not sure how easy it is to make this data-driven; we need to read back
+ // the WAL file and truncate the last 10 entries.
+ CopyFile(fpath, t_fpath, fsize - 9180);
+
+ ASSERT_OK(env_->DeleteFile(fpath));
+ ASSERT_OK(env_->RenameFile(t_fpath, fpath));
+
+ db_options_.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+
+ OpenReadOnly(names_save);
+
+ CheckMissed();
+
+ Close();
+
+ Open(names_save);
+
+ CheckMissed();
+
+ Close();
+
+ // cleanup
+ env_->DeleteDir(backup_logs);
+}
+
+TEST_P(ColumnFamilyTest, DefaultCfPathsTest) {
+ Open();
+ // Leave cf_paths empty for one of the column families.
+ // Files should be generated according to db_paths for that
+ // column family.
+ ColumnFamilyOptions cf_opt1, cf_opt2;
+ cf_opt1.cf_paths.emplace_back(dbname_ + "_one_1",
+ std::numeric_limits<uint64_t>::max());
+ CreateColumnFamilies({"one", "two"}, {cf_opt1, cf_opt2});
+ Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2});
+
+ // Fill Column family 1.
+ PutRandomData(1, 100, 100);
+ Flush(1);
+
+ ASSERT_EQ(1, GetSstFileCount(cf_opt1.cf_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // Fill column family 2
+ PutRandomData(2, 100, 100);
+ Flush(2);
+
+ // The SST file from column family 2 should be generated under
+ // db_paths, which is dbname_ in this case.
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+}
+
+TEST_P(ColumnFamilyTest, MultipleCFPathsTest) {
+ Open();
+ // Configure column-family-specific paths.
+ ColumnFamilyOptions cf_opt1, cf_opt2;
+ cf_opt1.cf_paths.emplace_back(dbname_ + "_one_1",
+ std::numeric_limits<uint64_t>::max());
+ cf_opt2.cf_paths.emplace_back(dbname_ + "_two_1",
+ std::numeric_limits<uint64_t>::max());
+ CreateColumnFamilies({"one", "two"}, {cf_opt1, cf_opt2});
+ Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2});
+
+ PutRandomData(1, 100, 100, true /* save */);
+ Flush(1);
+
+ // Check that files are generated in appropriate paths.
+ ASSERT_EQ(1, GetSstFileCount(cf_opt1.cf_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ PutRandomData(2, 100, 100, true /* save */);
+ Flush(2);
+
+ ASSERT_EQ(1, GetSstFileCount(cf_opt2.cf_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // Re-open and verify the keys.
+ Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2});
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ for (int cf = 1; cf != 3; ++cf) {
+ ReadOptions read_options;
+ read_options.readahead_size = 0;
+ auto it = dbi->NewIterator(read_options, handles_[cf]);
+ for (it->SeekToFirst(); it->Valid(); it->Next()) {
+ Slice key(it->key());
+ ASSERT_NE(keys_[cf].end(), keys_[cf].find(key.ToString()));
+ }
+ delete it;
+
+ for (const auto& key : keys_[cf]) {
+ ASSERT_NE("NOT_FOUND", Get(cf, key));
+ }
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+extern "C" {
+void RegisterCustomObjects(int argc, char** argv);
+}
+#else
+void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {}
+#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compact_files_test.cc b/src/rocksdb/db/compact_files_test.cc
new file mode 100644
index 000000000..948ada675
--- /dev/null
+++ b/src/rocksdb/db/compact_files_test.cc
@@ -0,0 +1,421 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CompactFilesTest : public testing::Test {
+ public:
+ CompactFilesTest() {
+ env_ = Env::Default();
+ db_name_ = test::PerThreadDBPath("compact_files_test");
+ }
+
+ std::string db_name_;
+ Env* env_;
+};
+
+// A class which remembers the name of each flushed file.
+class FlushedFileCollector : public EventListener {
+ public:
+ FlushedFileCollector() {}
+ ~FlushedFileCollector() override {}
+
+ void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+ std::lock_guard<std::mutex> lock(mutex_);
+ flushed_files_.push_back(info.file_path);
+ }
+
+ std::vector<std::string> GetFlushedFiles() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ std::vector<std::string> result;
+ for (auto fname : flushed_files_) {
+ result.push_back(fname);
+ }
+ return result;
+ }
+ void ClearFlushedFiles() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ flushed_files_.clear();
+ }
+
+ private:
+ std::vector<std::string> flushed_files_;
+ std::mutex mutex_;
+};
+
+TEST_F(CompactFilesTest, L0ConflictsFiles) {
+ Options options;
+ // to trigger compaction more easily
+ const int kWriteBufferSize = 10000;
+ const int kLevel0Trigger = 2;
+ options.create_if_missing = true;
+ options.compaction_style = kCompactionStyleLevel;
+ // Small slowdown and stop triggers for experimental purposes.
+ options.level0_slowdown_writes_trigger = 20;
+ options.level0_stop_writes_trigger = 20;
+ options.write_buffer_size = kWriteBufferSize;
+ options.level0_file_num_compaction_trigger = kLevel0Trigger;
+ options.compression = kNoCompression;
+
+ DB* db = nullptr;
+ DestroyDB(db_name_, options);
+ Status s = DB::Open(options, db_name_, &db);
+ assert(s.ok());
+ assert(db);
+
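+ // Ordering: the background compaction cannot start until CompactFilesImpl
+ // reaches sync point 0, and CompactFilesImpl waits at sync point 1 until
+ // the background compaction has passed its own sync point 1.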
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"CompactFilesImpl:0", "BackgroundCallCompaction:0"},
+ {"BackgroundCallCompaction:1", "CompactFilesImpl:1"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Create a couple of files.
+ // The background compaction starts and waits at BackgroundCallCompaction:0.
+ for (int i = 0; i < kLevel0Trigger * 4; ++i) {
+ db->Put(WriteOptions(), ToString(i), "");
+ db->Put(WriteOptions(), ToString(100 - i), "");
+ db->Flush(FlushOptions());
+ }
+
+ ROCKSDB_NAMESPACE::ColumnFamilyMetaData meta;
+ db->GetColumnFamilyMetaData(&meta);
+ std::string file1;
+ for (auto& file : meta.levels[0].files) {
+ ASSERT_EQ(0, meta.levels[0].level);
+ if (file1 == "") {
+ file1 = file.db_path + "/" + file.name;
+ } else {
+ std::string file2 = file.db_path + "/" + file.name;
+ // Another thread starts a CompactFiles call and creates an L0 compaction.
+ // The background compaction then notices that an L0 compaction is already
+ // in progress and does not do an L0 compaction itself.
+ // Once the background compaction finishes, the CompactFiles call finishes.
+ ASSERT_OK(db->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(),
+ {file1, file2}, 0));
+ break;
+ }
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ delete db;
+}
+
+TEST_F(CompactFilesTest, ObsoleteFiles) {
+ Options options;
+ // to trigger compaction more easily
+ const int kWriteBufferSize = 65536;
+ options.create_if_missing = true;
+ // Disable RocksDB background compaction.
+ options.compaction_style = kCompactionStyleNone;
+ options.level0_slowdown_writes_trigger = (1 << 30);
+ options.level0_stop_writes_trigger = (1 << 30);
+ options.write_buffer_size = kWriteBufferSize;
+ options.max_write_buffer_number = 2;
+ options.compression = kNoCompression;
+
+ // Add listener
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ DB* db = nullptr;
+ DestroyDB(db_name_, options);
+ Status s = DB::Open(options, db_name_, &db);
+ assert(s.ok());
+ assert(db);
+
+ // Create a couple of files.
+ for (int i = 1000; i < 2000; ++i) {
+ db->Put(WriteOptions(), ToString(i),
+ std::string(kWriteBufferSize / 10, 'a' + (i % 26)));
+ }
+
+ auto l0_files = collector->GetFlushedFiles();
+ ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files, 1));
+ reinterpret_cast<DBImpl*>(db)->TEST_WaitForCompact();
+
+ // Verify that all compaction input files are deleted.
+ for (auto fname : l0_files) {
+ ASSERT_EQ(Status::NotFound(), env_->FileExists(fname));
+ }
+ delete db;
+}
+
+TEST_F(CompactFilesTest, NotCutOutputOnLevel0) {
+ Options options;
+ options.create_if_missing = true;
+ // Disable RocksDB background compaction.
+ options.compaction_style = kCompactionStyleNone;
+ options.level0_slowdown_writes_trigger = 1000;
+ options.level0_stop_writes_trigger = 1000;
+ options.write_buffer_size = 65536;
+ options.max_write_buffer_number = 2;
+ options.compression = kNoCompression;
+ options.max_compaction_bytes = 5000;
+
+ // Add listener
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ DB* db = nullptr;
+ DestroyDB(db_name_, options);
+ Status s = DB::Open(options, db_name_, &db);
+ assert(s.ok());
+ assert(db);
+
+ // Create a couple of files.
+ for (int i = 0; i < 500; ++i) {
+ db->Put(WriteOptions(), ToString(i), std::string(1000, 'a' + (i % 26)));
+ }
+ reinterpret_cast<DBImpl*>(db)->TEST_WaitForFlushMemTable();
+ auto l0_files_1 = collector->GetFlushedFiles();
+ collector->ClearFlushedFiles();
+ for (int i = 0; i < 500; ++i) {
+ db->Put(WriteOptions(), ToString(i), std::string(1000, 'a' + (i % 26)));
+ }
+ reinterpret_cast<DBImpl*>(db)->TEST_WaitForFlushMemTable();
+ auto l0_files_2 = collector->GetFlushedFiles();
+ ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files_1, 0));
+ ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files_2, 0));
+ // no assertion failure
+ delete db;
+}
+
+TEST_F(CompactFilesTest, CapturingPendingFiles) {
+ Options options;
+ options.create_if_missing = true;
+ // Disable RocksDB background compaction.
+ options.compaction_style = kCompactionStyleNone;
+ // Always do full scans for obsolete files (needed to reproduce the issue).
+ options.delete_obsolete_files_period_micros = 0;
+
+ // Add listener.
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ DB* db = nullptr;
+ DestroyDB(db_name_, options);
+ Status s = DB::Open(options, db_name_, &db);
+ assert(s.ok());
+ assert(db);
+
+ // Create 5 files.
+ for (int i = 0; i < 5; ++i) {
+ db->Put(WriteOptions(), "key" + ToString(i), "value");
+ db->Flush(FlushOptions());
+ }
+
+ auto l0_files = collector->GetFlushedFiles();
+ EXPECT_EQ(5, l0_files.size());
+
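+ // The flush issued between the two test sync points happens while
+ // CompactFiles is paused between CompactFilesImpl:2 and CompactFilesImpl:3,
+ // so its output file must be captured as pending and survive the full
+ // obsolete-file scans enabled above.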
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"CompactFilesImpl:2", "CompactFilesTest.CapturingPendingFiles:0"},
+ {"CompactFilesTest.CapturingPendingFiles:1", "CompactFilesImpl:3"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Start compacting files.
+ ROCKSDB_NAMESPACE::port::Thread compaction_thread(
+ [&] { EXPECT_OK(db->CompactFiles(CompactionOptions(), l0_files, 1)); });
+
+ // In the meantime flush another file.
+ TEST_SYNC_POINT("CompactFilesTest.CapturingPendingFiles:0");
+ db->Put(WriteOptions(), "key5", "value");
+ db->Flush(FlushOptions());
+ TEST_SYNC_POINT("CompactFilesTest.CapturingPendingFiles:1");
+
+ compaction_thread.join();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ delete db;
+
+ // Make sure we can reopen the DB.
+ s = DB::Open(options, db_name_, &db);
+ ASSERT_TRUE(s.ok());
+ assert(db);
+ delete db;
+}
+
+TEST_F(CompactFilesTest, CompactionFilterWithGetSv) {
+ class FilterWithGet : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ if (db_ == nullptr) {
+ return true;
+ }
+ std::string res;
+ db_->Get(ReadOptions(), "", &res);
+ return true;
+ }
+
+ void SetDB(DB* db) {
+ db_ = db;
+ }
+
+ const char* Name() const override { return "FilterWithGet"; }
+
+ private:
+ DB* db_;
+ };
+
+
+ std::shared_ptr<FilterWithGet> cf(new FilterWithGet());
+
+ Options options;
+ options.create_if_missing = true;
+ options.compaction_filter = cf.get();
+
+ DB* db = nullptr;
+ DestroyDB(db_name_, options);
+ Status s = DB::Open(options, db_name_, &db);
+ ASSERT_OK(s);
+
+ cf->SetDB(db);
+
+ // Write one L0 file
+ db->Put(WriteOptions(), "K1", "V1");
+ db->Flush(FlushOptions());
+
+ // Compact all L0 files using CompactFiles
+ ROCKSDB_NAMESPACE::ColumnFamilyMetaData meta;
+ db->GetColumnFamilyMetaData(&meta);
+ for (auto& file : meta.levels[0].files) {
+ std::string fname = file.db_path + "/" + file.name;
+ ASSERT_OK(
+ db->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), {fname}, 0));
+ }
+
+
+ delete db;
+}
+
+TEST_F(CompactFilesTest, SentinelCompressionType) {
+ if (!Zlib_Supported()) {
+ fprintf(stderr, "zlib compression not supported, skip this test\n");
+ return;
+ }
+ if (!Snappy_Supported()) {
+ fprintf(stderr, "snappy compression not supported, skip this test\n");
+ return;
+ }
+ // Check that passing `CompressionType::kDisableCompressionOption` to
+ // `CompactFiles` causes it to use the column family compression options.
+ for (auto compaction_style :
+ {CompactionStyle::kCompactionStyleLevel,
+ CompactionStyle::kCompactionStyleUniversal,
+ CompactionStyle::kCompactionStyleNone}) {
+ DestroyDB(db_name_, Options());
+ Options options;
+ options.compaction_style = compaction_style;
+ // L0: Snappy, L1: Zlib, L2: Snappy
+ options.compression_per_level = {CompressionType::kSnappyCompression,
+ CompressionType::kZlibCompression,
+ CompressionType::kSnappyCompression};
+ options.create_if_missing = true;
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+ DB* db = nullptr;
+ ASSERT_OK(DB::Open(options, db_name_, &db));
+
+ db->Put(WriteOptions(), "key", "val");
+ db->Flush(FlushOptions());
+
+ auto l0_files = collector->GetFlushedFiles();
+ ASSERT_EQ(1, l0_files.size());
+
+ // L0->L1 compaction, so output should be Zlib-compressed
+ CompactionOptions compaction_opts;
+ compaction_opts.compression = CompressionType::kDisableCompressionOption;
+ ASSERT_OK(db->CompactFiles(compaction_opts, l0_files, 1));
+
+ ROCKSDB_NAMESPACE::TablePropertiesCollection all_tables_props;
+ ASSERT_OK(db->GetPropertiesOfAllTables(&all_tables_props));
+ for (const auto& name_and_table_props : all_tables_props) {
+ ASSERT_EQ(CompressionTypeToString(CompressionType::kZlibCompression),
+ name_and_table_props.second->compression_name);
+ }
+ delete db;
+ }
+}
+
+TEST_F(CompactFilesTest, GetCompactionJobInfo) {
+ Options options;
+ options.create_if_missing = true;
+ // Disable RocksDB background compaction.
+ options.compaction_style = kCompactionStyleNone;
+ options.level0_slowdown_writes_trigger = 1000;
+ options.level0_stop_writes_trigger = 1000;
+ options.write_buffer_size = 65536;
+ options.max_write_buffer_number = 2;
+ options.compression = kNoCompression;
+ options.max_compaction_bytes = 5000;
+
+ // Add listener
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ DB* db = nullptr;
+ DestroyDB(db_name_, options);
+ Status s = DB::Open(options, db_name_, &db);
+ assert(s.ok());
+ assert(db);
+
+ // Create a couple of files.
+ for (int i = 0; i < 500; ++i) {
+ db->Put(WriteOptions(), ToString(i), std::string(1000, 'a' + (i % 26)));
+ }
+ reinterpret_cast<DBImpl*>(db)->TEST_WaitForFlushMemTable();
+ auto l0_files_1 = collector->GetFlushedFiles();
+ CompactionOptions co;
+ co.compression = CompressionType::kLZ4Compression;
+ CompactionJobInfo compaction_job_info{};
+ ASSERT_OK(
+ db->CompactFiles(co, l0_files_1, 0, -1, nullptr, &compaction_job_info));
+ ASSERT_EQ(compaction_job_info.base_input_level, 0);
+ ASSERT_EQ(compaction_job_info.cf_id, db->DefaultColumnFamily()->GetID());
+ ASSERT_EQ(compaction_job_info.cf_name, db->DefaultColumnFamily()->GetName());
+ ASSERT_EQ(compaction_job_info.compaction_reason,
+ CompactionReason::kManualCompaction);
+ ASSERT_EQ(compaction_job_info.compression, CompressionType::kLZ4Compression);
+ ASSERT_EQ(compaction_job_info.output_level, 0);
+ ASSERT_OK(compaction_job_info.status);
+ // no assertion failure
+ delete db;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as DBImpl::CompactFiles is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compacted_db_impl.cc b/src/rocksdb/db/compacted_db_impl.cc
new file mode 100644
index 000000000..47d6ecced
--- /dev/null
+++ b/src/rocksdb/db/compacted_db_impl.cc
@@ -0,0 +1,160 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include "db/compacted_db_impl.h"
+#include "db/db_impl/db_impl.h"
+#include "db/version_set.h"
+#include "table/get_context.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+extern void MarkKeyMayExist(void* arg);
+extern bool SaveValue(void* arg, const ParsedInternalKey& parsed_key,
+ const Slice& v, bool hit_and_return);
+
+CompactedDBImpl::CompactedDBImpl(
+ const DBOptions& options, const std::string& dbname)
+ : DBImpl(options, dbname), cfd_(nullptr), version_(nullptr),
+ user_comparator_(nullptr) {
+}
+
+CompactedDBImpl::~CompactedDBImpl() {
+}
+
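+// Binary-search the sorted, non-overlapping files by largest user key and
+// return the index of the first file whose largest key is not less than
+// `key`; the last file is returned if `key` is greater than every other
+// file's largest key.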
+size_t CompactedDBImpl::FindFile(const Slice& key) {
+ size_t right = files_.num_files - 1;
+ auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool {
+ return user_comparator_->Compare(ExtractUserKey(f.largest_key), k) < 0;
+ };
+ return static_cast<size_t>(std::lower_bound(files_.files,
+ files_.files + right, key, cmp) - files_.files);
+}
+
+Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*,
+ const Slice& key, PinnableSlice* value) {
+ GetContext get_context(user_comparator_, nullptr, nullptr, nullptr,
+ GetContext::kNotFound, key, value, nullptr, nullptr,
+ true, nullptr, nullptr);
+ LookupKey lkey(key, kMaxSequenceNumber);
+ files_.files[FindFile(key)].fd.table_reader->Get(options, lkey.internal_key(),
+ &get_context, nullptr);
+ if (get_context.State() == GetContext::kFound) {
+ return Status::OK();
+ }
+ return Status::NotFound();
+}
+
+std::vector<Status> CompactedDBImpl::MultiGet(const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>&,
+ const std::vector<Slice>& keys, std::vector<std::string>* values) {
+ autovector<TableReader*, 16> reader_list;
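+ // First pass: for each key, locate its candidate file. If the key sorts
+ // below that file's smallest key it cannot exist, so remember a null
+ // reader; otherwise prepare the table reader for the lookup in the second
+ // pass.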
+ for (const auto& key : keys) {
+ const FdWithKeyRange& f = files_.files[FindFile(key)];
+ if (user_comparator_->Compare(key, ExtractUserKey(f.smallest_key)) < 0) {
+ reader_list.push_back(nullptr);
+ } else {
+ LookupKey lkey(key, kMaxSequenceNumber);
+ f.fd.table_reader->Prepare(lkey.internal_key());
+ reader_list.push_back(f.fd.table_reader);
+ }
+ }
+ std::vector<Status> statuses(keys.size(), Status::NotFound());
+ values->resize(keys.size());
+ int idx = 0;
+ for (auto* r : reader_list) {
+ if (r != nullptr) {
+ PinnableSlice pinnable_val;
+ std::string& value = (*values)[idx];
+ GetContext get_context(user_comparator_, nullptr, nullptr, nullptr,
+ GetContext::kNotFound, keys[idx], &pinnable_val,
+ nullptr, nullptr, true, nullptr, nullptr);
+ LookupKey lkey(keys[idx], kMaxSequenceNumber);
+ r->Get(options, lkey.internal_key(), &get_context, nullptr);
+ value.assign(pinnable_val.data(), pinnable_val.size());
+ if (get_context.State() == GetContext::kFound) {
+ statuses[idx] = Status::OK();
+ }
+ }
+ ++idx;
+ }
+ return statuses;
+}
+
+Status CompactedDBImpl::Init(const Options& options) {
+ SuperVersionContext sv_context(/* create_superversion */ true);
+ mutex_.Lock();
+ ColumnFamilyDescriptor cf(kDefaultColumnFamilyName,
+ ColumnFamilyOptions(options));
+ Status s = Recover({cf}, true /* read only */, false, true);
+ if (s.ok()) {
+ cfd_ = reinterpret_cast<ColumnFamilyHandleImpl*>(
+ DefaultColumnFamily())->cfd();
+ cfd_->InstallSuperVersion(&sv_context, &mutex_);
+ }
+ mutex_.Unlock();
+ sv_context.Clean();
+ if (!s.ok()) {
+ return s;
+ }
+ NewThreadStatusCfInfo(cfd_);
+ version_ = cfd_->GetSuperVersion()->current;
+ user_comparator_ = cfd_->user_comparator();
+ auto* vstorage = version_->storage_info();
+ if (vstorage->num_non_empty_levels() == 0) {
+ return Status::NotSupported("no file exists");
+ }
+ const LevelFilesBrief& l0 = vstorage->LevelFilesBrief(0);
+ // In fully compacted mode, L0 may contain at most one file.
+ if (l0.num_files > 1) {
+ return Status::NotSupported("L0 contain more than 1 file");
+ }
+ if (l0.num_files == 1) {
+ if (vstorage->num_non_empty_levels() > 1) {
+ return Status::NotSupported("Both L0 and other level contain files");
+ }
+ files_ = l0;
+ return Status::OK();
+ }
+
+ for (int i = 1; i < vstorage->num_non_empty_levels() - 1; ++i) {
+ if (vstorage->LevelFilesBrief(i).num_files > 0) {
+ return Status::NotSupported("Other levels also contain files");
+ }
+ }
+
+ int level = vstorage->num_non_empty_levels() - 1;
+ if (vstorage->LevelFilesBrief(level).num_files > 0) {
+ files_ = vstorage->LevelFilesBrief(level);
+ return Status::OK();
+ }
+ return Status::NotSupported("no file exists");
+}
+
+Status CompactedDBImpl::Open(const Options& options,
+ const std::string& dbname, DB** dbptr) {
+ *dbptr = nullptr;
+
+ if (options.max_open_files != -1) {
+ return Status::InvalidArgument("require max_open_files = -1");
+ }
+ if (options.merge_operator.get() != nullptr) {
+ return Status::InvalidArgument("merge operator is not supported");
+ }
+ DBOptions db_options(options);
+ std::unique_ptr<CompactedDBImpl> db(new CompactedDBImpl(db_options, dbname));
+ Status s = db->Init(options);
+ if (s.ok()) {
+ db->StartTimedTasks();
+ ROCKS_LOG_INFO(db->immutable_db_options_.info_log,
+ "Opened the db as fully compacted mode");
+ LogFlush(db->immutable_db_options_.info_log);
+ *dbptr = db.release();
+ }
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/compacted_db_impl.h b/src/rocksdb/db/compacted_db_impl.h
new file mode 100644
index 000000000..7099566fc
--- /dev/null
+++ b/src/rocksdb/db/compacted_db_impl.h
@@ -0,0 +1,113 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+#include <string>
+#include <vector>
+#include "db/db_impl/db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CompactedDBImpl : public DBImpl {
+ public:
+ CompactedDBImpl(const DBOptions& options, const std::string& dbname);
+ // No copying allowed
+ CompactedDBImpl(const CompactedDBImpl&) = delete;
+ void operator=(const CompactedDBImpl&) = delete;
+
+ virtual ~CompactedDBImpl();
+
+ static Status Open(const Options& options, const std::string& dbname,
+ DB** dbptr);
+
+ // Implementations of the DB interface
+ using DB::Get;
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) override;
+ using DB::MultiGet;
+ virtual std::vector<Status> MultiGet(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>&,
+ const std::vector<Slice>& keys, std::vector<std::string>* values)
+ override;
+
+ using DBImpl::Put;
+ virtual Status Put(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/, const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+ using DBImpl::Merge;
+ virtual Status Merge(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/, const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+ using DBImpl::Delete;
+ virtual Status Delete(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+ virtual Status Write(const WriteOptions& /*options*/,
+ WriteBatch* /*updates*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+ using DBImpl::CompactRange;
+ virtual Status CompactRange(const CompactRangeOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice* /*begin*/,
+ const Slice* /*end*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+
+ virtual Status DisableFileDeletions() override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+ virtual Status EnableFileDeletions(bool /*force*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+ virtual Status GetLiveFiles(std::vector<std::string>& ret,
+ uint64_t* manifest_file_size,
+ bool /*flush_memtable*/) override {
+ return DBImpl::GetLiveFiles(ret, manifest_file_size,
+ false /* flush_memtable */);
+ }
+ using DBImpl::Flush;
+ virtual Status Flush(const FlushOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+ using DB::IngestExternalFile;
+ virtual Status IngestExternalFile(
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*external_files*/,
+ const IngestExternalFileOptions& /*ingestion_options*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+ using DB::CreateColumnFamilyWithImport;
+ virtual Status CreateColumnFamilyWithImport(
+ const ColumnFamilyOptions& /*options*/,
+ const std::string& /*column_family_name*/,
+ const ImportColumnFamilyOptions& /*import_options*/,
+ const ExportImportFilesMetaData& /*metadata*/,
+ ColumnFamilyHandle** /*handle*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+
+ private:
+ friend class DB;
+ inline size_t FindFile(const Slice& key);
+ Status Init(const Options& options);
+
+ ColumnFamilyData* cfd_;
+ Version* version_;
+ const Comparator* user_comparator_;
+ LevelFilesBrief files_;
+};
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
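
Editorial note (not part of the diff): a consequence of the header above is that every mutating operation on a compacted DB returns Status::NotSupported rather than asserting. The short sketch below is illustrative only; TryPut is a hypothetical helper and the DB* is assumed to come from a read-only/compacted open such as the one sketched earlier.

// Sketch: tolerate NotSupported from write operations on a compacted DB.
#include <iostream>

#include "rocksdb/db.h"

void TryPut(rocksdb::DB* db) {
  rocksdb::Status s = db->Put(rocksdb::WriteOptions(), "key", "value");
  if (s.IsNotSupported()) {
    // Expected for CompactedDBImpl: all mutations return NotSupported.
    std::cerr << "read-only/compacted DB: " << s.ToString() << std::endl;
  }
}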
diff --git a/src/rocksdb/db/compaction/compaction.cc b/src/rocksdb/db/compaction/compaction.cc
new file mode 100644
index 000000000..5c34fdcaa
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction.cc
@@ -0,0 +1,564 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <cinttypes>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/compaction/compaction.h"
+#include "rocksdb/compaction_filter.h"
+#include "test_util/sync_point.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const uint64_t kRangeTombstoneSentinel =
+ PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion);
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+ const InternalKey& b) {
+ auto c = user_cmp->Compare(a.user_key(), b.user_key());
+ if (c != 0) {
+ return c;
+ }
+ auto a_footer = ExtractInternalKeyFooter(a.Encode());
+ auto b_footer = ExtractInternalKeyFooter(b.Encode());
+ if (a_footer == kRangeTombstoneSentinel) {
+ if (b_footer != kRangeTombstoneSentinel) {
+ return -1;
+ }
+ } else if (b_footer == kRangeTombstoneSentinel) {
+ return 1;
+ }
+ return 0;
+}
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
+ const InternalKey& b) {
+ if (a == nullptr) {
+ return -1;
+ }
+ return sstableKeyCompare(user_cmp, *a, b);
+}
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+ const InternalKey* b) {
+ if (b == nullptr) {
+ return -1;
+ }
+ return sstableKeyCompare(user_cmp, a, *b);
+}
+
+uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+ uint64_t sum = 0;
+ for (size_t i = 0; i < files.size() && files[i]; i++) {
+ sum += files[i]->fd.GetFileSize();
+ }
+ return sum;
+}
+
+void Compaction::SetInputVersion(Version* _input_version) {
+ input_version_ = _input_version;
+ cfd_ = input_version_->cfd();
+
+ cfd_->Ref();
+ input_version_->Ref();
+ edit_.SetColumnFamily(cfd_->GetID());
+}
+
+void Compaction::GetBoundaryKeys(
+ VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs, Slice* smallest_user_key,
+ Slice* largest_user_key) {
+ bool initialized = false;
+ const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ if (inputs[i].files.empty()) {
+ continue;
+ }
+ if (inputs[i].level == 0) {
+ // we need to consider all files on level 0
+ for (const auto* f : inputs[i].files) {
+ const Slice& start_user_key = f->smallest.user_key();
+ if (!initialized ||
+ ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
+ *smallest_user_key = start_user_key;
+ }
+ const Slice& end_user_key = f->largest.user_key();
+ if (!initialized ||
+ ucmp->Compare(end_user_key, *largest_user_key) > 0) {
+ *largest_user_key = end_user_key;
+ }
+ initialized = true;
+ }
+ } else {
+ // we only need to consider the first and last file
+ const Slice& start_user_key = inputs[i].files[0]->smallest.user_key();
+ if (!initialized ||
+ ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
+ *smallest_user_key = start_user_key;
+ }
+ const Slice& end_user_key = inputs[i].files.back()->largest.user_key();
+ if (!initialized || ucmp->Compare(end_user_key, *largest_user_key) > 0) {
+ *largest_user_key = end_user_key;
+ }
+ initialized = true;
+ }
+ }
+}
+
+std::vector<CompactionInputFiles> Compaction::PopulateWithAtomicBoundaries(
+ VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs) {
+ const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
+ for (size_t i = 0; i < inputs.size(); i++) {
+ if (inputs[i].level == 0 || inputs[i].files.empty()) {
+ continue;
+ }
+ inputs[i].atomic_compaction_unit_boundaries.reserve(inputs[i].files.size());
+ AtomicCompactionUnitBoundary cur_boundary;
+ size_t first_atomic_idx = 0;
+ auto add_unit_boundary = [&](size_t to) {
+ if (first_atomic_idx == to) return;
+ for (size_t k = first_atomic_idx; k < to; k++) {
+ inputs[i].atomic_compaction_unit_boundaries.push_back(cur_boundary);
+ }
+ first_atomic_idx = to;
+ };
+ for (size_t j = 0; j < inputs[i].files.size(); j++) {
+ const auto* f = inputs[i].files[j];
+ if (j == 0) {
+ // First file in a level.
+ cur_boundary.smallest = &f->smallest;
+ cur_boundary.largest = &f->largest;
+ } else if (sstableKeyCompare(ucmp, *cur_boundary.largest, f->smallest) ==
+ 0) {
+ // SSTs overlap but the end key of the previous file was not
+ // artificially extended by a range tombstone. Extend the current
+ // boundary.
+ cur_boundary.largest = &f->largest;
+ } else {
+ // Atomic compaction unit has ended.
+ add_unit_boundary(j);
+ cur_boundary.smallest = &f->smallest;
+ cur_boundary.largest = &f->largest;
+ }
+ }
+ add_unit_boundary(inputs[i].files.size());
+ assert(inputs[i].files.size() ==
+ inputs[i].atomic_compaction_unit_boundaries.size());
+ }
+ return inputs;
+}
+
+// helper function to determine if compaction is creating files at the
+// bottommost level
+bool Compaction::IsBottommostLevel(
+ int output_level, VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs) {
+ int output_l0_idx;
+ if (output_level == 0) {
+ output_l0_idx = 0;
+ for (const auto* file : vstorage->LevelFiles(0)) {
+ if (inputs[0].files.back() == file) {
+ break;
+ }
+ ++output_l0_idx;
+ }
+ assert(static_cast<size_t>(output_l0_idx) < vstorage->LevelFiles(0).size());
+ } else {
+ output_l0_idx = -1;
+ }
+ Slice smallest_key, largest_key;
+ GetBoundaryKeys(vstorage, inputs, &smallest_key, &largest_key);
+ return !vstorage->RangeMightExistAfterSortedRun(smallest_key, largest_key,
+ output_level, output_l0_idx);
+}
+
+// test function to validate the functionality of IsBottommostLevel()
+// function -- determines if compaction with inputs and storage is bottommost
+bool Compaction::TEST_IsBottommostLevel(
+ int output_level, VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs) {
+ return IsBottommostLevel(output_level, vstorage, inputs);
+}
+
+bool Compaction::IsFullCompaction(
+ VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs) {
+ size_t num_files_in_compaction = 0;
+ size_t total_num_files = 0;
+ for (int l = 0; l < vstorage->num_levels(); l++) {
+ total_num_files += vstorage->NumLevelFiles(l);
+ }
+ for (size_t i = 0; i < inputs.size(); i++) {
+ num_files_in_compaction += inputs[i].size();
+ }
+ return num_files_in_compaction == total_num_files;
+}
+
+Compaction::Compaction(VersionStorageInfo* vstorage,
+ const ImmutableCFOptions& _immutable_cf_options,
+ const MutableCFOptions& _mutable_cf_options,
+ std::vector<CompactionInputFiles> _inputs,
+ int _output_level, uint64_t _target_file_size,
+ uint64_t _max_compaction_bytes, uint32_t _output_path_id,
+ CompressionType _compression,
+ CompressionOptions _compression_opts,
+ uint32_t _max_subcompactions,
+ std::vector<FileMetaData*> _grandparents,
+ bool _manual_compaction, double _score,
+ bool _deletion_compaction,
+ CompactionReason _compaction_reason)
+ : input_vstorage_(vstorage),
+ start_level_(_inputs[0].level),
+ output_level_(_output_level),
+ max_output_file_size_(_target_file_size),
+ max_compaction_bytes_(_max_compaction_bytes),
+ max_subcompactions_(_max_subcompactions),
+ immutable_cf_options_(_immutable_cf_options),
+ mutable_cf_options_(_mutable_cf_options),
+ input_version_(nullptr),
+ number_levels_(vstorage->num_levels()),
+ cfd_(nullptr),
+ output_path_id_(_output_path_id),
+ output_compression_(_compression),
+ output_compression_opts_(_compression_opts),
+ deletion_compaction_(_deletion_compaction),
+ inputs_(PopulateWithAtomicBoundaries(vstorage, std::move(_inputs))),
+ grandparents_(std::move(_grandparents)),
+ score_(_score),
+ bottommost_level_(IsBottommostLevel(output_level_, vstorage, inputs_)),
+ is_full_compaction_(IsFullCompaction(vstorage, inputs_)),
+ is_manual_compaction_(_manual_compaction),
+ is_trivial_move_(false),
+ compaction_reason_(_compaction_reason) {
+ MarkFilesBeingCompacted(true);
+ if (is_manual_compaction_) {
+ compaction_reason_ = CompactionReason::kManualCompaction;
+ }
+ if (max_subcompactions_ == 0) {
+ max_subcompactions_ = immutable_cf_options_.max_subcompactions;
+ }
+ if (!bottommost_level_) {
+ // Currently we only enable dictionary compression during compaction to the
+ // bottommost level.
+ output_compression_opts_.max_dict_bytes = 0;
+ output_compression_opts_.zstd_max_train_bytes = 0;
+ }
+
+#ifndef NDEBUG
+ for (size_t i = 1; i < inputs_.size(); ++i) {
+ assert(inputs_[i].level > inputs_[i - 1].level);
+ }
+#endif
+
+ // setup input_levels_
+ {
+ input_levels_.resize(num_input_levels());
+ for (size_t which = 0; which < num_input_levels(); which++) {
+ DoGenerateLevelFilesBrief(&input_levels_[which], inputs_[which].files,
+ &arena_);
+ }
+ }
+
+ GetBoundaryKeys(vstorage, inputs_, &smallest_user_key_, &largest_user_key_);
+}
+
+Compaction::~Compaction() {
+ if (input_version_ != nullptr) {
+ input_version_->Unref();
+ }
+ if (cfd_ != nullptr) {
+ cfd_->UnrefAndTryDelete();
+ }
+}
+
+bool Compaction::InputCompressionMatchesOutput() const {
+ int base_level = input_vstorage_->base_level();
+ bool matches = (GetCompressionType(immutable_cf_options_, input_vstorage_,
+ mutable_cf_options_, start_level_,
+ base_level) == output_compression_);
+ if (matches) {
+ TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:Matches");
+ return true;
+ }
+ TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:DidntMatch");
+ return matches;
+}
+
+bool Compaction::IsTrivialMove() const {
+ // Avoid a move if there is lots of overlapping grandparent data.
+ // Otherwise, the move could create a parent file that will require
+ // a very expensive merge later on.
+ // If start_level_ == output_level_, the purpose is to force the compaction
+ // filter to be applied to that level, so this cannot be a trivial move.
+
+ // Check if the start level has files with overlapping ranges
+ if (start_level_ == 0 && input_vstorage_->level0_non_overlapping() == false) {
+ // We cannot move files from L0 to L1 if the files are overlapping
+ return false;
+ }
+
+ if (is_manual_compaction_ &&
+ (immutable_cf_options_.compaction_filter != nullptr ||
+ immutable_cf_options_.compaction_filter_factory != nullptr)) {
+ // This is a manual compaction and we have a compaction filter that should
+ // be executed, so we cannot do a trivial move
+ return false;
+ }
+
+ // Used in universal compaction, where trivial move can be done if the
+ // input files are non-overlapping
+ if ((mutable_cf_options_.compaction_options_universal.allow_trivial_move) &&
+ (output_level_ != 0)) {
+ return is_trivial_move_;
+ }
+
+ if (!(start_level_ != output_level_ && num_input_levels() == 1 &&
+ input(0, 0)->fd.GetPathId() == output_path_id() &&
+ InputCompressionMatchesOutput())) {
+ return false;
+ }
+
+ // assert inputs_.size() == 1
+
+ for (const auto& file : inputs_.front().files) {
+ std::vector<FileMetaData*> file_grand_parents;
+ if (output_level_ + 1 >= number_levels_) {
+ continue;
+ }
+ input_vstorage_->GetOverlappingInputs(output_level_ + 1, &file->smallest,
+ &file->largest, &file_grand_parents);
+ const auto compaction_size =
+ file->fd.GetFileSize() + TotalFileSize(file_grand_parents);
+ if (compaction_size > max_compaction_bytes_) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+void Compaction::AddInputDeletions(VersionEdit* out_edit) {
+ for (size_t which = 0; which < num_input_levels(); which++) {
+ for (size_t i = 0; i < inputs_[which].size(); i++) {
+ out_edit->DeleteFile(level(which), inputs_[which][i]->fd.GetNumber());
+ }
+ }
+}
+
+bool Compaction::KeyNotExistsBeyondOutputLevel(
+ const Slice& user_key, std::vector<size_t>* level_ptrs) const {
+ assert(input_version_ != nullptr);
+ assert(level_ptrs != nullptr);
+ assert(level_ptrs->size() == static_cast<size_t>(number_levels_));
+ if (bottommost_level_) {
+ return true;
+ } else if (output_level_ != 0 &&
+ cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+ // Maybe use binary search to find right entry instead of linear search?
+ const Comparator* user_cmp = cfd_->user_comparator();
+ for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) {
+ const std::vector<FileMetaData*>& files =
+ input_vstorage_->LevelFiles(lvl);
+ for (; level_ptrs->at(lvl) < files.size(); level_ptrs->at(lvl)++) {
+ auto* f = files[level_ptrs->at(lvl)];
+ if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
+ // We've advanced far enough
+ if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) {
+ // Key falls in this file's range, so it may
+ // exist beyond output level
+ return false;
+ }
+ break;
+ }
+ }
+ }
+ return true;
+ }
+ return false;
+}
+
+// Mark (or clear) each file that is being compacted
+void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) {
+ for (size_t i = 0; i < num_input_levels(); i++) {
+ for (size_t j = 0; j < inputs_[i].size(); j++) {
+ assert(mark_as_compacted ? !inputs_[i][j]->being_compacted
+ : inputs_[i][j]->being_compacted);
+ inputs_[i][j]->being_compacted = mark_as_compacted;
+ }
+ }
+}
+
+// Sample output:
+// If compacting 3 L0 files, 2 L3 files and 1 L4 file, and outputting to L5,
+// print: "3@0 + 2@3 + 1@4 files to L5"
+const char* Compaction::InputLevelSummary(
+ InputLevelSummaryBuffer* scratch) const {
+ int len = 0;
+ bool is_first = true;
+ for (auto& input_level : inputs_) {
+ if (input_level.empty()) {
+ continue;
+ }
+ if (!is_first) {
+ len +=
+ snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, " + ");
+ len = std::min(len, static_cast<int>(sizeof(scratch->buffer)));
+ } else {
+ is_first = false;
+ }
+ len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+ "%" ROCKSDB_PRIszt "@%d", input_level.size(),
+ input_level.level);
+ len = std::min(len, static_cast<int>(sizeof(scratch->buffer)));
+ }
+ snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+ " files to L%d", output_level());
+
+ return scratch->buffer;
+}
+
+uint64_t Compaction::CalculateTotalInputSize() const {
+ uint64_t size = 0;
+ for (auto& input_level : inputs_) {
+ for (auto f : input_level.files) {
+ size += f->fd.GetFileSize();
+ }
+ }
+ return size;
+}
+
+void Compaction::ReleaseCompactionFiles(Status status) {
+ MarkFilesBeingCompacted(false);
+ cfd_->compaction_picker()->ReleaseCompactionFiles(this, status);
+}
+
+void Compaction::ResetNextCompactionIndex() {
+ assert(input_version_ != nullptr);
+ input_vstorage_->ResetNextCompactionIndex(start_level_);
+}
+
+namespace {
+int InputSummary(const std::vector<FileMetaData*>& files, char* output,
+ int len) {
+ *output = '\0';
+ int write = 0;
+ for (size_t i = 0; i < files.size(); i++) {
+ int sz = len - write;
+ int ret;
+ char sztxt[16];
+ AppendHumanBytes(files.at(i)->fd.GetFileSize(), sztxt, 16);
+ ret = snprintf(output + write, sz, "%" PRIu64 "(%s) ",
+ files.at(i)->fd.GetNumber(), sztxt);
+ if (ret < 0 || ret >= sz) break;
+ write += ret;
+ }
+ // if files.size() is non-zero, overwrite the last space
+ return write - !!files.size();
+}
+} // namespace
+
+void Compaction::Summary(char* output, int len) {
+ int write =
+ snprintf(output, len, "Base version %" PRIu64 " Base level %d, inputs: [",
+ input_version_->GetVersionNumber(), start_level_);
+ if (write < 0 || write >= len) {
+ return;
+ }
+
+ for (size_t level_iter = 0; level_iter < num_input_levels(); ++level_iter) {
+ if (level_iter > 0) {
+ write += snprintf(output + write, len - write, "], [");
+ if (write < 0 || write >= len) {
+ return;
+ }
+ }
+ write +=
+ InputSummary(inputs_[level_iter].files, output + write, len - write);
+ if (write < 0 || write >= len) {
+ return;
+ }
+ }
+
+ snprintf(output + write, len - write, "]");
+}
+
+uint64_t Compaction::OutputFilePreallocationSize() const {
+ uint64_t preallocation_size = 0;
+
+ for (const auto& level_files : inputs_) {
+ for (const auto& file : level_files.files) {
+ preallocation_size += file->fd.GetFileSize();
+ }
+ }
+
+ if (max_output_file_size_ != port::kMaxUint64 &&
+ (immutable_cf_options_.compaction_style == kCompactionStyleLevel ||
+ output_level() > 0)) {
+ preallocation_size = std::min(max_output_file_size_, preallocation_size);
+ }
+
+ // Over-estimate slightly so we don't end up just barely crossing
+ // the threshold
+ // No point in preallocating more than 1GB.
+ return std::min(uint64_t{1073741824},
+ preallocation_size + (preallocation_size / 10));
+}
+
+std::unique_ptr<CompactionFilter> Compaction::CreateCompactionFilter() const {
+ if (!cfd_->ioptions()->compaction_filter_factory) {
+ return nullptr;
+ }
+
+ CompactionFilter::Context context;
+ context.is_full_compaction = is_full_compaction_;
+ context.is_manual_compaction = is_manual_compaction_;
+ context.column_family_id = cfd_->GetID();
+ return cfd_->ioptions()->compaction_filter_factory->CreateCompactionFilter(
+ context);
+}
+
+bool Compaction::IsOutputLevelEmpty() const {
+ return inputs_.back().level != output_level_ || inputs_.back().empty();
+}
+
+bool Compaction::ShouldFormSubcompactions() const {
+ if (max_subcompactions_ <= 1 || cfd_ == nullptr) {
+ return false;
+ }
+ if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+ return (start_level_ == 0 || is_manual_compaction_) && output_level_ > 0 &&
+ !IsOutputLevelEmpty();
+ } else if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) {
+ return number_levels_ > 1 && output_level_ > 0;
+ } else {
+ return false;
+ }
+}
+
+uint64_t Compaction::MinInputFileOldestAncesterTime() const {
+ uint64_t min_oldest_ancester_time = port::kMaxUint64;
+ for (const auto& level_files : inputs_) {
+ for (const auto& file : level_files.files) {
+ uint64_t oldest_ancester_time = file->TryGetOldestAncesterTime();
+ if (oldest_ancester_time != 0) {
+ min_oldest_ancester_time =
+ std::min(min_oldest_ancester_time, oldest_ancester_time);
+ }
+ }
+ }
+ return min_oldest_ancester_time;
+}
+
+int Compaction::GetInputBaseLevel() const {
+ return input_vstorage_->base_level();
+}
+
+} // namespace ROCKSDB_NAMESPACE
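
Editorial note (not part of the diff): the preallocation logic at the end of this file reduces to a small rule: estimate the output size from the inputs, add roughly 10% of slack so the output does not just barely cross the preallocated size, and cap the result at 1 GB. The standalone sketch below restates only that arithmetic; it is an illustration, not the RocksDB implementation.

// Simplified restatement of the final step of
// Compaction::OutputFilePreallocationSize: add ~10% slack, cap at 1 GB.
#include <algorithm>
#include <cstdint>
#include <iostream>

uint64_t PreallocationSize(uint64_t estimated_output_bytes) {
  const uint64_t kMaxPrealloc = 1073741824;  // 1 GB, as in the code above
  return std::min(kMaxPrealloc,
                  estimated_output_bytes + estimated_output_bytes / 10);
}

int main() {
  std::cout << PreallocationSize(64ull << 20) << "\n";    // ~70 MiB for a 64 MiB input
  std::cout << PreallocationSize(2048ull << 20) << "\n";  // capped at 1 GB
  return 0;
}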
diff --git a/src/rocksdb/db/compaction/compaction.h b/src/rocksdb/db/compaction/compaction.h
new file mode 100644
index 000000000..9358e50ff
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction.h
@@ -0,0 +1,384 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "db/version_set.h"
+#include "memory/arena.h"
+#include "options/cf_options.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+// This file contains class Compaction, as well as some helper functions
+// and data structures used by the class.
+
+// Utility for comparing sstable boundary keys. Returns -1 if either a or b is
+// null which provides the property that a==null indicates a key that is less
+// than any key and b==null indicates a key that is greater than any key. Note
+// that the comparison is performed primarily on the user-key portion of the
+// key. If the user-keys compare equal, an additional test is made to sort
+// range tombstone sentinel keys before other keys with the same user-key. The
+// result is that 2 user-keys will compare equal if they differ purely on
+// their sequence number and value, but the range tombstone sentinel for that
+// user-key will compare not equal. This is necessary because the range
+// tombstone sentinel key is set as the largest key for an sstable even though
+// that key never appears in the database. We don't want adjacent sstables to
+// be considered overlapping if they are separated by the range tombstone
+// sentinel.
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+ const InternalKey& b);
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
+ const InternalKey& b);
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+ const InternalKey* b);
+
+// An AtomicCompactionUnitBoundary represents a range of keys [smallest,
+// largest] that exactly spans one or more neighbouring SSTs on the same
+// level. Every pair of SSTs in this range "overlap" (i.e., the largest
+// user key of one file is the smallest user key of the next file). These
+// boundaries are propagated down to RangeDelAggregator during compaction
+// to provide safe truncation boundaries for range tombstones.
+struct AtomicCompactionUnitBoundary {
+ const InternalKey* smallest = nullptr;
+ const InternalKey* largest = nullptr;
+};
+
+// The structure that manages compaction input files associated
+// with the same physical level.
+struct CompactionInputFiles {
+ int level;
+ std::vector<FileMetaData*> files;
+ std::vector<AtomicCompactionUnitBoundary> atomic_compaction_unit_boundaries;
+ inline bool empty() const { return files.empty(); }
+ inline size_t size() const { return files.size(); }
+ inline void clear() { files.clear(); }
+ inline FileMetaData* operator[](size_t i) const { return files[i]; }
+};
+
+class Version;
+class ColumnFamilyData;
+class VersionStorageInfo;
+class CompactionFilter;
+
+// A Compaction encapsulates metadata about a compaction.
+class Compaction {
+ public:
+ Compaction(VersionStorageInfo* input_version,
+ const ImmutableCFOptions& immutable_cf_options,
+ const MutableCFOptions& mutable_cf_options,
+ std::vector<CompactionInputFiles> inputs, int output_level,
+ uint64_t target_file_size, uint64_t max_compaction_bytes,
+ uint32_t output_path_id, CompressionType compression,
+ CompressionOptions compression_opts, uint32_t max_subcompactions,
+ std::vector<FileMetaData*> grandparents,
+ bool manual_compaction = false, double score = -1,
+ bool deletion_compaction = false,
+ CompactionReason compaction_reason = CompactionReason::kUnknown);
+
+ // No copying allowed
+ Compaction(const Compaction&) = delete;
+ void operator=(const Compaction&) = delete;
+
+ ~Compaction();
+
+ // Returns the level associated with the specified compaction input level.
+ // If compaction_input_level is not specified, then input_level is set to 0.
+ int level(size_t compaction_input_level = 0) const {
+ return inputs_[compaction_input_level].level;
+ }
+
+ int start_level() const { return start_level_; }
+
+ // Outputs will go to this level
+ int output_level() const { return output_level_; }
+
+ // Returns the number of input levels in this compaction.
+ size_t num_input_levels() const { return inputs_.size(); }
+
+ // Return the object that holds the edits to the descriptor done
+ // by this compaction.
+ VersionEdit* edit() { return &edit_; }
+
+ // Returns the number of input files associated with the specified
+ // compaction input level.
+ // The function will return 0 when "compaction_input_level" < 0
+ // or "compaction_input_level" >= "num_input_levels()".
+ size_t num_input_files(size_t compaction_input_level) const {
+ if (compaction_input_level < inputs_.size()) {
+ return inputs_[compaction_input_level].size();
+ }
+ return 0;
+ }
+
+ // Returns input version of the compaction
+ Version* input_version() const { return input_version_; }
+
+ // Returns the ColumnFamilyData associated with the compaction.
+ ColumnFamilyData* column_family_data() const { return cfd_; }
+
+ // Returns the file meta data of the 'i'th input file at the
+ // specified compaction input level.
+ // REQUIREMENT: "compaction_input_level" must be >= 0 and
+ // < "input_levels()"
+ FileMetaData* input(size_t compaction_input_level, size_t i) const {
+ assert(compaction_input_level < inputs_.size());
+ return inputs_[compaction_input_level][i];
+ }
+
+ const std::vector<AtomicCompactionUnitBoundary>* boundaries(
+ size_t compaction_input_level) const {
+ assert(compaction_input_level < inputs_.size());
+ return &inputs_[compaction_input_level].atomic_compaction_unit_boundaries;
+ }
+
+ // Returns the list of file meta data of the specified compaction
+ // input level.
+ // REQUIREMENT: "compaction_input_level" must be >= 0 and
+ // < "input_levels()"
+ const std::vector<FileMetaData*>* inputs(
+ size_t compaction_input_level) const {
+ assert(compaction_input_level < inputs_.size());
+ return &inputs_[compaction_input_level].files;
+ }
+
+ const std::vector<CompactionInputFiles>* inputs() { return &inputs_; }
+
+ // Returns the LevelFilesBrief of the specified compaction input level.
+ const LevelFilesBrief* input_levels(size_t compaction_input_level) const {
+ return &input_levels_[compaction_input_level];
+ }
+
+ // Maximum size of files to build during this compaction.
+ uint64_t max_output_file_size() const { return max_output_file_size_; }
+
+ // What compression for output
+ CompressionType output_compression() const { return output_compression_; }
+
+ // What compression options for output
+ CompressionOptions output_compression_opts() const {
+ return output_compression_opts_;
+ }
+
+ // Whether the output file needs to be written to a second DB path.
+ uint32_t output_path_id() const { return output_path_id_; }
+
+ // Is this a trivial compaction that can be implemented by just
+ // moving a single input file to the next level (no merging or splitting)
+ bool IsTrivialMove() const;
+
+ // If true, then the compaction can be done by simply deleting input files.
+ bool deletion_compaction() const { return deletion_compaction_; }
+
+ // Add all inputs to this compaction as delete operations to *edit.
+ void AddInputDeletions(VersionEdit* edit);
+
+ // Returns true if the available information we have guarantees that
+ // the input "user_key" does not exist in any level beyond "output_level()".
+ bool KeyNotExistsBeyondOutputLevel(const Slice& user_key,
+ std::vector<size_t>* level_ptrs) const;
+
+ // Clear all files to indicate that they are not being compacted
+ // Delete this compaction from the list of running compactions.
+ //
+ // Requirement: DB mutex held
+ void ReleaseCompactionFiles(Status status);
+
+ // Returns the summary of the compaction in "output" with maximum "len"
+ // in bytes. The caller is responsible for the memory management of
+ // "output".
+ void Summary(char* output, int len);
+
+ // Return the score that was used to pick this compaction run.
+ double score() const { return score_; }
+
+ // Is this compaction creating a file in the bottommost level?
+ bool bottommost_level() const { return bottommost_level_; }
+
+ // Does this compaction include all sst files?
+ bool is_full_compaction() const { return is_full_compaction_; }
+
+ // Was this compaction triggered manually by the client?
+ bool is_manual_compaction() const { return is_manual_compaction_; }
+
+ // Used when the allow_trivial_move option is set in
+ // Universal compaction. If all the input files are
+ // non-overlapping, is_trivial_move_ is set to true;
+ // otherwise it is set to false.
+ void set_is_trivial_move(bool trivial_move) {
+ is_trivial_move_ = trivial_move;
+ }
+
+ // Used when allow_trivial_move option is set in
+ // Universal compaction. Returns true, if the input files
+ // are non-overlapping and can be trivially moved.
+ bool is_trivial_move() const { return is_trivial_move_; }
+
+ // How many total levels are there?
+ int number_levels() const { return number_levels_; }
+
+ // Return the ImmutableCFOptions that should be used throughout the compaction
+ // procedure
+ const ImmutableCFOptions* immutable_cf_options() const {
+ return &immutable_cf_options_;
+ }
+
+ // Return the MutableCFOptions that should be used throughout the compaction
+ // procedure
+ const MutableCFOptions* mutable_cf_options() const {
+ return &mutable_cf_options_;
+ }
+
+ // Returns the size in bytes that the output file should be preallocated to.
+ // In level compaction, that is max_file_size_. In universal compaction, that
+ // is the sum of all input file sizes.
+ uint64_t OutputFilePreallocationSize() const;
+
+ void SetInputVersion(Version* input_version);
+
+ struct InputLevelSummaryBuffer {
+ char buffer[128];
+ };
+
+ const char* InputLevelSummary(InputLevelSummaryBuffer* scratch) const;
+
+ uint64_t CalculateTotalInputSize() const;
+
+ // In case of compaction error, reset the nextIndex that is used
+ // to pick up the next file to be compacted from files_by_size_
+ void ResetNextCompactionIndex();
+
+ // Create a CompactionFilter from compaction_filter_factory
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter() const;
+
+ // Is the input level corresponding to output_level_ empty?
+ bool IsOutputLevelEmpty() const;
+
+ // Should this compaction be broken up into smaller ones run in parallel?
+ bool ShouldFormSubcompactions() const;
+
+ // test function to validate the functionality of IsBottommostLevel()
+ // function -- determines if compaction with inputs and storage is bottommost
+ static bool TEST_IsBottommostLevel(
+ int output_level, VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs);
+
+ TablePropertiesCollection GetOutputTableProperties() const {
+ return output_table_properties_;
+ }
+
+ void SetOutputTableProperties(TablePropertiesCollection tp) {
+ output_table_properties_ = std::move(tp);
+ }
+
+ Slice GetSmallestUserKey() const { return smallest_user_key_; }
+
+ Slice GetLargestUserKey() const { return largest_user_key_; }
+
+ int GetInputBaseLevel() const;
+
+ CompactionReason compaction_reason() { return compaction_reason_; }
+
+ const std::vector<FileMetaData*>& grandparents() const {
+ return grandparents_;
+ }
+
+ uint64_t max_compaction_bytes() const { return max_compaction_bytes_; }
+
+ uint32_t max_subcompactions() const { return max_subcompactions_; }
+
+ uint64_t MinInputFileOldestAncesterTime() const;
+
+ private:
+ // mark (or clear) all files that are being compacted
+ void MarkFilesBeingCompacted(bool mark_as_compacted);
+
+ // get the smallest and largest key present in files to be compacted
+ static void GetBoundaryKeys(VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs,
+ Slice* smallest_key, Slice* largest_key);
+
+ // Get the atomic file boundaries for all files in the compaction. Necessary
+ // in order to avoid the scenario described in
+ // https://github.com/facebook/rocksdb/pull/4432#discussion_r221072219 and plumb
+ // down appropriate key boundaries to RangeDelAggregator during compaction.
+ static std::vector<CompactionInputFiles> PopulateWithAtomicBoundaries(
+ VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs);
+
+ // helper function to determine if compaction with inputs and storage is
+ // bottommost
+ static bool IsBottommostLevel(
+ int output_level, VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs);
+
+ static bool IsFullCompaction(VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs);
+
+ VersionStorageInfo* input_vstorage_;
+
+ const int start_level_; // the lowest level to be compacted
+ const int output_level_; // level to which output files are stored
+ uint64_t max_output_file_size_;
+ uint64_t max_compaction_bytes_;
+ uint32_t max_subcompactions_;
+ const ImmutableCFOptions immutable_cf_options_;
+ const MutableCFOptions mutable_cf_options_;
+ Version* input_version_;
+ VersionEdit edit_;
+ const int number_levels_;
+ ColumnFamilyData* cfd_;
+ Arena arena_; // Arena used to allocate space for file_levels_
+
+ const uint32_t output_path_id_;
+ CompressionType output_compression_;
+ CompressionOptions output_compression_opts_;
+ // If true, then the compaction can be done by simply deleting input files.
+ const bool deletion_compaction_;
+
+ // Compaction input files organized by level. Constant after construction
+ const std::vector<CompactionInputFiles> inputs_;
+
+ // A copy of inputs_, organized more closely in memory
+ autovector<LevelFilesBrief, 2> input_levels_;
+
+ // State used to check for number of overlapping grandparent files
+ // (grandparent == "output_level_ + 1")
+ std::vector<FileMetaData*> grandparents_;
+ const double score_; // score that was used to pick this compaction.
+
+ // Is this compaction creating a file in the bottommost level?
+ const bool bottommost_level_;
+ // Does this compaction include all sst files?
+ const bool is_full_compaction_;
+
+ // Is this compaction requested by the client?
+ const bool is_manual_compaction_;
+
+ // True if we can do trivial move in Universal multi level
+ // compaction
+ bool is_trivial_move_;
+
+ // Does input compression match the output compression?
+ bool InputCompressionMatchesOutput() const;
+
+ // table properties of output files
+ TablePropertiesCollection output_table_properties_;
+
+ // smallest user key in compaction
+ Slice smallest_user_key_;
+
+ // largest user key in compaction
+ Slice largest_user_key_;
+
+ // Reason for compaction
+ CompactionReason compaction_reason_;
+};
+
+// Return sum of sizes of all files in `files`.
+extern uint64_t TotalFileSize(const std::vector<FileMetaData*>& files);
+
+} // namespace ROCKSDB_NAMESPACE
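
Editorial note (not part of the diff): the sstableKeyCompare contract documented at the top of this header can be summarized with a simplified model: compare user keys first, and on a tie sort a range-tombstone sentinel boundary before an ordinary key with the same user key, so adjacent SSTs separated only by a sentinel are not treated as overlapping. The sketch below uses hypothetical standalone types (BoundaryKey, BoundaryKeyCompare) rather than the internal InternalKey machinery.

// Simplified illustration of the boundary-ordering rule; not the RocksDB code.
#include <cassert>
#include <string>

struct BoundaryKey {
  std::string user_key;
  bool is_range_tombstone_sentinel = false;  // stands in for the seq/type footer
};

int BoundaryKeyCompare(const BoundaryKey& a, const BoundaryKey& b) {
  int c = a.user_key.compare(b.user_key);
  if (c != 0) {
    return c < 0 ? -1 : 1;
  }
  if (a.is_range_tombstone_sentinel && !b.is_range_tombstone_sentinel) {
    return -1;
  }
  if (!a.is_range_tombstone_sentinel && b.is_range_tombstone_sentinel) {
    return 1;
  }
  return 0;
}

int main() {
  BoundaryKey file1_largest{"k", /*is_range_tombstone_sentinel=*/true};
  BoundaryKey file2_smallest{"k", /*is_range_tombstone_sentinel=*/false};
  // The sentinel-extended end of file1 sorts strictly before the real key "k"
  // in file2, so the two files are not considered overlapping.
  assert(BoundaryKeyCompare(file1_largest, file2_smallest) < 0);
  return 0;
}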
diff --git a/src/rocksdb/db/compaction/compaction_iteration_stats.h b/src/rocksdb/db/compaction/compaction_iteration_stats.h
new file mode 100644
index 000000000..963c1d8eb
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iteration_stats.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+struct CompactionIterationStats {
+ // Compaction statistics
+
+ // Doesn't include records skipped because of
+ // CompactionFilter::Decision::kRemoveAndSkipUntil.
+ int64_t num_record_drop_user = 0;
+
+ int64_t num_record_drop_hidden = 0;
+ int64_t num_record_drop_obsolete = 0;
+ int64_t num_record_drop_range_del = 0;
+ int64_t num_range_del_drop_obsolete = 0;
+ // Deletions obsoleted before bottom level due to file gap optimization.
+ int64_t num_optimized_del_drop_obsolete = 0;
+ uint64_t total_filter_time = 0;
+
+ // Input statistics
+ // TODO(noetzli): The stats are incomplete. They are lacking everything
+ // consumed by MergeHelper.
+ uint64_t num_input_records = 0;
+ uint64_t num_input_deletion_records = 0;
+ uint64_t num_input_corrupt_records = 0;
+ uint64_t total_input_raw_key_bytes = 0;
+ uint64_t total_input_raw_value_bytes = 0;
+
+ // Single-Delete diagnostics for exceptional situations
+ uint64_t num_single_del_fallthru = 0;
+ uint64_t num_single_del_mismatch = 0;
+};
diff --git a/src/rocksdb/db/compaction/compaction_iterator.cc b/src/rocksdb/db/compaction/compaction_iterator.cc
new file mode 100644
index 000000000..1bebfc717
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iterator.cc
@@ -0,0 +1,774 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <cinttypes>
+
+#include "db/compaction/compaction_iterator.h"
+#include "db/snapshot_checker.h"
+#include "port/likely.h"
+#include "rocksdb/listener.h"
+#include "table/internal_iterator.h"
+#include "test_util/sync_point.h"
+
+#define DEFINITELY_IN_SNAPSHOT(seq, snapshot) \
+ ((seq) <= (snapshot) && \
+ (snapshot_checker_ == nullptr || \
+ LIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) == \
+ SnapshotCheckerResult::kInSnapshot)))
+
+#define DEFINITELY_NOT_IN_SNAPSHOT(seq, snapshot) \
+ ((seq) > (snapshot) || \
+ (snapshot_checker_ != nullptr && \
+ UNLIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) == \
+ SnapshotCheckerResult::kNotInSnapshot)))
+
+#define IN_EARLIEST_SNAPSHOT(seq) \
+ ((seq) <= earliest_snapshot_ && \
+ (snapshot_checker_ == nullptr || LIKELY(IsInEarliestSnapshot(seq))))
+
+namespace ROCKSDB_NAMESPACE {
+
+CompactionIterator::CompactionIterator(
+ InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+ SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ const SnapshotChecker* snapshot_checker, Env* env,
+ bool report_detailed_time, bool expect_valid_internal_key,
+ CompactionRangeDelAggregator* range_del_agg, const Compaction* compaction,
+ const CompactionFilter* compaction_filter,
+ const std::atomic<bool>* shutting_down,
+ const SequenceNumber preserve_deletes_seqnum,
+ const std::atomic<bool>* manual_compaction_paused,
+ const std::shared_ptr<Logger> info_log)
+ : CompactionIterator(
+ input, cmp, merge_helper, last_sequence, snapshots,
+ earliest_write_conflict_snapshot, snapshot_checker, env,
+ report_detailed_time, expect_valid_internal_key, range_del_agg,
+ std::unique_ptr<CompactionProxy>(
+ compaction ? new CompactionProxy(compaction) : nullptr),
+ compaction_filter, shutting_down, preserve_deletes_seqnum,
+ manual_compaction_paused, info_log) {}
+
+CompactionIterator::CompactionIterator(
+ InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+ SequenceNumber /*last_sequence*/, std::vector<SequenceNumber>* snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ const SnapshotChecker* snapshot_checker, Env* env,
+ bool report_detailed_time, bool expect_valid_internal_key,
+ CompactionRangeDelAggregator* range_del_agg,
+ std::unique_ptr<CompactionProxy> compaction,
+ const CompactionFilter* compaction_filter,
+ const std::atomic<bool>* shutting_down,
+ const SequenceNumber preserve_deletes_seqnum,
+ const std::atomic<bool>* manual_compaction_paused,
+ const std::shared_ptr<Logger> info_log)
+ : input_(input),
+ cmp_(cmp),
+ merge_helper_(merge_helper),
+ snapshots_(snapshots),
+ earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
+ snapshot_checker_(snapshot_checker),
+ env_(env),
+ report_detailed_time_(report_detailed_time),
+ expect_valid_internal_key_(expect_valid_internal_key),
+ range_del_agg_(range_del_agg),
+ compaction_(std::move(compaction)),
+ compaction_filter_(compaction_filter),
+ shutting_down_(shutting_down),
+ manual_compaction_paused_(manual_compaction_paused),
+ preserve_deletes_seqnum_(preserve_deletes_seqnum),
+ current_user_key_sequence_(0),
+ current_user_key_snapshot_(0),
+ merge_out_iter_(merge_helper_),
+ current_key_committed_(false),
+ info_log_(info_log) {
+ assert(compaction_filter_ == nullptr || compaction_ != nullptr);
+ assert(snapshots_ != nullptr);
+ bottommost_level_ =
+ compaction_ == nullptr ? false : compaction_->bottommost_level();
+ if (compaction_ != nullptr) {
+ level_ptrs_ = std::vector<size_t>(compaction_->number_levels(), 0);
+ }
+ if (snapshots_->size() == 0) {
+ // optimize for fast path if there are no snapshots
+ visible_at_tip_ = true;
+ earliest_snapshot_iter_ = snapshots_->end();
+ earliest_snapshot_ = kMaxSequenceNumber;
+ latest_snapshot_ = 0;
+ } else {
+ visible_at_tip_ = false;
+ earliest_snapshot_iter_ = snapshots_->begin();
+ earliest_snapshot_ = snapshots_->at(0);
+ latest_snapshot_ = snapshots_->back();
+ }
+#ifndef NDEBUG
+ // findEarliestVisibleSnapshot assumes this ordering.
+ for (size_t i = 1; i < snapshots_->size(); ++i) {
+ assert(snapshots_->at(i - 1) < snapshots_->at(i));
+ }
+#endif
+ input_->SetPinnedItersMgr(&pinned_iters_mgr_);
+ TEST_SYNC_POINT_CALLBACK("CompactionIterator:AfterInit", compaction_.get());
+}
+
+CompactionIterator::~CompactionIterator() {
+ // The input_ iterator's lifetime is longer than the pinned_iters_mgr_ lifetime
+ input_->SetPinnedItersMgr(nullptr);
+}
+
+void CompactionIterator::ResetRecordCounts() {
+ iter_stats_.num_record_drop_user = 0;
+ iter_stats_.num_record_drop_hidden = 0;
+ iter_stats_.num_record_drop_obsolete = 0;
+ iter_stats_.num_record_drop_range_del = 0;
+ iter_stats_.num_range_del_drop_obsolete = 0;
+ iter_stats_.num_optimized_del_drop_obsolete = 0;
+}
+
+void CompactionIterator::SeekToFirst() {
+ NextFromInput();
+ PrepareOutput();
+}
+
+void CompactionIterator::Next() {
+ // If there is a merge output, return it before continuing to process the
+ // input.
+ if (merge_out_iter_.Valid()) {
+ merge_out_iter_.Next();
+
+ // Check if we returned all records of the merge output.
+ if (merge_out_iter_.Valid()) {
+ key_ = merge_out_iter_.key();
+ value_ = merge_out_iter_.value();
+ bool valid_key __attribute__((__unused__));
+ valid_key = ParseInternalKey(key_, &ikey_);
+ // MergeUntil stops when it encounters a corrupt key and does not
+ // include them in the result, so we expect the keys here to be valid.
+ assert(valid_key);
+ if (!valid_key) {
+ ROCKS_LOG_FATAL(info_log_, "Invalid key (%s) in compaction",
+ key_.ToString(true).c_str());
+ }
+
+ // Keep current_key_ in sync.
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+ key_ = current_key_.GetInternalKey();
+ ikey_.user_key = current_key_.GetUserKey();
+ valid_ = true;
+ } else {
+ // We consumed all pinned merge operands, release pinned iterators
+ pinned_iters_mgr_.ReleasePinnedData();
+ // MergeHelper moves the iterator to the first record after the merged
+ // records, so even though we reached the end of the merge output, we do
+ // not want to advance the iterator.
+ NextFromInput();
+ }
+ } else {
+ // Only advance the input iterator if there is no merge output and the
+ // iterator is not already at the next record.
+ if (!at_next_) {
+ input_->Next();
+ }
+ NextFromInput();
+ }
+
+ if (valid_) {
+ // Record that we've outputted a record for the current key.
+ has_outputted_key_ = true;
+ }
+
+ PrepareOutput();
+}
+
+void CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
+ Slice* skip_until) {
+ if (compaction_filter_ != nullptr &&
+ (ikey_.type == kTypeValue || ikey_.type == kTypeBlobIndex)) {
+ // If the user has specified a compaction filter and the sequence
+ // number is greater than any external snapshot, then invoke the
+ // filter. If the return value of the compaction filter is true,
+ // replace the entry with a deletion marker.
+ CompactionFilter::Decision filter;
+ compaction_filter_value_.clear();
+ compaction_filter_skip_until_.Clear();
+ CompactionFilter::ValueType value_type =
+ ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue
+ : CompactionFilter::ValueType::kBlobIndex;
+ // Hack: pass internal key to BlobIndexCompactionFilter since it needs
+ // to get sequence number.
+ Slice& filter_key = ikey_.type == kTypeValue ? ikey_.user_key : key_;
+ {
+ StopWatchNano timer(env_, report_detailed_time_);
+ filter = compaction_filter_->FilterV2(
+ compaction_->level(), filter_key, value_type, value_,
+ &compaction_filter_value_, compaction_filter_skip_until_.rep());
+ iter_stats_.total_filter_time +=
+ env_ != nullptr && report_detailed_time_ ? timer.ElapsedNanos() : 0;
+ }
+
+ if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil &&
+ cmp_->Compare(*compaction_filter_skip_until_.rep(), ikey_.user_key) <=
+ 0) {
+ // Can't skip to a key smaller than the current one.
+ // Keep the key as per FilterV2 documentation.
+ filter = CompactionFilter::Decision::kKeep;
+ }
+
+ if (filter == CompactionFilter::Decision::kRemove) {
+ // convert the current key to a delete; key_ is pointing into
+ // current_key_ at this point, so updating current_key_ updates key()
+ ikey_.type = kTypeDeletion;
+ current_key_.UpdateInternalKey(ikey_.sequence, kTypeDeletion);
+ // no value associated with delete
+ value_.clear();
+ iter_stats_.num_record_drop_user++;
+ } else if (filter == CompactionFilter::Decision::kChangeValue) {
+ value_ = compaction_filter_value_;
+ } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) {
+ *need_skip = true;
+ compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber,
+ kValueTypeForSeek);
+ *skip_until = compaction_filter_skip_until_.Encode();
+ }
+ }
+}
+
+void CompactionIterator::NextFromInput() {
+ at_next_ = false;
+ valid_ = false;
+
+ while (!valid_ && input_->Valid() && !IsPausingManualCompaction() &&
+ !IsShuttingDown()) {
+ key_ = input_->key();
+ value_ = input_->value();
+ iter_stats_.num_input_records++;
+
+ if (!ParseInternalKey(key_, &ikey_)) {
+ // If `expect_valid_internal_key_` is false, return the corrupted key
+ // and let the caller decide what to do with it.
+ // TODO(noetzli): We should have a more elegant solution for this.
+ if (expect_valid_internal_key_) {
+ assert(!"Corrupted internal key not expected.");
+ status_ = Status::Corruption("Corrupted internal key not expected.");
+ break;
+ }
+ key_ = current_key_.SetInternalKey(key_);
+ has_current_user_key_ = false;
+ current_user_key_sequence_ = kMaxSequenceNumber;
+ current_user_key_snapshot_ = 0;
+ iter_stats_.num_input_corrupt_records++;
+ valid_ = true;
+ break;
+ }
+ TEST_SYNC_POINT_CALLBACK("CompactionIterator:ProcessKV", &ikey_);
+
+ // Update input statistics
+ if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion) {
+ iter_stats_.num_input_deletion_records++;
+ }
+ iter_stats_.total_input_raw_key_bytes += key_.size();
+ iter_stats_.total_input_raw_value_bytes += value_.size();
+
+ // If need_skip is true, we should seek the input iterator
+ // to internal key skip_until and continue from there.
+ bool need_skip = false;
+ // Points either into compaction_filter_skip_until_ or into
+ // merge_helper_->compaction_filter_skip_until_.
+ Slice skip_until;
+
+ // Check whether the user key changed. After this if statement current_key_
+ // is a copy of the current input key (maybe converted to a delete by the
+ // compaction filter). ikey_.user_key is pointing to the copy.
+ if (!has_current_user_key_ ||
+ !cmp_->Equal(ikey_.user_key, current_user_key_)) {
+ // First occurrence of this user key
+ // Copy key for output
+ key_ = current_key_.SetInternalKey(key_, &ikey_);
+ current_user_key_ = ikey_.user_key;
+ has_current_user_key_ = true;
+ has_outputted_key_ = false;
+ current_user_key_sequence_ = kMaxSequenceNumber;
+ current_user_key_snapshot_ = 0;
+ current_key_committed_ = KeyCommitted(ikey_.sequence);
+
+ // Apply the compaction filter to the first committed version of the user
+ // key.
+ if (current_key_committed_) {
+ InvokeFilterIfNeeded(&need_skip, &skip_until);
+ }
+ } else {
+ // Update the current key to reflect the new sequence number/type without
+ // copying the user key.
+ // TODO(rven): Compaction filter does not process keys in this path
+ // Need to have the compaction filter process multiple versions
+ // if we have versions on both sides of a snapshot
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+ key_ = current_key_.GetInternalKey();
+ ikey_.user_key = current_key_.GetUserKey();
+
+ // Note that a newer version of a key is ordered before older versions. If a
+ // newer version of a key is committed, so is the older version. No need
+ // to query snapshot_checker_ in that case.
+ if (UNLIKELY(!current_key_committed_)) {
+ assert(snapshot_checker_ != nullptr);
+ current_key_committed_ = KeyCommitted(ikey_.sequence);
+ // Apply the compaction filter to the first committed version of the
+ // user key.
+ if (current_key_committed_) {
+ InvokeFilterIfNeeded(&need_skip, &skip_until);
+ }
+ }
+ }
+
+ if (UNLIKELY(!current_key_committed_)) {
+ assert(snapshot_checker_ != nullptr);
+ valid_ = true;
+ break;
+ }
+
+ // If there are no snapshots, then this kv affects visibility at tip.
+ // Otherwise, search through all existing snapshots to find the earliest
+ // snapshot that is affected by this kv.
+ SequenceNumber last_sequence __attribute__((__unused__));
+ last_sequence = current_user_key_sequence_;
+ current_user_key_sequence_ = ikey_.sequence;
+ SequenceNumber last_snapshot = current_user_key_snapshot_;
+ SequenceNumber prev_snapshot = 0; // 0 means no previous snapshot
+ current_user_key_snapshot_ =
+ visible_at_tip_
+ ? earliest_snapshot_
+ : findEarliestVisibleSnapshot(ikey_.sequence, &prev_snapshot);
+
+ if (need_skip) {
+ // This case is handled below.
+ } else if (clear_and_output_next_key_) {
+ // In the previous iteration we encountered a single delete that we could
+ // not compact out. We will keep this Put, but can drop its data.
+ // (See Optimization 3, below.)
+ assert(ikey_.type == kTypeValue);
+ if (ikey_.type != kTypeValue) {
+ ROCKS_LOG_FATAL(info_log_,
+ "Unexpected key type %d for compaction output",
+ ikey_.type);
+ }
+ assert(current_user_key_snapshot_ == last_snapshot);
+ if (current_user_key_snapshot_ != last_snapshot) {
+ ROCKS_LOG_FATAL(info_log_,
+ "current_user_key_snapshot_ (%" PRIu64
+ ") != last_snapshot (%" PRIu64 ")",
+ current_user_key_snapshot_, last_snapshot);
+ }
+
+ value_.clear();
+ valid_ = true;
+ clear_and_output_next_key_ = false;
+ } else if (ikey_.type == kTypeSingleDeletion) {
+ // We can compact out a SingleDelete if:
+ // 1) We encounter the corresponding PUT -OR- we know that this key
+ // doesn't appear past this output level
+ // =AND=
+ // 2) We've already returned a record in this snapshot -OR-
+ // there is no earlier earliest_write_conflict_snapshot.
+ //
+ // Rule 1 is needed for SingleDelete correctness. Rule 2 is needed to
+ // allow Transactions to do write-conflict checking (if we compacted away
+ // all keys, then we wouldn't know that a write happened in this
+ // snapshot). If there is no earlier snapshot, then we know that there
+ // are no active transactions that need to know about any writes.
+ //
+ // Optimization 3:
+ // If we encounter a SingleDelete followed by a PUT and Rule 2 is NOT
+ // true, then we must output a SingleDelete. In this case, we will decide
+ // to also output the PUT. While we are compacting less by outputting the
+ // PUT now, hopefully this will lead to better compaction in the future
+ // when Rule 2 is later true (i.e., we are hoping we can later compact out
+ // both the SingleDelete and the Put, while we couldn't if we only
+ // outputted the SingleDelete now).
+ // In this case, we can save space by removing the PUT's value as it will
+ // never be read.
+ //
+ // Deletes and Merges are not supported on the same key that has a
+ // SingleDelete as it is not possible to correctly do any partial
+ // compaction of such a combination of operations. The result of mixing
+ // those operations for a given key is documented as being undefined. So
+ // we can choose how to handle such a combination of operations. We will
+ // try to compact out as much as we can in these cases.
+ // We will report counts on these anomalous cases.
+
+ // The easiest way to process a SingleDelete during iteration is to peek
+ // ahead at the next key.
+ ParsedInternalKey next_ikey;
+ input_->Next();
+
+ // Check whether the next key exists, is not corrupt, and is the same key
+ // as the single delete.
+ if (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) &&
+ cmp_->Equal(ikey_.user_key, next_ikey.user_key)) {
+ // Check whether the next key belongs to the same snapshot as the
+ // SingleDelete.
+ if (prev_snapshot == 0 ||
+ DEFINITELY_NOT_IN_SNAPSHOT(next_ikey.sequence, prev_snapshot)) {
+ if (next_ikey.type == kTypeSingleDeletion) {
+ // We encountered two SingleDeletes in a row. This could be due to
+ // unexpected user input.
+ // Skip the first SingleDelete and let the next iteration decide how
+ // to handle the second SingleDelete
+
+ // First SingleDelete has been skipped since we already called
+ // input_->Next().
+ ++iter_stats_.num_record_drop_obsolete;
+ ++iter_stats_.num_single_del_mismatch;
+ } else if (has_outputted_key_ ||
+ DEFINITELY_IN_SNAPSHOT(
+ ikey_.sequence, earliest_write_conflict_snapshot_)) {
+ // Found a matching value, we can drop the single delete and the
+ // value. It is safe to drop both records since we've already
+ // outputted a key in this snapshot, or there is no earlier
+ // snapshot (Rule 2 above).
+
+ // Note: it doesn't matter whether the second key is a Put or if it
+ // is an unexpected Merge or Delete. We will compact it out
+ // either way. We will maintain counts of how many mismatches
+ // happened
+ if (next_ikey.type != kTypeValue &&
+ next_ikey.type != kTypeBlobIndex) {
+ ++iter_stats_.num_single_del_mismatch;
+ }
+
+ ++iter_stats_.num_record_drop_hidden;
+ ++iter_stats_.num_record_drop_obsolete;
+ // Already called input_->Next() once. Call it a second time to
+ // skip past the second key.
+ input_->Next();
+ } else {
+ // Found a matching value, but we cannot drop both keys since
+ // there is an earlier snapshot and we need to leave behind a record
+ // to know that a write happened in this snapshot (Rule 2 above).
+ // Clear the value and output the SingleDelete. (The value will be
+ // outputted on the next iteration.)
+
+ // Setting valid_ to true will output the current SingleDelete
+ valid_ = true;
+
+ // Set up the Put to be outputted in the next iteration.
+ // (Optimization 3).
+ clear_and_output_next_key_ = true;
+ }
+ } else {
+ // We hit the next snapshot without hitting a put, so the iterator
+ // returns the single delete.
+ valid_ = true;
+ }
+ } else {
+ // We are at the end of the input, could not parse the next key, or hit
+ // a different key. The iterator returns the single delete if the key
+ // possibly exists beyond the current output level. We set
+ // has_current_user_key to false so that if the iterator is at the next
+ // key, we do not compare it again against the previous key at the next
+ // iteration. If the next key is corrupt, we return before the
+ // comparison, so the value of has_current_user_key does not matter.
+ has_current_user_key_ = false;
+ if (compaction_ != nullptr && IN_EARLIEST_SNAPSHOT(ikey_.sequence) &&
+ compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
+ &level_ptrs_)) {
+ // Key doesn't exist outside of this range.
+ // Can compact out this SingleDelete.
+ ++iter_stats_.num_record_drop_obsolete;
+ ++iter_stats_.num_single_del_fallthru;
+ if (!bottommost_level_) {
+ ++iter_stats_.num_optimized_del_drop_obsolete;
+ }
+ } else {
+ // Output SingleDelete
+ valid_ = true;
+ }
+ }
+
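+ // If we decided to output something above, remember that input_ is
+ // already positioned at the following entry, so it must not be advanced
+ // a second time.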
+ if (valid_) {
+ at_next_ = true;
+ }
+ } else if (last_snapshot == current_user_key_snapshot_ ||
+ (last_snapshot > 0 &&
+ last_snapshot < current_user_key_snapshot_)) {
+ // If the earliest snapshot in which this key is visible is the same
+ // as the earliest snapshot in which a previous instance of the same
+ // key is visible, then this kv is not visible in any snapshot: it is
+ // hidden by a newer entry for the same user key.
+ //
+ // Note: Dropping this key will not affect TransactionDB write-conflict
+ // checking since there has already been a record returned for this key
+ // in this snapshot.
+ assert(last_sequence >= current_user_key_sequence_);
+ if (last_sequence < current_user_key_sequence_) {
+ ROCKS_LOG_FATAL(info_log_,
+ "last_sequence (%" PRIu64
+ ") < current_user_key_sequence_ (%" PRIu64 ")",
+ last_sequence, current_user_key_sequence_);
+ }
+
+ ++iter_stats_.num_record_drop_hidden; // (A)
+ input_->Next();
+ } else if (compaction_ != nullptr && ikey_.type == kTypeDeletion &&
+ IN_EARLIEST_SNAPSHOT(ikey_.sequence) &&
+ ikeyNotNeededForIncrementalSnapshot() &&
+ compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
+ &level_ptrs_)) {
+ // TODO(noetzli): This is the only place where we use compaction_
+ // (besides the constructor). We should probably get rid of this
+ // dependency and find a way to do similar filtering during flushes.
+ //
+ // For this user key:
+ // (1) there is no data in higher levels
+ // (2) data in lower levels will have larger sequence numbers
+ // (3) data in layers that are being compacted here and have
+ // smaller sequence numbers will be dropped in the next
+ // few iterations of this loop (by rule (A) above).
+ // Therefore this deletion marker is obsolete and can be dropped.
+ //
+ // Note: Dropping this Delete will not affect TransactionDB
+ // write-conflict checking since it is earlier than any snapshot.
+ //
+ // It seems that we could also drop deletions later than the earliest
+ // snapshot, given that:
+ // (1) the deletion is earlier than earliest_write_conflict_snapshot, and
+ // (2) no value exists earlier than the deletion.
+ ++iter_stats_.num_record_drop_obsolete;
+ if (!bottommost_level_) {
+ ++iter_stats_.num_optimized_del_drop_obsolete;
+ }
+ input_->Next();
+ } else if ((ikey_.type == kTypeDeletion) && bottommost_level_ &&
+ ikeyNotNeededForIncrementalSnapshot()) {
+ // Handle the case where we have a delete key at the bottommost level.
+ // We can skip outputting the key iff there are no subsequent puts for
+ // this key.
+ ParsedInternalKey next_ikey;
+ input_->Next();
+ // Skip over all versions of this key that happen to occur in the same
+ // snapshot range as the delete.
+ while (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) &&
+ cmp_->Equal(ikey_.user_key, next_ikey.user_key) &&
+ (prev_snapshot == 0 ||
+ DEFINITELY_NOT_IN_SNAPSHOT(next_ikey.sequence, prev_snapshot))) {
+ input_->Next();
+ }
+ // If a record for this key remains after the skip, we still need to
+ // output the delete too.
+ if (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) &&
+ cmp_->Equal(ikey_.user_key, next_ikey.user_key)) {
+ valid_ = true;
+ at_next_ = true;
+ }
+ } else if (ikey_.type == kTypeMerge) {
+ if (!merge_helper_->HasOperator()) {
+ status_ = Status::InvalidArgument(
+ "merge_operator is not properly initialized.");
+ return;
+ }
+
+ pinned_iters_mgr_.StartPinning();
+ // We know the merge type entry is not hidden, otherwise we would
+ // have hit (A).
+ // We encapsulate the merge-related state machine in a different
+ // object to minimize changes to the existing flow.
+ Status s = merge_helper_->MergeUntil(input_, range_del_agg_,
+ prev_snapshot, bottommost_level_);
+ merge_out_iter_.SeekToFirst();
+
+ if (!s.ok() && !s.IsMergeInProgress()) {
+ status_ = s;
+ return;
+ } else if (merge_out_iter_.Valid()) {
+ // NOTE: key, value, and ikey_ refer to old entries.
+ // These will be correctly set below.
+ key_ = merge_out_iter_.key();
+ value_ = merge_out_iter_.value();
+ bool valid_key __attribute__((__unused__));
+ valid_key = ParseInternalKey(key_, &ikey_);
+ // MergeUntil stops when it encounters a corrupt key and does not
+ // include it in the result, so we expect the keys here to be valid.
+ assert(valid_key);
+ if (!valid_key) {
+ ROCKS_LOG_FATAL(info_log_, "Invalid key (%s) in compaction",
+ key_.ToString(true).c_str());
+ }
+ // Keep current_key_ in sync.
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+ key_ = current_key_.GetInternalKey();
+ ikey_.user_key = current_key_.GetUserKey();
+ valid_ = true;
+ } else {
+ // All merge operands were filtered out. Reset the user key, since the
+ // batch consumed by the merge operator should not shadow any keys
+ // coming after the merges.
+ has_current_user_key_ = false;
+ pinned_iters_mgr_.ReleasePinnedData();
+
+ if (merge_helper_->FilteredUntil(&skip_until)) {
+ need_skip = true;
+ }
+ }
+ } else {
+ // 1. new user key -OR-
+ // 2. different snapshot stripe
+ bool should_delete = range_del_agg_->ShouldDelete(
+ key_, RangeDelPositioningMode::kForwardTraversal);
+ if (should_delete) {
+ ++iter_stats_.num_record_drop_hidden;
+ ++iter_stats_.num_record_drop_range_del;
+ input_->Next();
+ } else {
+ valid_ = true;
+ }
+ }
+
+ if (need_skip) {
+ input_->Seek(skip_until);
+ }
+ }
+
+ if (!valid_ && IsShuttingDown()) {
+ status_ = Status::ShutdownInProgress();
+ }
+
+ if (IsPausingManualCompaction()) {
+ status_ = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+}
+
+void CompactionIterator::PrepareOutput() {
+ if (valid_) {
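+ // Give the compaction filter a chance to rewrite blob references (e.g.
+ // for BlobDB garbage collection) before the entry is output.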
+ if (compaction_filter_ && ikey_.type == kTypeBlobIndex) {
+ const auto blob_decision = compaction_filter_->PrepareBlobOutput(
+ user_key(), value_, &compaction_filter_value_);
+
+ if (blob_decision == CompactionFilter::BlobDecision::kCorruption) {
+ status_ = Status::Corruption(
+ "Corrupted blob reference encountered during GC");
+ valid_ = false;
+ } else if (blob_decision == CompactionFilter::BlobDecision::kIOError) {
+ status_ = Status::IOError("Could not relocate blob during GC");
+ valid_ = false;
+ } else if (blob_decision ==
+ CompactionFilter::BlobDecision::kChangeValue) {
+ value_ = compaction_filter_value_;
+ }
+ }
+
+ // Zeroing out the sequence number leads to better compression.
+ // If this is the bottommost level (no files in lower levels)
+ // and the earliest snapshot is larger than this seqno
+ // and the userkey differs from the last userkey in compaction
+ // then we can squash the seqno to zero.
+ //
+ // This is safe for TransactionDB write-conflict checking since transactions
+ // only care about sequence numbers larger than any active snapshot.
+ //
+ // Can we do the same for levels above the bottom level as long as
+ // KeyNotExistsBeyondOutputLevel() returns true?
+ if (valid_ && compaction_ != nullptr &&
+ !compaction_->allow_ingest_behind() &&
+ ikeyNotNeededForIncrementalSnapshot() && bottommost_level_ &&
+ IN_EARLIEST_SNAPSHOT(ikey_.sequence) && ikey_.type != kTypeMerge) {
+ assert(ikey_.type != kTypeDeletion && ikey_.type != kTypeSingleDeletion);
+ if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion) {
+ ROCKS_LOG_FATAL(info_log_,
+ "Unexpected key type %d for seq-zero optimization",
+ ikey_.type);
+ }
+ ikey_.sequence = 0;
+ current_key_.UpdateInternalKey(0, ikey_.type);
+ }
+ }
+}
+
+inline SequenceNumber CompactionIterator::findEarliestVisibleSnapshot(
+ SequenceNumber in, SequenceNumber* prev_snapshot) {
+ assert(snapshots_->size());
+ if (snapshots_->size() == 0) {
+ ROCKS_LOG_FATAL(info_log_,
+ "No snapshot left in findEarliestVisibleSnapshot");
+ }
+ auto snapshots_iter = std::lower_bound(
+ snapshots_->begin(), snapshots_->end(), in);
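+ // std::lower_bound returns the first snapshot whose sequence number is
+ // >= `in`; snapshots before it have smaller sequence numbers and cannot
+ // see this entry.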
+ if (snapshots_iter == snapshots_->begin()) {
+ *prev_snapshot = 0;
+ } else {
+ *prev_snapshot = *std::prev(snapshots_iter);
+ assert(*prev_snapshot < in);
+ if (*prev_snapshot >= in) {
+ ROCKS_LOG_FATAL(info_log_,
+ "*prev_snapshot >= in in findEarliestVisibleSnapshot");
+ }
+ }
+ if (snapshot_checker_ == nullptr) {
+ return snapshots_iter != snapshots_->end()
+ ? *snapshots_iter : kMaxSequenceNumber;
+ }
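+ // With a SnapshotChecker (e.g. WritePrepared transactions), a snapshot
+ // with a larger sequence number does not necessarily see this entry, so
+ // walk forward until one is confirmed to contain it, skipping snapshots
+ // that have already been released.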
+ bool has_released_snapshot = !released_snapshots_.empty();
+ for (; snapshots_iter != snapshots_->end(); ++snapshots_iter) {
+ auto cur = *snapshots_iter;
+ assert(in <= cur);
+ if (in > cur) {
+ ROCKS_LOG_FATAL(info_log_, "in > cur in findEarliestVisibleSnapshot");
+ }
+ // Skip if cur is in released_snapshots.
+ if (has_released_snapshot && released_snapshots_.count(cur) > 0) {
+ continue;
+ }
+ auto res = snapshot_checker_->CheckInSnapshot(in, cur);
+ if (res == SnapshotCheckerResult::kInSnapshot) {
+ return cur;
+ } else if (res == SnapshotCheckerResult::kSnapshotReleased) {
+ released_snapshots_.insert(cur);
+ }
+ *prev_snapshot = cur;
+ }
+ return kMaxSequenceNumber;
+}
+
+// Used in two places: prevents deletion markers from being dropped if they
+// may still be needed, and disables seqnum zero-out in PrepareOutput for
+// recent keys.
+inline bool CompactionIterator::ikeyNotNeededForIncrementalSnapshot() {
+ return (!compaction_->preserve_deletes()) ||
+ (ikey_.sequence < preserve_deletes_seqnum_);
+}
+
+bool CompactionIterator::IsInEarliestSnapshot(SequenceNumber sequence) {
+ assert(snapshot_checker_ != nullptr);
+ bool pre_condition = (earliest_snapshot_ == kMaxSequenceNumber ||
+ (earliest_snapshot_iter_ != snapshots_->end() &&
+ *earliest_snapshot_iter_ == earliest_snapshot_));
+ assert(pre_condition);
+ if (!pre_condition) {
+ ROCKS_LOG_FATAL(info_log_,
+ "Pre-Condition is not hold in IsInEarliestSnapshot");
+ }
+ auto in_snapshot =
+ snapshot_checker_->CheckInSnapshot(sequence, earliest_snapshot_);
+ while (UNLIKELY(in_snapshot == SnapshotCheckerResult::kSnapshotReleased)) {
+ // Avoid the current earliest_snapshot_ being returned as the
+ // earliest visible snapshot for the next value, so that if a value's
+ // sequence is zeroed out by PrepareOutput(), the next value will be
+ // compacted out.
+ released_snapshots_.insert(earliest_snapshot_);
+ earliest_snapshot_iter_++;
+
+ if (earliest_snapshot_iter_ == snapshots_->end()) {
+ earliest_snapshot_ = kMaxSequenceNumber;
+ } else {
+ earliest_snapshot_ = *earliest_snapshot_iter_;
+ }
+ in_snapshot =
+ snapshot_checker_->CheckInSnapshot(sequence, earliest_snapshot_);
+ }
+ assert(in_snapshot != SnapshotCheckerResult::kSnapshotReleased);
+ if (in_snapshot == SnapshotCheckerResult::kSnapshotReleased) {
+ ROCKS_LOG_FATAL(info_log_,
+ "Unexpected released snapshot in IsInEarliestSnapshot");
+ }
+ return in_snapshot == SnapshotCheckerResult::kInSnapshot;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_iterator.h b/src/rocksdb/db/compaction/compaction_iterator.h
new file mode 100644
index 000000000..8be60eb9e
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iterator.h
@@ -0,0 +1,240 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <algorithm>
+#include <deque>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_iteration_stats.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_del_aggregator.h"
+#include "db/snapshot_checker.h"
+#include "options/cf_options.h"
+#include "rocksdb/compaction_filter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CompactionIterator {
+ public:
+ // A wrapper around Compaction. Has a much smaller interface, only what
+ // CompactionIterator uses. Tests can override it.
+ class CompactionProxy {
+ public:
+ explicit CompactionProxy(const Compaction* compaction)
+ : compaction_(compaction) {}
+
+ virtual ~CompactionProxy() = default;
+ virtual int level(size_t /*compaction_input_level*/ = 0) const {
+ return compaction_->level();
+ }
+ virtual bool KeyNotExistsBeyondOutputLevel(
+ const Slice& user_key, std::vector<size_t>* level_ptrs) const {
+ return compaction_->KeyNotExistsBeyondOutputLevel(user_key, level_ptrs);
+ }
+ virtual bool bottommost_level() const {
+ return compaction_->bottommost_level();
+ }
+ virtual int number_levels() const { return compaction_->number_levels(); }
+ virtual Slice GetLargestUserKey() const {
+ return compaction_->GetLargestUserKey();
+ }
+ virtual bool allow_ingest_behind() const {
+ return compaction_->immutable_cf_options()->allow_ingest_behind;
+ }
+ virtual bool preserve_deletes() const {
+ return compaction_->immutable_cf_options()->preserve_deletes;
+ }
+
+ protected:
+ CompactionProxy() = default;
+
+ private:
+ const Compaction* compaction_;
+ };
+
+ CompactionIterator(
+ InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+ SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ const SnapshotChecker* snapshot_checker, Env* env,
+ bool report_detailed_time, bool expect_valid_internal_key,
+ CompactionRangeDelAggregator* range_del_agg,
+ const Compaction* compaction = nullptr,
+ const CompactionFilter* compaction_filter = nullptr,
+ const std::atomic<bool>* shutting_down = nullptr,
+ const SequenceNumber preserve_deletes_seqnum = 0,
+ const std::atomic<bool>* manual_compaction_paused = nullptr,
+ const std::shared_ptr<Logger> info_log = nullptr);
+
+ // Constructor with custom CompactionProxy, used for tests.
+ CompactionIterator(
+ InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+ SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ const SnapshotChecker* snapshot_checker, Env* env,
+ bool report_detailed_time, bool expect_valid_internal_key,
+ CompactionRangeDelAggregator* range_del_agg,
+ std::unique_ptr<CompactionProxy> compaction,
+ const CompactionFilter* compaction_filter = nullptr,
+ const std::atomic<bool>* shutting_down = nullptr,
+ const SequenceNumber preserve_deletes_seqnum = 0,
+ const std::atomic<bool>* manual_compaction_paused = nullptr,
+ const std::shared_ptr<Logger> info_log = nullptr);
+
+ ~CompactionIterator();
+
+ void ResetRecordCounts();
+
+ // Seek to the beginning of the compaction iterator output.
+ //
+ // REQUIRED: Call only once.
+ void SeekToFirst();
+
+ // Produces the next record in the compaction.
+ //
+ // REQUIRED: SeekToFirst() has been called.
+ void Next();
+
+ // Getters
+ const Slice& key() const { return key_; }
+ const Slice& value() const { return value_; }
+ const Status& status() const { return status_; }
+ const ParsedInternalKey& ikey() const { return ikey_; }
+ bool Valid() const { return valid_; }
+ const Slice& user_key() const { return current_user_key_; }
+ const CompactionIterationStats& iter_stats() const { return iter_stats_; }
+
+ private:
+ // Processes the input stream to find the next output
+ void NextFromInput();
+
+ // Do last preparations before presenting the output to the caller. At this
+ // point this only zeroes out the sequence number if possible, for better
+ // compression.
+ void PrepareOutput();
+
+ // Invoke compaction filter if needed.
+ void InvokeFilterIfNeeded(bool* need_skip, Slice* skip_until);
+
+ // Given a sequence number, return the sequence number of the
+ // earliest snapshot that this sequence number is visible in.
+ // The snapshots themselves are arranged in ascending order of
+ // sequence numbers.
+ // The total number of snapshots is typically small, so the search
+ // (a std::lower_bound plus, with a SnapshotChecker, a short forward
+ // scan) is cheap.
+ inline SequenceNumber findEarliestVisibleSnapshot(
+ SequenceNumber in, SequenceNumber* prev_snapshot);
+
+ // Checks whether the currently seen ikey_ is needed for an
+ // incremental (differential) snapshot and hence can't be dropped
+ // or have its seqnum zeroed out, even if all other conditions are met.
+ inline bool ikeyNotNeededForIncrementalSnapshot();
+
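+ // Returns true if the write at `sequence` is known to be committed: either
+ // there is no snapshot checker, or the checker reports the sequence as in
+ // the snapshot at kMaxSequenceNumber, i.e. visible at the current tip.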
+ inline bool KeyCommitted(SequenceNumber sequence) {
+ return snapshot_checker_ == nullptr ||
+ snapshot_checker_->CheckInSnapshot(sequence, kMaxSequenceNumber) ==
+ SnapshotCheckerResult::kInSnapshot;
+ }
+
+ bool IsInEarliestSnapshot(SequenceNumber sequence);
+
+ InternalIterator* input_;
+ const Comparator* cmp_;
+ MergeHelper* merge_helper_;
+ const std::vector<SequenceNumber>* snapshots_;
+ // List of snapshots released during compaction.
+ // findEarliestVisibleSnapshot() discovers them from the return value of
+ // snapshot_checker_, and makes sure they will not be returned as the
+ // earliest visible snapshot of an older value.
+ // See WritePreparedTransactionTest::ReleaseSnapshotDuringCompaction3.
+ std::unordered_set<SequenceNumber> released_snapshots_;
+ std::vector<SequenceNumber>::const_iterator earliest_snapshot_iter_;
+ const SequenceNumber earliest_write_conflict_snapshot_;
+ const SnapshotChecker* const snapshot_checker_;
+ Env* env_;
+ bool report_detailed_time_;
+ bool expect_valid_internal_key_;
+ CompactionRangeDelAggregator* range_del_agg_;
+ std::unique_ptr<CompactionProxy> compaction_;
+ const CompactionFilter* compaction_filter_;
+ const std::atomic<bool>* shutting_down_;
+ const std::atomic<bool>* manual_compaction_paused_;
+ const SequenceNumber preserve_deletes_seqnum_;
+ bool bottommost_level_;
+ bool valid_ = false;
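+ // True when there are no snapshots, in which case every committed key is
+ // visible at the "tip" (the latest sequence number).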
+ bool visible_at_tip_;
+ SequenceNumber earliest_snapshot_;
+ SequenceNumber latest_snapshot_;
+
+ // State
+ //
+ // Points to a copy of the current compaction iterator output (current_key_)
+ // if valid_.
+ Slice key_;
+ // Points to the value in the underlying iterator that corresponds to the
+ // current output.
+ Slice value_;
+ // The status is OK unless compaction iterator encounters a merge operand
+ // while not having a merge operator defined.
+ Status status_;
+ // Stores the user key, sequence number and type of the current compaction
+ // iterator output (or current key in the underlying iterator during
+ // NextFromInput()).
+ ParsedInternalKey ikey_;
+ // Stores whether ikey_.user_key is valid. If set to false, the user key is
+ // not compared against the current key in the underlying iterator.
+ bool has_current_user_key_ = false;
+ // True if the underlying input_ iterator has already been advanced to the
+ // entry after the current output (e.g. after peeking past a SingleDelete),
+ // so Next() should not advance it again.
+ bool at_next_ = false;
+ // Holds a copy of the current compaction iterator output (or current key in
+ // the underlying iterator during NextFromInput()).
+ IterKey current_key_;
+ Slice current_user_key_;
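+ // Sequence number and earliest visible snapshot of the most recently
+ // examined entry for the current user key; NextFromInput() compares them
+ // against the next entry to detect versions hidden within the same
+ // snapshot stripe.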
+ SequenceNumber current_user_key_sequence_;
+ SequenceNumber current_user_key_snapshot_;
+
+ // True if the iterator has already returned a record for the current key.
+ bool has_outputted_key_ = false;
+
+ // If true, clear the value of the next key and output it without applying
+ // any compaction rules. This is used for outputting a Put after a
+ // SingleDelete.
+ bool clear_and_output_next_key_ = false;
+
+ MergeOutputIterator merge_out_iter_;
+ // PinnedIteratorsManager used to pin input_ Iterator blocks while reading
+ // merge operands and then releasing them after consuming them.
+ PinnedIteratorsManager pinned_iters_mgr_;
+ std::string compaction_filter_value_;
+ InternalKey compaction_filter_skip_until_;
+ // "level_ptrs" holds indices that remember which file of an associated
+ // level we were last checking during the last call to compaction->
+ // KeyNotExistsBeyondOutputLevel(). This allows future calls to the function
+ // to pick off where it left off since each subcompaction's key range is
+ // increasing so a later call to the function must be looking for a key that
+ // is in or beyond the last file checked during the previous call
+ std::vector<size_t> level_ptrs_;
+ CompactionIterationStats iter_stats_;
+
+ // Used to avoid purging uncommitted values. The application can specify
+ // uncommitted values by providing a SnapshotChecker object.
+ bool current_key_committed_;
+ std::shared_ptr<Logger> info_log_;
+
+ bool IsShuttingDown() {
+ // This is a best-effort facility, so memory_order_relaxed is sufficient.
+ return shutting_down_ && shutting_down_->load(std::memory_order_relaxed);
+ }
+
+ bool IsPausingManualCompaction() {
+ // This is a best-effort facility, so memory_order_relaxed is sufficient.
+ return manual_compaction_paused_ &&
+ manual_compaction_paused_->load(std::memory_order_relaxed);
+ }
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_iterator_test.cc b/src/rocksdb/db/compaction/compaction_iterator_test.cc
new file mode 100644
index 000000000..0c50fb9ba
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iterator_test.cc
@@ -0,0 +1,976 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+
+#include <string>
+#include <vector>
+
+#include "db/compaction/compaction_iterator.h"
+#include "port/port.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Expects no merging attempts.
+class NoMergingMergeOp : public MergeOperator {
+ public:
+ bool FullMergeV2(const MergeOperationInput& /*merge_in*/,
+ MergeOperationOutput* /*merge_out*/) const override {
+ ADD_FAILURE();
+ return false;
+ }
+ bool PartialMergeMulti(const Slice& /*key*/,
+ const std::deque<Slice>& /*operand_list*/,
+ std::string* /*new_value*/,
+ Logger* /*logger*/) const override {
+ ADD_FAILURE();
+ return false;
+ }
+ const char* Name() const override {
+ return "CompactionIteratorTest NoMergingMergeOp";
+ }
+};
+
+// Compaction filter that gets stuck when it sees a particular key,
+// then gets unstuck when told to.
+// Always returns Decision::kRemove.
+class StallingFilter : public CompactionFilter {
+ public:
+ Decision FilterV2(int /*level*/, const Slice& key, ValueType /*type*/,
+ const Slice& /*existing_value*/, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ int k = std::atoi(key.ToString().c_str());
+ last_seen.store(k);
+ while (k >= stall_at.load()) {
+ std::this_thread::yield();
+ }
+ return Decision::kRemove;
+ }
+
+ const char* Name() const override {
+ return "CompactionIteratorTest StallingFilter";
+ }
+
+ // Wait until the filter sees a key >= k and stalls at that key.
+ // If `exact`, asserts that the seen key is equal to k.
+ void WaitForStall(int k, bool exact = true) {
+ stall_at.store(k);
+ while (last_seen.load() < k) {
+ std::this_thread::yield();
+ }
+ if (exact) {
+ EXPECT_EQ(k, last_seen.load());
+ }
+ }
+
+ // Filter will stall on key >= stall_at. Advance stall_at to unstall.
+ mutable std::atomic<int> stall_at{0};
+ // Last key the filter was called with.
+ mutable std::atomic<int> last_seen{0};
+};
+
+// Compaction filter that filters out all keys.
+class FilterAllKeysCompactionFilter : public CompactionFilter {
+ public:
+ Decision FilterV2(int /*level*/, const Slice& /*key*/, ValueType /*type*/,
+ const Slice& /*existing_value*/, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ return Decision::kRemove;
+ }
+
+ const char* Name() const override { return "AllKeysCompactionFilter"; }
+};
+
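+// A forward-only InternalIterator over parallel key/value vectors that logs
+// every SeekToFirst/Seek/Next call, so tests can assert on the exact access
+// pattern of the compaction iterator (see CompactionFilterSkipUntil).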
+class LoggingForwardVectorIterator : public InternalIterator {
+ public:
+ struct Action {
+ enum class Type {
+ SEEK_TO_FIRST,
+ SEEK,
+ NEXT,
+ };
+
+ Type type;
+ std::string arg;
+
+ explicit Action(Type _type, std::string _arg = "")
+ : type(_type), arg(_arg) {}
+
+ bool operator==(const Action& rhs) const {
+ return std::tie(type, arg) == std::tie(rhs.type, rhs.arg);
+ }
+ };
+
+ LoggingForwardVectorIterator(const std::vector<std::string>& keys,
+ const std::vector<std::string>& values)
+ : keys_(keys), values_(values), current_(keys.size()) {
+ assert(keys_.size() == values_.size());
+ }
+
+ bool Valid() const override { return current_ < keys_.size(); }
+
+ void SeekToFirst() override {
+ log.emplace_back(Action::Type::SEEK_TO_FIRST);
+ current_ = 0;
+ }
+ void SeekToLast() override { assert(false); }
+
+ void Seek(const Slice& target) override {
+ log.emplace_back(Action::Type::SEEK, target.ToString());
+ current_ = std::lower_bound(keys_.begin(), keys_.end(), target.ToString()) -
+ keys_.begin();
+ }
+
+ void SeekForPrev(const Slice& /*target*/) override { assert(false); }
+
+ void Next() override {
+ assert(Valid());
+ log.emplace_back(Action::Type::NEXT);
+ current_++;
+ }
+ void Prev() override { assert(false); }
+
+ Slice key() const override {
+ assert(Valid());
+ return Slice(keys_[current_]);
+ }
+ Slice value() const override {
+ assert(Valid());
+ return Slice(values_[current_]);
+ }
+
+ Status status() const override { return Status::OK(); }
+
+ std::vector<Action> log;
+
+ private:
+ std::vector<std::string> keys_;
+ std::vector<std::string> values_;
+ size_t current_;
+};
+
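+// A minimal CompactionProxy stub; its level/bottommost behavior is controlled
+// through the public flags below.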
+class FakeCompaction : public CompactionIterator::CompactionProxy {
+ public:
+ FakeCompaction() = default;
+
+ int level(size_t /*compaction_input_level*/) const override { return 0; }
+ bool KeyNotExistsBeyondOutputLevel(
+ const Slice& /*user_key*/,
+ std::vector<size_t>* /*level_ptrs*/) const override {
+ return is_bottommost_level || key_not_exists_beyond_output_level;
+ }
+ bool bottommost_level() const override { return is_bottommost_level; }
+ int number_levels() const override { return 1; }
+ Slice GetLargestUserKey() const override {
+ return "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
+ }
+ bool allow_ingest_behind() const override { return false; }
+
+ bool preserve_deletes() const override { return false; }
+
+ bool key_not_exists_beyond_output_level = false;
+
+ bool is_bottommost_level = false;
+};
+
+// A simplified snapshot checker which assumes each snapshot has a global
+// last visible sequence.
+class TestSnapshotChecker : public SnapshotChecker {
+ public:
+ explicit TestSnapshotChecker(
+ SequenceNumber last_committed_sequence,
+ const std::unordered_map<SequenceNumber, SequenceNumber>& snapshots = {{}})
+ : last_committed_sequence_(last_committed_sequence),
+ snapshots_(snapshots) {}
+
+ SnapshotCheckerResult CheckInSnapshot(
+ SequenceNumber seq, SequenceNumber snapshot_seq) const override {
+ if (snapshot_seq == kMaxSequenceNumber) {
+ return seq <= last_committed_sequence_
+ ? SnapshotCheckerResult::kInSnapshot
+ : SnapshotCheckerResult::kNotInSnapshot;
+ }
+ assert(snapshots_.count(snapshot_seq) > 0);
+ return seq <= snapshots_.at(snapshot_seq)
+ ? SnapshotCheckerResult::kInSnapshot
+ : SnapshotCheckerResult::kNotInSnapshot;
+ }
+
+ private:
+ SequenceNumber last_committed_sequence_;
+ // A map from a valid snapshot to the last sequence number visible to
+ // that snapshot.
+ std::unordered_map<SequenceNumber, SequenceNumber> snapshots_;
+};
+
+// Test param:
+// bool: whether to pass snapshot_checker to compaction iterator.
+class CompactionIteratorTest : public testing::TestWithParam<bool> {
+ public:
+ CompactionIteratorTest()
+ : cmp_(BytewiseComparator()), icmp_(cmp_), snapshots_({}) {}
+
+ void InitIterators(
+ const std::vector<std::string>& ks, const std::vector<std::string>& vs,
+ const std::vector<std::string>& range_del_ks,
+ const std::vector<std::string>& range_del_vs,
+ SequenceNumber last_sequence,
+ SequenceNumber last_committed_sequence = kMaxSequenceNumber,
+ MergeOperator* merge_op = nullptr, CompactionFilter* filter = nullptr,
+ bool bottommost_level = false,
+ SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber) {
+ std::unique_ptr<InternalIterator> unfragmented_range_del_iter(
+ new test::VectorIterator(range_del_ks, range_del_vs));
+ auto tombstone_list = std::make_shared<FragmentedRangeTombstoneList>(
+ std::move(unfragmented_range_del_iter), icmp_);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ new FragmentedRangeTombstoneIterator(tombstone_list, icmp_,
+ kMaxSequenceNumber));
+ range_del_agg_.reset(new CompactionRangeDelAggregator(&icmp_, snapshots_));
+ range_del_agg_->AddTombstones(std::move(range_del_iter));
+
+ std::unique_ptr<CompactionIterator::CompactionProxy> compaction;
+ if (filter || bottommost_level) {
+ compaction_proxy_ = new FakeCompaction();
+ compaction_proxy_->is_bottommost_level = bottommost_level;
+ compaction.reset(compaction_proxy_);
+ }
+ bool use_snapshot_checker = UseSnapshotChecker() || GetParam();
+ if (use_snapshot_checker || last_committed_sequence < kMaxSequenceNumber) {
+ snapshot_checker_.reset(
+ new TestSnapshotChecker(last_committed_sequence, snapshot_map_));
+ }
+ merge_helper_.reset(
+ new MergeHelper(Env::Default(), cmp_, merge_op, filter, nullptr, false,
+ 0 /*latest_snapshot*/, snapshot_checker_.get(),
+ 0 /*level*/, nullptr /*statistics*/, &shutting_down_));
+
+ iter_.reset(new LoggingForwardVectorIterator(ks, vs));
+ iter_->SeekToFirst();
+ c_iter_.reset(new CompactionIterator(
+ iter_.get(), cmp_, merge_helper_.get(), last_sequence, &snapshots_,
+ earliest_write_conflict_snapshot, snapshot_checker_.get(),
+ Env::Default(), false /* report_detailed_time */, false,
+ range_del_agg_.get(), std::move(compaction), filter, &shutting_down_));
+ }
+
+ void AddSnapshot(SequenceNumber snapshot,
+ SequenceNumber last_visible_seq = kMaxSequenceNumber) {
+ snapshots_.push_back(snapshot);
+ snapshot_map_[snapshot] = last_visible_seq;
+ }
+
+ virtual bool UseSnapshotChecker() const { return false; }
+
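+ // Feeds input_keys/input_values through a fresh CompactionIterator and
+ // verifies that it emits exactly expected_keys/expected_values, in order.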
+ void RunTest(
+ const std::vector<std::string>& input_keys,
+ const std::vector<std::string>& input_values,
+ const std::vector<std::string>& expected_keys,
+ const std::vector<std::string>& expected_values,
+ SequenceNumber last_committed_seq = kMaxSequenceNumber,
+ MergeOperator* merge_operator = nullptr,
+ CompactionFilter* compaction_filter = nullptr,
+ bool bottommost_level = false,
+ SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber) {
+ InitIterators(input_keys, input_values, {}, {}, kMaxSequenceNumber,
+ last_committed_seq, merge_operator, compaction_filter,
+ bottommost_level, earliest_write_conflict_snapshot);
+ c_iter_->SeekToFirst();
+ for (size_t i = 0; i < expected_keys.size(); i++) {
+ std::string info = "i = " + ToString(i);
+ ASSERT_TRUE(c_iter_->Valid()) << info;
+ ASSERT_OK(c_iter_->status()) << info;
+ ASSERT_EQ(expected_keys[i], c_iter_->key().ToString()) << info;
+ ASSERT_EQ(expected_values[i], c_iter_->value().ToString()) << info;
+ c_iter_->Next();
+ }
+ ASSERT_FALSE(c_iter_->Valid());
+ }
+
+ const Comparator* cmp_;
+ const InternalKeyComparator icmp_;
+ std::vector<SequenceNumber> snapshots_;
+ // A map from a valid snapshot to the last sequence number visible to
+ // that snapshot.
+ std::unordered_map<SequenceNumber, SequenceNumber> snapshot_map_;
+ std::unique_ptr<MergeHelper> merge_helper_;
+ std::unique_ptr<LoggingForwardVectorIterator> iter_;
+ std::unique_ptr<CompactionIterator> c_iter_;
+ std::unique_ptr<CompactionRangeDelAggregator> range_del_agg_;
+ std::unique_ptr<SnapshotChecker> snapshot_checker_;
+ std::atomic<bool> shutting_down_{false};
+ FakeCompaction* compaction_proxy_ = nullptr;
+};
+
+// It is possible that the output of the compaction iterator is empty even if
+// the input is not.
+TEST_P(CompactionIteratorTest, EmptyResult) {
+ InitIterators({test::KeyStr("a", 5, kTypeSingleDeletion),
+ test::KeyStr("a", 3, kTypeValue)},
+ {"", "val"}, {}, {}, 5);
+ c_iter_->SeekToFirst();
+ ASSERT_FALSE(c_iter_->Valid());
+}
+
+// If there is a corruption after a single deletion, the corrupted key should
+// be preserved.
+TEST_P(CompactionIteratorTest, CorruptionAfterSingleDeletion) {
+ InitIterators({test::KeyStr("a", 5, kTypeSingleDeletion),
+ test::KeyStr("a", 3, kTypeValue, true),
+ test::KeyStr("b", 10, kTypeValue)},
+ {"", "val", "val2"}, {}, {}, 10);
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 5, kTypeSingleDeletion),
+ c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 3, kTypeValue, true), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("b", 10, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_FALSE(c_iter_->Valid());
+}
+
+TEST_P(CompactionIteratorTest, SimpleRangeDeletion) {
+ InitIterators({test::KeyStr("morning", 5, kTypeValue),
+ test::KeyStr("morning", 2, kTypeValue),
+ test::KeyStr("night", 3, kTypeValue)},
+ {"zao", "zao", "wan"},
+ {test::KeyStr("ma", 4, kTypeRangeDeletion)}, {"mz"}, 5);
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("morning", 5, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("night", 3, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_FALSE(c_iter_->Valid());
+}
+
+TEST_P(CompactionIteratorTest, RangeDeletionWithSnapshots) {
+ AddSnapshot(10);
+ std::vector<std::string> ks1;
+ ks1.push_back(test::KeyStr("ma", 28, kTypeRangeDeletion));
+ std::vector<std::string> vs1{"mz"};
+ std::vector<std::string> ks2{test::KeyStr("morning", 15, kTypeValue),
+ test::KeyStr("morning", 5, kTypeValue),
+ test::KeyStr("night", 40, kTypeValue),
+ test::KeyStr("night", 20, kTypeValue)};
+ std::vector<std::string> vs2{"zao 15", "zao 5", "wan 40", "wan 20"};
+ InitIterators(ks2, vs2, ks1, vs1, 40);
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("morning", 5, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("night", 40, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_FALSE(c_iter_->Valid());
+}
+
+TEST_P(CompactionIteratorTest, CompactionFilterSkipUntil) {
+ class Filter : public CompactionFilter {
+ Decision FilterV2(int /*level*/, const Slice& key, ValueType t,
+ const Slice& existing_value, std::string* /*new_value*/,
+ std::string* skip_until) const override {
+ std::string k = key.ToString();
+ std::string v = existing_value.ToString();
+ // See InitIterators() call below for the sequence of keys and their
+ // filtering decisions. Here we closely assert that compaction filter is
+ // called with the expected keys and only them, and with the right values.
+ if (k == "a") {
+ EXPECT_EQ(ValueType::kValue, t);
+ EXPECT_EQ("av50", v);
+ return Decision::kKeep;
+ }
+ if (k == "b") {
+ EXPECT_EQ(ValueType::kValue, t);
+ EXPECT_EQ("bv60", v);
+ *skip_until = "d+";
+ return Decision::kRemoveAndSkipUntil;
+ }
+ if (k == "e") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ EXPECT_EQ("em71", v);
+ return Decision::kKeep;
+ }
+ if (k == "f") {
+ if (v == "fm65") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ *skip_until = "f";
+ } else {
+ EXPECT_EQ("fm30", v);
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ *skip_until = "g+";
+ }
+ return Decision::kRemoveAndSkipUntil;
+ }
+ if (k == "h") {
+ EXPECT_EQ(ValueType::kValue, t);
+ EXPECT_EQ("hv91", v);
+ return Decision::kKeep;
+ }
+ if (k == "i") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ EXPECT_EQ("im95", v);
+ *skip_until = "z";
+ return Decision::kRemoveAndSkipUntil;
+ }
+ ADD_FAILURE();
+ return Decision::kKeep;
+ }
+
+ const char* Name() const override {
+ return "CompactionIteratorTest.CompactionFilterSkipUntil::Filter";
+ }
+ };
+
+ NoMergingMergeOp merge_op;
+ Filter filter;
+ InitIterators(
+ {test::KeyStr("a", 50, kTypeValue), // keep
+ test::KeyStr("a", 45, kTypeMerge),
+ test::KeyStr("b", 60, kTypeValue), // skip to "d+"
+ test::KeyStr("b", 40, kTypeValue), test::KeyStr("c", 35, kTypeValue),
+ test::KeyStr("d", 70, kTypeMerge),
+ test::KeyStr("e", 71, kTypeMerge), // keep
+ test::KeyStr("f", 65, kTypeMerge), // skip to "f", aka keep
+ test::KeyStr("f", 30, kTypeMerge), // skip to "g+"
+ test::KeyStr("f", 25, kTypeValue), test::KeyStr("g", 90, kTypeValue),
+ test::KeyStr("h", 91, kTypeValue), // keep
+ test::KeyStr("i", 95, kTypeMerge), // skip to "z"
+ test::KeyStr("j", 99, kTypeValue)},
+ {"av50", "am45", "bv60", "bv40", "cv35", "dm70", "em71", "fm65", "fm30",
+ "fv25", "gv90", "hv91", "im95", "jv99"},
+ {}, {}, kMaxSequenceNumber, kMaxSequenceNumber, &merge_op, &filter);
+
+ // Compaction should output just "a", "e" and "h" keys.
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 50, kTypeValue), c_iter_->key().ToString());
+ ASSERT_EQ("av50", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("e", 71, kTypeMerge), c_iter_->key().ToString());
+ ASSERT_EQ("em71", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("h", 91, kTypeValue), c_iter_->key().ToString());
+ ASSERT_EQ("hv91", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_FALSE(c_iter_->Valid());
+
+ // Check that the compaction iterator did the correct sequence of calls on
+ // the underlying iterator.
+ using A = LoggingForwardVectorIterator::Action;
+ using T = A::Type;
+ std::vector<A> expected_actions = {
+ A(T::SEEK_TO_FIRST),
+ A(T::NEXT),
+ A(T::NEXT),
+ A(T::SEEK, test::KeyStr("d+", kMaxSequenceNumber, kValueTypeForSeek)),
+ A(T::NEXT),
+ A(T::NEXT),
+ A(T::SEEK, test::KeyStr("g+", kMaxSequenceNumber, kValueTypeForSeek)),
+ A(T::NEXT),
+ A(T::SEEK, test::KeyStr("z", kMaxSequenceNumber, kValueTypeForSeek))};
+ ASSERT_EQ(expected_actions, iter_->log);
+}
+
+TEST_P(CompactionIteratorTest, ShuttingDownInFilter) {
+ NoMergingMergeOp merge_op;
+ StallingFilter filter;
+ InitIterators(
+ {test::KeyStr("1", 1, kTypeValue), test::KeyStr("2", 2, kTypeValue),
+ test::KeyStr("3", 3, kTypeValue), test::KeyStr("4", 4, kTypeValue)},
+ {"v1", "v2", "v3", "v4"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
+ &merge_op, &filter);
+ // Don't leave tombstones (kTypeDeletion) for filtered keys.
+ compaction_proxy_->key_not_exists_beyond_output_level = true;
+
+ std::atomic<bool> seek_done{false};
+ ROCKSDB_NAMESPACE::port::Thread compaction_thread([&] {
+ c_iter_->SeekToFirst();
+ EXPECT_FALSE(c_iter_->Valid());
+ EXPECT_TRUE(c_iter_->status().IsShutdownInProgress());
+ seek_done.store(true);
+ });
+
+ // Let key 1 through.
+ filter.WaitForStall(1);
+
+ // Shutdown during compaction filter call for key 2.
+ filter.WaitForStall(2);
+ shutting_down_.store(true);
+ EXPECT_FALSE(seek_done.load());
+
+ // Unstall filter and wait for SeekToFirst() to return.
+ filter.stall_at.store(3);
+ compaction_thread.join();
+ assert(seek_done.load());
+
+ // Check that filter was never called again.
+ EXPECT_EQ(2, filter.last_seen.load());
+}
+
+// Same as ShuttingDownInFilter, but shutdown happens during filter call for
+// a merge operand, not for a value.
+TEST_P(CompactionIteratorTest, ShuttingDownInMerge) {
+ NoMergingMergeOp merge_op;
+ StallingFilter filter;
+ InitIterators(
+ {test::KeyStr("1", 1, kTypeValue), test::KeyStr("2", 2, kTypeMerge),
+ test::KeyStr("3", 3, kTypeMerge), test::KeyStr("4", 4, kTypeValue)},
+ {"v1", "v2", "v3", "v4"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
+ &merge_op, &filter);
+ compaction_proxy_->key_not_exists_beyond_output_level = true;
+
+ std::atomic<bool> seek_done{false};
+ ROCKSDB_NAMESPACE::port::Thread compaction_thread([&] {
+ c_iter_->SeekToFirst();
+ ASSERT_FALSE(c_iter_->Valid());
+ ASSERT_TRUE(c_iter_->status().IsShutdownInProgress());
+ seek_done.store(true);
+ });
+
+ // Let key 1 through.
+ filter.WaitForStall(1);
+
+ // Shutdown during compaction filter call for key 2.
+ filter.WaitForStall(2);
+ shutting_down_.store(true);
+ EXPECT_FALSE(seek_done.load());
+
+ // Unstall filter and wait for SeekToFirst() to return.
+ filter.stall_at.store(3);
+ compaction_thread.join();
+ assert(seek_done.load());
+
+ // Check that filter was never called again.
+ EXPECT_EQ(2, filter.last_seen.load());
+}
+
+TEST_P(CompactionIteratorTest, SingleMergeOperand) {
+ class Filter : public CompactionFilter {
+ Decision FilterV2(int /*level*/, const Slice& key, ValueType t,
+ const Slice& existing_value, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ std::string k = key.ToString();
+ std::string v = existing_value.ToString();
+
+ // See InitIterators() call below for the sequence of keys and their
+ // filtering decisions. Here we closely assert that compaction filter is
+ // called with the expected keys and only them, and with the right values.
+ if (k == "a") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ EXPECT_EQ("av1", v);
+ return Decision::kKeep;
+ } else if (k == "b") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ return Decision::kKeep;
+ } else if (k == "c") {
+ return Decision::kKeep;
+ }
+
+ ADD_FAILURE();
+ return Decision::kKeep;
+ }
+
+ const char* Name() const override {
+ return "CompactionIteratorTest.SingleMergeOperand::Filter";
+ }
+ };
+
+ class SingleMergeOp : public MergeOperator {
+ public:
+ bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override {
+ // See InitIterators() call below for why "c" is the only key for which
+ // FullMergeV2 should be called.
+ EXPECT_EQ("c", merge_in.key.ToString());
+
+ std::string temp_value;
+ if (merge_in.existing_value != nullptr) {
+ temp_value = merge_in.existing_value->ToString();
+ }
+
+ for (auto& operand : merge_in.operand_list) {
+ temp_value.append(operand.ToString());
+ }
+ merge_out->new_value = temp_value;
+
+ return true;
+ }
+
+ bool PartialMergeMulti(const Slice& key,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value,
+ Logger* /*logger*/) const override {
+ std::string string_key = key.ToString();
+ EXPECT_TRUE(string_key == "a" || string_key == "b");
+
+ if (string_key == "a") {
+ EXPECT_EQ(1, operand_list.size());
+ } else if (string_key == "b") {
+ EXPECT_EQ(2, operand_list.size());
+ }
+
+ std::string temp_value;
+ for (auto& operand : operand_list) {
+ temp_value.append(operand.ToString());
+ }
+ swap(temp_value, *new_value);
+
+ return true;
+ }
+
+ const char* Name() const override {
+ return "CompactionIteratorTest SingleMergeOp";
+ }
+
+ bool AllowSingleOperand() const override { return true; }
+ };
+
+ SingleMergeOp merge_op;
+ Filter filter;
+ InitIterators(
+ // a should invoke PartialMergeMulti with a single merge operand.
+ {test::KeyStr("a", 50, kTypeMerge),
+ // b should invoke PartialMergeMulti with two operands.
+ test::KeyStr("b", 70, kTypeMerge), test::KeyStr("b", 60, kTypeMerge),
+ // c should invoke FullMerge due to kTypeValue at the beginning.
+ test::KeyStr("c", 90, kTypeMerge), test::KeyStr("c", 80, kTypeValue)},
+ {"av1", "bv2", "bv1", "cv2", "cv1"}, {}, {}, kMaxSequenceNumber,
+ kMaxSequenceNumber, &merge_op, &filter);
+
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), c_iter_->key().ToString());
+ ASSERT_EQ("av1", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ("bv1bv2", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_EQ("cv1cv2", c_iter_->value().ToString());
+}
+
+// In bottommost level, values earlier than earliest snapshot can be output
+// with sequence = 0.
+TEST_P(CompactionIteratorTest, ZeroOutSequenceAtBottomLevel) {
+ AddSnapshot(1);
+ RunTest({test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 2, kTypeValue)},
+ {"v1", "v2"},
+ {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue)},
+ {"v1", "v2"}, kMaxSequenceNumber /*last_commited_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+// In bottommost level, deletions earlier than earliest snapshot can be removed
+// permanently.
+TEST_P(CompactionIteratorTest, RemoveDeletionAtBottomLevel) {
+ AddSnapshot(1);
+ RunTest({test::KeyStr("a", 1, kTypeDeletion),
+ test::KeyStr("b", 3, kTypeDeletion),
+ test::KeyStr("b", 1, kTypeValue)},
+ {"", "", ""},
+ {test::KeyStr("b", 3, kTypeDeletion),
+ test::KeyStr("b", 0, kTypeValue)},
+ {"", ""},
+ kMaxSequenceNumber /*last_commited_seq*/, nullptr /*merge_operator*/,
+ nullptr /*compaction_filter*/, true /*bottommost_level*/);
+}
+
+// In bottommost level, single deletions earlier than earliest snapshot can be
+// removed permanently.
+TEST_P(CompactionIteratorTest, RemoveSingleDeletionAtBottomLevel) {
+ AddSnapshot(1);
+ RunTest({test::KeyStr("a", 1, kTypeSingleDeletion),
+ test::KeyStr("b", 2, kTypeSingleDeletion)},
+ {"", ""}, {test::KeyStr("b", 2, kTypeSingleDeletion)}, {""},
+ kMaxSequenceNumber /*last_commited_seq*/, nullptr /*merge_operator*/,
+ nullptr /*compaction_filter*/, true /*bottommost_level*/);
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionIteratorTestInstance, CompactionIteratorTest,
+ testing::Values(true, false));
+
+// Tests how CompactionIterator work together with SnapshotChecker.
+class CompactionIteratorWithSnapshotCheckerTest
+ : public CompactionIteratorTest {
+ public:
+ bool UseSnapshotChecker() const override { return true; }
+};
+
+// Uncommitted keys (keys with seq > last_committed_seq) should be output as-is
+// while committed versions of these keys should get compacted as usual.
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_Value) {
+ RunTest(
+ {test::KeyStr("foo", 3, kTypeValue), test::KeyStr("foo", 2, kTypeValue),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v3", "v2", "v1"},
+ {test::KeyStr("foo", 3, kTypeValue), test::KeyStr("foo", 2, kTypeValue)},
+ {"v3", "v2"}, 2 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_Deletion) {
+ RunTest({test::KeyStr("foo", 2, kTypeDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("foo", 2, kTypeDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"", "v1"}, 1 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_Merge) {
+ auto merge_op = MergeOperators::CreateStringAppendOperator();
+ RunTest(
+ {test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeMerge),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v3", "v2", "v1"},
+ {test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeValue)},
+ {"v3", "v1,v2"}, 2 /*last_committed_seq*/, merge_op.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_SingleDelete) {
+ RunTest({test::KeyStr("foo", 2, kTypeSingleDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("foo", 2, kTypeSingleDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"", "v1"}, 1 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_BlobIndex) {
+ RunTest({test::KeyStr("foo", 3, kTypeBlobIndex),
+ test::KeyStr("foo", 2, kTypeBlobIndex),
+ test::KeyStr("foo", 1, kTypeBlobIndex)},
+ {"v3", "v2", "v1"},
+ {test::KeyStr("foo", 3, kTypeBlobIndex),
+ test::KeyStr("foo", 2, kTypeBlobIndex)},
+ {"v3", "v2"}, 2 /*last_committed_seq*/);
+}
+
+// Test that the compaction iterator dedups keys visible to the same snapshot.
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Value) {
+ AddSnapshot(2, 1);
+ RunTest(
+ {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 3, kTypeValue),
+ test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "v3", "v2", "v1"},
+ {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 3, kTypeValue),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "v3", "v1"}, 3 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Deletion) {
+ AddSnapshot(2, 1);
+ RunTest(
+ {test::KeyStr("foo", 4, kTypeValue),
+ test::KeyStr("foo", 3, kTypeDeletion),
+ test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "", "v2", "v1"},
+ {test::KeyStr("foo", 4, kTypeValue),
+ test::KeyStr("foo", 3, kTypeDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "", "v1"}, 3 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Merge) {
+ AddSnapshot(2, 1);
+ AddSnapshot(4, 3);
+ auto merge_op = MergeOperators::CreateStringAppendOperator();
+ RunTest(
+ {test::KeyStr("foo", 5, kTypeMerge), test::KeyStr("foo", 4, kTypeMerge),
+ test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeMerge),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v5", "v4", "v3", "v2", "v1"},
+ {test::KeyStr("foo", 5, kTypeMerge), test::KeyStr("foo", 4, kTypeMerge),
+ test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 1, kTypeValue)},
+ {"v5", "v4", "v2,v3", "v1"}, 4 /*last_committed_seq*/, merge_op.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ DedupSameSnapshot_SingleDeletion) {
+ AddSnapshot(2, 1);
+ RunTest(
+ {test::KeyStr("foo", 4, kTypeValue),
+ test::KeyStr("foo", 3, kTypeSingleDeletion),
+ test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "", "v2", "v1"},
+ {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "v1"}, 3 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_BlobIndex) {
+ AddSnapshot(2, 1);
+ RunTest({test::KeyStr("foo", 4, kTypeBlobIndex),
+ test::KeyStr("foo", 3, kTypeBlobIndex),
+ test::KeyStr("foo", 2, kTypeBlobIndex),
+ test::KeyStr("foo", 1, kTypeBlobIndex)},
+ {"v4", "v3", "v2", "v1"},
+ {test::KeyStr("foo", 4, kTypeBlobIndex),
+ test::KeyStr("foo", 3, kTypeBlobIndex),
+ test::KeyStr("foo", 1, kTypeBlobIndex)},
+ {"v4", "v3", "v1"}, 3 /*last_committed_seq*/);
+}
+
+// At the bottom level, sequence numbers can be zeroed out, and deletions can
+// be removed, but only when they are visible to the earliest snapshot.
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ NotZeroOutSequenceIfNotVisibleToEarliestSnapshot) {
+ AddSnapshot(2, 1);
+ RunTest({test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 2, kTypeValue),
+ test::KeyStr("c", 3, kTypeValue)},
+ {"v1", "v2", "v3"},
+ {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue),
+ test::KeyStr("c", 3, kTypeValue)},
+ {"v1", "v2", "v3"}, kMaxSequenceNumber /*last_commited_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ NotRemoveDeletionIfNotVisibleToEarliestSnapshot) {
+ AddSnapshot(2, 1);
+ RunTest(
+ {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 2, kTypeDeletion),
+ test::KeyStr("c", 3, kTypeDeletion)},
+ {"", "", ""},
+ {},
+ {"", ""}, kMaxSequenceNumber /*last_commited_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ NotRemoveDeletionIfValuePresentToEarlierSnapshot) {
+ AddSnapshot(2, 1);
+ RunTest(
+ {test::KeyStr("a", 4, kTypeDeletion), test::KeyStr("a", 1, kTypeValue),
+ test::KeyStr("b", 3, kTypeValue)},
+ {"", "", ""},
+ {test::KeyStr("a", 4, kTypeDeletion), test::KeyStr("a", 0, kTypeValue),
+ test::KeyStr("b", 3, kTypeValue)},
+ {"", "", ""}, kMaxSequenceNumber /*last_commited_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ NotRemoveSingleDeletionIfNotVisibleToEarliestSnapshot) {
+ AddSnapshot(2, 1);
+ RunTest({test::KeyStr("a", 1, kTypeSingleDeletion),
+ test::KeyStr("b", 2, kTypeSingleDeletion),
+ test::KeyStr("c", 3, kTypeSingleDeletion)},
+ {"", "", ""},
+ {test::KeyStr("b", 2, kTypeSingleDeletion),
+ test::KeyStr("c", 3, kTypeSingleDeletion)},
+ {"", ""}, kMaxSequenceNumber /*last_commited_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+// A single delete should not cancel out values that are not visible to the
+// same set of snapshots.
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ SingleDeleteAcrossSnapshotBoundary) {
+ AddSnapshot(2, 1);
+ RunTest({test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", "v1"}, 2 /*last_committed_seq*/);
+}
+
+// A single delete should be kept in case it is not visible to the
+// earliest write conflict snapshot. If a single delete is kept for this
+// reason, the corresponding value can be trimmed to save space.
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ KeepSingleDeletionForWriteConflictChecking) {
+ AddSnapshot(2, 0);
+ RunTest({test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", ""}, 2 /*last_committed_seq*/, nullptr /*merge_operator*/,
+ nullptr /*compaction_filter*/, false /*bottommost_level*/,
+ 2 /*earliest_write_conflict_snapshot*/);
+}
+
+// The compaction filter should keep uncommitted keys as-is, and
+// * convert the latest value to a deletion, and/or
+// * if the latest value is a merge, apply the filter to all subsequent merges.
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Value) {
+ std::unique_ptr<CompactionFilter> compaction_filter(
+ new FilterAllKeysCompactionFilter());
+ RunTest(
+ {test::KeyStr("a", 2, kTypeValue), test::KeyStr("a", 1, kTypeValue),
+ test::KeyStr("b", 3, kTypeValue), test::KeyStr("c", 1, kTypeValue)},
+ {"v2", "v1", "v3", "v4"},
+ {test::KeyStr("a", 2, kTypeValue), test::KeyStr("a", 1, kTypeDeletion),
+ test::KeyStr("b", 3, kTypeValue), test::KeyStr("c", 1, kTypeDeletion)},
+ {"v2", "", "v3", ""}, 1 /*last_committed_seq*/,
+ nullptr /*merge_operator*/, compaction_filter.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Deletion) {
+ std::unique_ptr<CompactionFilter> compaction_filter(
+ new FilterAllKeysCompactionFilter());
+ RunTest(
+ {test::KeyStr("a", 2, kTypeDeletion), test::KeyStr("a", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("a", 2, kTypeDeletion),
+ test::KeyStr("a", 1, kTypeDeletion)},
+ {"", ""}, 1 /*last_committed_seq*/, nullptr /*merge_operator*/,
+ compaction_filter.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ CompactionFilter_PartialMerge) {
+ std::shared_ptr<MergeOperator> merge_op =
+ MergeOperators::CreateStringAppendOperator();
+ std::unique_ptr<CompactionFilter> compaction_filter(
+ new FilterAllKeysCompactionFilter());
+ RunTest({test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 2, kTypeMerge),
+ test::KeyStr("a", 1, kTypeMerge)},
+ {"v3", "v2", "v1"}, {test::KeyStr("a", 3, kTypeMerge)}, {"v3"},
+ 2 /*last_committed_seq*/, merge_op.get(), compaction_filter.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_FullMerge) {
+ std::shared_ptr<MergeOperator> merge_op =
+ MergeOperators::CreateStringAppendOperator();
+ std::unique_ptr<CompactionFilter> compaction_filter(
+ new FilterAllKeysCompactionFilter());
+ RunTest(
+ {test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 2, kTypeMerge),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"v3", "v2", "v1"},
+ {test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 1, kTypeDeletion)},
+ {"v3", ""}, 2 /*last_committed_seq*/, merge_op.get(),
+ compaction_filter.get());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compaction/compaction_job.cc b/src/rocksdb/db/compaction/compaction_job.cc
new file mode 100644
index 000000000..576ec7b45
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job.cc
@@ -0,0 +1,1700 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include <cinttypes>
+#include <functional>
+#include <list>
+#include <memory>
+#include <random>
+#include <set>
+#include <thread>
+#include <utility>
+#include <vector>
+
+#include "db/builder.h"
+#include "db/compaction/compaction_job.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/memtable_list.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/range_del_aggregator.h"
+#include "db/version_set.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "file/writable_file_writer.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/merging_iterator.h"
+#include "table/table_builder.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const char* GetCompactionReasonString(CompactionReason compaction_reason) {
+ switch (compaction_reason) {
+ case CompactionReason::kUnknown:
+ return "Unknown";
+ case CompactionReason::kLevelL0FilesNum:
+ return "LevelL0FilesNum";
+ case CompactionReason::kLevelMaxLevelSize:
+ return "LevelMaxLevelSize";
+ case CompactionReason::kUniversalSizeAmplification:
+ return "UniversalSizeAmplification";
+ case CompactionReason::kUniversalSizeRatio:
+ return "UniversalSizeRatio";
+ case CompactionReason::kUniversalSortedRunNum:
+ return "UniversalSortedRunNum";
+ case CompactionReason::kFIFOMaxSize:
+ return "FIFOMaxSize";
+ case CompactionReason::kFIFOReduceNumFiles:
+ return "FIFOReduceNumFiles";
+ case CompactionReason::kFIFOTtl:
+ return "FIFOTtl";
+ case CompactionReason::kManualCompaction:
+ return "ManualCompaction";
+ case CompactionReason::kFilesMarkedForCompaction:
+ return "FilesMarkedForCompaction";
+ case CompactionReason::kBottommostFiles:
+ return "BottommostFiles";
+ case CompactionReason::kTtl:
+ return "Ttl";
+ case CompactionReason::kFlush:
+ return "Flush";
+ case CompactionReason::kExternalSstIngestion:
+ return "ExternalSstIngestion";
+ case CompactionReason::kPeriodicCompaction:
+ return "PeriodicCompaction";
+ case CompactionReason::kNumOfReasons:
+ // fall through
+ default:
+ assert(false);
+ return "Invalid";
+ }
+}
+
+// Maintains state for each sub-compaction
+struct CompactionJob::SubcompactionState {
+ const Compaction* compaction;
+ std::unique_ptr<CompactionIterator> c_iter;
+
+ // The boundaries of the key-range this compaction is interested in. No two
+ // subcompactions may have overlapping key-ranges.
+ // 'start' is inclusive, 'end' is exclusive, and nullptr means unbounded
+ Slice *start, *end;
+
+ // The return status of this subcompaction
+ Status status;
+
+ // Files produced by this subcompaction
+ struct Output {
+ FileMetaData meta;
+ bool finished;
+ std::shared_ptr<const TableProperties> table_properties;
+ };
+
+ // State kept for output being generated
+ std::vector<Output> outputs;
+ std::unique_ptr<WritableFileWriter> outfile;
+ std::unique_ptr<TableBuilder> builder;
+ Output* current_output() {
+ if (outputs.empty()) {
+      // This subcompaction's output could be empty if the compaction was
+      // aborted before this subcompaction had a chance to generate any output
+      // files. When subcompactions are executed sequentially this is more
+      // likely, and the later subcompactions are particularly likely to be
+      // empty. Once they are run in parallel, however, it should be much
+      // rarer.
+ return nullptr;
+ } else {
+ return &outputs.back();
+ }
+ }
+
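+  // Size (in bytes) of the output file currently being built, as reported by
+  // the table builder.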
+ uint64_t current_output_file_size;
+
+ // State during the subcompaction
+ uint64_t total_bytes;
+ uint64_t num_output_records;
+ CompactionJobStats compaction_job_stats;
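+  // Approximate size of the data covered by this subcompaction's key range
+  // (0 when no estimate is available).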
+ uint64_t approx_size;
+ // An index that used to speed up ShouldStopBefore().
+ size_t grandparent_index = 0;
+ // The number of bytes overlapping between the current output and
+ // grandparent files used in ShouldStopBefore().
+ uint64_t overlapped_bytes = 0;
+  // A flag that records whether a key has been seen in ShouldStopBefore()
+ bool seen_key = false;
+
+ SubcompactionState(Compaction* c, Slice* _start, Slice* _end,
+ uint64_t size = 0)
+ : compaction(c),
+ start(_start),
+ end(_end),
+ outfile(nullptr),
+ builder(nullptr),
+ current_output_file_size(0),
+ total_bytes(0),
+ num_output_records(0),
+ approx_size(size),
+ grandparent_index(0),
+ overlapped_bytes(0),
+ seen_key(false) {
+ assert(compaction != nullptr);
+ }
+
+ SubcompactionState(SubcompactionState&& o) { *this = std::move(o); }
+
+ SubcompactionState& operator=(SubcompactionState&& o) {
+ compaction = std::move(o.compaction);
+ start = std::move(o.start);
+ end = std::move(o.end);
+ status = std::move(o.status);
+ outputs = std::move(o.outputs);
+ outfile = std::move(o.outfile);
+ builder = std::move(o.builder);
+ current_output_file_size = std::move(o.current_output_file_size);
+ total_bytes = std::move(o.total_bytes);
+ num_output_records = std::move(o.num_output_records);
+ compaction_job_stats = std::move(o.compaction_job_stats);
+ approx_size = std::move(o.approx_size);
+ grandparent_index = std::move(o.grandparent_index);
+ overlapped_bytes = std::move(o.overlapped_bytes);
+ seen_key = std::move(o.seen_key);
+ return *this;
+ }
+
+ // Because member std::unique_ptrs do not have these.
+ SubcompactionState(const SubcompactionState&) = delete;
+
+ SubcompactionState& operator=(const SubcompactionState&) = delete;
+
+ // Returns true iff we should stop building the current output
+ // before processing "internal_key".
+ bool ShouldStopBefore(const Slice& internal_key, uint64_t curr_file_size) {
+ const InternalKeyComparator* icmp =
+ &compaction->column_family_data()->internal_comparator();
+ const std::vector<FileMetaData*>& grandparents = compaction->grandparents();
+
+ // Scan to find earliest grandparent file that contains key.
+ while (grandparent_index < grandparents.size() &&
+ icmp->Compare(internal_key,
+ grandparents[grandparent_index]->largest.Encode()) >
+ 0) {
+ if (seen_key) {
+ overlapped_bytes += grandparents[grandparent_index]->fd.GetFileSize();
+ }
+ assert(grandparent_index + 1 >= grandparents.size() ||
+ icmp->Compare(
+ grandparents[grandparent_index]->largest.Encode(),
+ grandparents[grandparent_index + 1]->smallest.Encode()) <= 0);
+ grandparent_index++;
+ }
+ seen_key = true;
+
+ if (overlapped_bytes + curr_file_size >
+ compaction->max_compaction_bytes()) {
+ // Too much overlap for current output; start new output
+ overlapped_bytes = 0;
+ return true;
+ }
+
+ return false;
+ }
+};
+
+// Maintains state for the entire compaction
+struct CompactionJob::CompactionState {
+ Compaction* const compaction;
+
+ // REQUIRED: subcompaction states are stored in order of increasing
+ // key-range
+ std::vector<CompactionJob::SubcompactionState> sub_compact_states;
+ Status status;
+
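+  // Aggregated over all subcompactions by AggregateStatistics().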
+ uint64_t total_bytes;
+ uint64_t num_output_records;
+
+ explicit CompactionState(Compaction* c)
+ : compaction(c),
+ total_bytes(0),
+ num_output_records(0) {}
+
+ size_t NumOutputFiles() {
+ size_t total = 0;
+ for (auto& s : sub_compact_states) {
+ total += s.outputs.size();
+ }
+ return total;
+ }
+
+ Slice SmallestUserKey() {
+ for (const auto& sub_compact_state : sub_compact_states) {
+ if (!sub_compact_state.outputs.empty() &&
+ sub_compact_state.outputs[0].finished) {
+ return sub_compact_state.outputs[0].meta.smallest.user_key();
+ }
+ }
+ // If there is no finished output, return an empty slice.
+ return Slice(nullptr, 0);
+ }
+
+ Slice LargestUserKey() {
+ for (auto it = sub_compact_states.rbegin(); it < sub_compact_states.rend();
+ ++it) {
+ if (!it->outputs.empty() && it->current_output()->finished) {
+ assert(it->current_output() != nullptr);
+ return it->current_output()->meta.largest.user_key();
+ }
+ }
+ // If there is no finished output, return an empty slice.
+ return Slice(nullptr, 0);
+ }
+};
+
+void CompactionJob::AggregateStatistics() {
+ for (SubcompactionState& sc : compact_->sub_compact_states) {
+ compact_->total_bytes += sc.total_bytes;
+ compact_->num_output_records += sc.num_output_records;
+ }
+ if (compaction_job_stats_) {
+ for (SubcompactionState& sc : compact_->sub_compact_states) {
+ compaction_job_stats_->Add(sc.compaction_job_stats);
+ }
+ }
+}
+
+CompactionJob::CompactionJob(
+ int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
+ const FileOptions& file_options, VersionSet* versions,
+ const std::atomic<bool>* shutting_down,
+ const SequenceNumber preserve_deletes_seqnum, LogBuffer* log_buffer,
+ Directory* db_directory, Directory* output_directory, Statistics* stats,
+ InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
+ std::vector<SequenceNumber> existing_snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ const SnapshotChecker* snapshot_checker, std::shared_ptr<Cache> table_cache,
+ EventLogger* event_logger, bool paranoid_file_checks, bool measure_io_stats,
+ const std::string& dbname, CompactionJobStats* compaction_job_stats,
+ Env::Priority thread_pri, const std::atomic<bool>* manual_compaction_paused)
+ : job_id_(job_id),
+ compact_(new CompactionState(compaction)),
+ compaction_job_stats_(compaction_job_stats),
+ compaction_stats_(compaction->compaction_reason(), 1),
+ dbname_(dbname),
+ db_options_(db_options),
+ file_options_(file_options),
+ env_(db_options.env),
+ fs_(db_options.fs.get()),
+ file_options_for_read_(
+ fs_->OptimizeForCompactionTableRead(file_options, db_options_)),
+ versions_(versions),
+ shutting_down_(shutting_down),
+ manual_compaction_paused_(manual_compaction_paused),
+ preserve_deletes_seqnum_(preserve_deletes_seqnum),
+ log_buffer_(log_buffer),
+ db_directory_(db_directory),
+ output_directory_(output_directory),
+ stats_(stats),
+ db_mutex_(db_mutex),
+ db_error_handler_(db_error_handler),
+ existing_snapshots_(std::move(existing_snapshots)),
+ earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
+ snapshot_checker_(snapshot_checker),
+ table_cache_(std::move(table_cache)),
+ event_logger_(event_logger),
+ bottommost_level_(false),
+ paranoid_file_checks_(paranoid_file_checks),
+ measure_io_stats_(measure_io_stats),
+ write_hint_(Env::WLTH_NOT_SET),
+ thread_pri_(thread_pri) {
+ assert(log_buffer_ != nullptr);
+ const auto* cfd = compact_->compaction->column_family_data();
+ ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env,
+ db_options_.enable_thread_tracking);
+ ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+ ReportStartedCompaction(compaction);
+}
+
+CompactionJob::~CompactionJob() {
+ assert(compact_ == nullptr);
+ ThreadStatusUtil::ResetThreadStatus();
+}
+
+void CompactionJob::ReportStartedCompaction(Compaction* compaction) {
+ const auto* cfd = compact_->compaction->column_family_data();
+ ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env,
+ db_options_.enable_thread_tracking);
+
+ ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID,
+ job_id_);
+
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_INPUT_OUTPUT_LEVEL,
+ (static_cast<uint64_t>(compact_->compaction->start_level()) << 32) +
+ compact_->compaction->output_level());
+
+ // In the current design, a CompactionJob is always created
+ // for non-trivial compaction.
+ assert(compaction->IsTrivialMove() == false ||
+ compaction->is_manual_compaction() == true);
+
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_PROP_FLAGS,
+ compaction->is_manual_compaction() +
+ (compaction->deletion_compaction() << 1));
+
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES,
+ compaction->CalculateTotalInputSize());
+
+ IOSTATS_RESET(bytes_written);
+ IOSTATS_RESET(bytes_read);
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_BYTES_WRITTEN, 0);
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_BYTES_READ, 0);
+
+ // Set the thread operation after operation properties
+ // to ensure GetThreadList() can always show them all together.
+ ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+
+ if (compaction_job_stats_) {
+ compaction_job_stats_->is_manual_compaction =
+ compaction->is_manual_compaction();
+ }
+}
+
+void CompactionJob::Prepare() {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_PREPARE);
+
+  // Generate file_levels_ for the compaction before making the iterator
+ auto* c = compact_->compaction;
+ assert(c->column_family_data() != nullptr);
+ assert(c->column_family_data()->current()->storage_info()->NumLevelFiles(
+ compact_->compaction->level()) > 0);
+
+ write_hint_ =
+ c->column_family_data()->CalculateSSTWriteHint(c->output_level());
+ bottommost_level_ = c->bottommost_level();
+
+ if (c->ShouldFormSubcompactions()) {
+ {
+ StopWatch sw(env_, stats_, SUBCOMPACTION_SETUP_TIME);
+ GenSubcompactionBoundaries();
+ }
+ assert(sizes_.size() == boundaries_.size() + 1);
+
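+    // boundaries_ defines boundaries_.size() + 1 contiguous key ranges, one
+    // per subcompaction.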
+ for (size_t i = 0; i <= boundaries_.size(); i++) {
+ Slice* start = i == 0 ? nullptr : &boundaries_[i - 1];
+ Slice* end = i == boundaries_.size() ? nullptr : &boundaries_[i];
+ compact_->sub_compact_states.emplace_back(c, start, end, sizes_[i]);
+ }
+ RecordInHistogram(stats_, NUM_SUBCOMPACTIONS_SCHEDULED,
+ compact_->sub_compact_states.size());
+ } else {
+ compact_->sub_compact_states.emplace_back(c, nullptr, nullptr);
+ }
+}
+
+struct RangeWithSize {
+ Range range;
+ uint64_t size;
+
+ RangeWithSize(const Slice& a, const Slice& b, uint64_t s = 0)
+ : range(a, b), size(s) {}
+};
+
+void CompactionJob::GenSubcompactionBoundaries() {
+ auto* c = compact_->compaction;
+ auto* cfd = c->column_family_data();
+ const Comparator* cfd_comparator = cfd->user_comparator();
+ std::vector<Slice> bounds;
+ int start_lvl = c->start_level();
+ int out_lvl = c->output_level();
+
+ // Add the starting and/or ending key of certain input files as a potential
+ // boundary
+ for (size_t lvl_idx = 0; lvl_idx < c->num_input_levels(); lvl_idx++) {
+ int lvl = c->level(lvl_idx);
+ if (lvl >= start_lvl && lvl <= out_lvl) {
+ const LevelFilesBrief* flevel = c->input_levels(lvl_idx);
+ size_t num_files = flevel->num_files;
+
+ if (num_files == 0) {
+ continue;
+ }
+
+ if (lvl == 0) {
+ // For level 0 add the starting and ending key of each file since the
+ // files may have greatly differing key ranges (not range-partitioned)
+ for (size_t i = 0; i < num_files; i++) {
+ bounds.emplace_back(flevel->files[i].smallest_key);
+ bounds.emplace_back(flevel->files[i].largest_key);
+ }
+ } else {
+ // For all other levels add the smallest/largest key in the level to
+ // encompass the range covered by that level
+ bounds.emplace_back(flevel->files[0].smallest_key);
+ bounds.emplace_back(flevel->files[num_files - 1].largest_key);
+ if (lvl == out_lvl) {
+ // For the last level include the starting keys of all files since
+ // the last level is the largest and probably has the widest key
+ // range. Since it's range partitioned, the ending key of one file
+ // and the starting key of the next are very close (or identical).
+ for (size_t i = 1; i < num_files; i++) {
+ bounds.emplace_back(flevel->files[i].smallest_key);
+ }
+ }
+ }
+ }
+ }
+
+ std::sort(bounds.begin(), bounds.end(),
+ [cfd_comparator](const Slice& a, const Slice& b) -> bool {
+ return cfd_comparator->Compare(ExtractUserKey(a),
+ ExtractUserKey(b)) < 0;
+ });
+ // Remove duplicated entries from bounds
+ bounds.erase(
+ std::unique(bounds.begin(), bounds.end(),
+ [cfd_comparator](const Slice& a, const Slice& b) -> bool {
+ return cfd_comparator->Compare(ExtractUserKey(a),
+ ExtractUserKey(b)) == 0;
+ }),
+ bounds.end());
+
+ // Combine consecutive pairs of boundaries into ranges with an approximate
+ // size of data covered by keys in that range
+ uint64_t sum = 0;
+ std::vector<RangeWithSize> ranges;
+ // Get input version from CompactionState since it's already referenced
+  // earlier in Compaction::SetInputVersion and will not change
+ // when db_mutex_ is released below
+ auto* v = compact_->compaction->input_version();
+ for (auto it = bounds.begin();;) {
+ const Slice a = *it;
+ ++it;
+
+ if (it == bounds.end()) {
+ break;
+ }
+
+ const Slice b = *it;
+
+ // ApproximateSize could potentially create table reader iterator to seek
+ // to the index block and may incur I/O cost in the process. Unlock db
+ // mutex to reduce contention
+ db_mutex_->Unlock();
+ uint64_t size = versions_->ApproximateSize(SizeApproximationOptions(), v, a,
+ b, start_lvl, out_lvl + 1,
+ TableReaderCaller::kCompaction);
+ db_mutex_->Lock();
+ ranges.emplace_back(a, b, size);
+ sum += size;
+ }
+
+ // Group the ranges into subcompactions
+ const double min_file_fill_percent = 4.0 / 5;
+ int base_level = v->storage_info()->base_level();
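+  // Cap the number of subcompactions so that, on average, each one covers at
+  // least min_file_fill_percent of a full output file worth of data.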
+ uint64_t max_output_files = static_cast<uint64_t>(std::ceil(
+ sum / min_file_fill_percent /
+ MaxFileSizeForLevel(*(c->mutable_cf_options()), out_lvl,
+ c->immutable_cf_options()->compaction_style, base_level,
+ c->immutable_cf_options()->level_compaction_dynamic_level_bytes)));
+ uint64_t subcompactions =
+ std::min({static_cast<uint64_t>(ranges.size()),
+ static_cast<uint64_t>(c->max_subcompactions()),
+ max_output_files});
+
+ if (subcompactions > 1) {
+ double mean = sum * 1.0 / subcompactions;
+ // Greedily add ranges to the subcompaction until the sum of the ranges'
+ // sizes becomes >= the expected mean size of a subcompaction
+ sum = 0;
+ for (size_t i = 0; i < ranges.size() - 1; i++) {
+ sum += ranges[i].size;
+ if (subcompactions == 1) {
+ // If there's only one left to schedule then it goes to the end so no
+ // need to put an end boundary
+ continue;
+ }
+ if (sum >= mean) {
+ boundaries_.emplace_back(ExtractUserKey(ranges[i].range.limit));
+ sizes_.emplace_back(sum);
+ subcompactions--;
+ sum = 0;
+ }
+ }
+ sizes_.emplace_back(sum + ranges.back().size);
+ } else {
+ // Only one range so its size is the total sum of sizes computed above
+ sizes_.emplace_back(sum);
+ }
+}
+
+Status CompactionJob::Run() {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_RUN);
+ TEST_SYNC_POINT("CompactionJob::Run():Start");
+ log_buffer_->FlushBufferToLog();
+ LogCompaction();
+
+ const size_t num_threads = compact_->sub_compact_states.size();
+ assert(num_threads > 0);
+ const uint64_t start_micros = env_->NowMicros();
+
+ // Launch a thread for each of subcompactions 1...num_threads-1
+ std::vector<port::Thread> thread_pool;
+ thread_pool.reserve(num_threads - 1);
+ for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) {
+ thread_pool.emplace_back(&CompactionJob::ProcessKeyValueCompaction, this,
+ &compact_->sub_compact_states[i]);
+ }
+
+ // Always schedule the first subcompaction (whether or not there are also
+ // others) in the current thread to be efficient with resources
+ ProcessKeyValueCompaction(&compact_->sub_compact_states[0]);
+
+ // Wait for all other threads (if there are any) to finish execution
+ for (auto& thread : thread_pool) {
+ thread.join();
+ }
+
+ compaction_stats_.micros = env_->NowMicros() - start_micros;
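+  // Wall-clock time is measured once for the whole job; CPU time is summed
+  // over the per-subcompaction stats below.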
+ compaction_stats_.cpu_micros = 0;
+ for (size_t i = 0; i < compact_->sub_compact_states.size(); i++) {
+ compaction_stats_.cpu_micros +=
+ compact_->sub_compact_states[i].compaction_job_stats.cpu_micros;
+ }
+
+ RecordTimeToHistogram(stats_, COMPACTION_TIME, compaction_stats_.micros);
+ RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME,
+ compaction_stats_.cpu_micros);
+
+ TEST_SYNC_POINT("CompactionJob::Run:BeforeVerify");
+
+ // Check if any thread encountered an error during execution
+ Status status;
+ for (const auto& state : compact_->sub_compact_states) {
+ if (!state.status.ok()) {
+ status = state.status;
+ break;
+ }
+ }
+
+ if (status.ok() && output_directory_) {
+ status = output_directory_->Fsync();
+ }
+
+ if (status.ok()) {
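+    // Verify that each newly generated output table is readable, spreading
+    // the verification work across multiple threads.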
+ thread_pool.clear();
+ std::vector<const FileMetaData*> files_meta;
+ for (const auto& state : compact_->sub_compact_states) {
+ for (const auto& output : state.outputs) {
+ files_meta.emplace_back(&output.meta);
+ }
+ }
+ ColumnFamilyData* cfd = compact_->compaction->column_family_data();
+ auto prefix_extractor =
+ compact_->compaction->mutable_cf_options()->prefix_extractor.get();
+ std::atomic<size_t> next_file_meta_idx(0);
+ auto verify_table = [&](Status& output_status) {
+ while (true) {
+ size_t file_idx = next_file_meta_idx.fetch_add(1);
+ if (file_idx >= files_meta.size()) {
+ break;
+ }
+ // Verify that the table is usable
+        // We set for_compaction to false and don't call
+        // OptimizeForCompactionTableRead here because this is a special case
+        // after we finish building the table. No matter whether
+        // use_direct_io_for_flush_and_compaction is true, we regard this
+        // verification as user reads since the goal is to cache the table
+        // here for further user reads.
+ InternalIterator* iter = cfd->table_cache()->NewIterator(
+ ReadOptions(), file_options_, cfd->internal_comparator(),
+ *files_meta[file_idx], /*range_del_agg=*/nullptr, prefix_extractor,
+ /*table_reader_ptr=*/nullptr,
+ cfd->internal_stats()->GetFileReadHist(
+ compact_->compaction->output_level()),
+ TableReaderCaller::kCompactionRefill, /*arena=*/nullptr,
+ /*skip_filters=*/false, compact_->compaction->output_level(),
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr);
+ auto s = iter->status();
+
+ if (s.ok() && paranoid_file_checks_) {
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {}
+ s = iter->status();
+ }
+
+ delete iter;
+
+ if (!s.ok()) {
+ output_status = s;
+ break;
+ }
+ }
+ };
+ for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) {
+ thread_pool.emplace_back(verify_table,
+ std::ref(compact_->sub_compact_states[i].status));
+ }
+ verify_table(compact_->sub_compact_states[0].status);
+ for (auto& thread : thread_pool) {
+ thread.join();
+ }
+ for (const auto& state : compact_->sub_compact_states) {
+ if (!state.status.ok()) {
+ status = state.status;
+ break;
+ }
+ }
+ }
+
+ TablePropertiesCollection tp;
+ for (const auto& state : compact_->sub_compact_states) {
+ for (const auto& output : state.outputs) {
+ auto fn =
+ TableFileName(state.compaction->immutable_cf_options()->cf_paths,
+ output.meta.fd.GetNumber(), output.meta.fd.GetPathId());
+ tp[fn] = output.table_properties;
+ }
+ }
+ compact_->compaction->SetOutputTableProperties(std::move(tp));
+
+ // Finish up all book-keeping to unify the subcompaction results
+ AggregateStatistics();
+ UpdateCompactionStats();
+ RecordCompactionIOStats();
+ LogFlush(db_options_.info_log);
+ TEST_SYNC_POINT("CompactionJob::Run():End");
+
+ compact_->status = status;
+ return status;
+}
+
+Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_INSTALL);
+ db_mutex_->AssertHeld();
+ Status status = compact_->status;
+ ColumnFamilyData* cfd = compact_->compaction->column_family_data();
+ cfd->internal_stats()->AddCompactionStats(
+ compact_->compaction->output_level(), thread_pri_, compaction_stats_);
+
+ if (status.ok()) {
+ status = InstallCompactionResults(mutable_cf_options);
+ }
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ auto vstorage = cfd->current()->storage_info();
+ const auto& stats = compaction_stats_;
+
+ double read_write_amp = 0.0;
+ double write_amp = 0.0;
+ double bytes_read_per_sec = 0;
+ double bytes_written_per_sec = 0;
+
+ if (stats.bytes_read_non_output_levels > 0) {
+ read_write_amp = (stats.bytes_written + stats.bytes_read_output_level +
+ stats.bytes_read_non_output_levels) /
+ static_cast<double>(stats.bytes_read_non_output_levels);
+ write_amp = stats.bytes_written /
+ static_cast<double>(stats.bytes_read_non_output_levels);
+ }
+ if (stats.micros > 0) {
+ bytes_read_per_sec =
+ (stats.bytes_read_non_output_levels + stats.bytes_read_output_level) /
+ static_cast<double>(stats.micros);
+ bytes_written_per_sec =
+ stats.bytes_written / static_cast<double>(stats.micros);
+ }
+
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, "
+ "files in(%d, %d) out(%d) "
+ "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) "
+ "write-amplify(%.1f) %s, records in: %" PRIu64
+ ", records dropped: %" PRIu64 " output_compression: %s\n",
+ cfd->GetName().c_str(), vstorage->LevelSummary(&tmp), bytes_read_per_sec,
+ bytes_written_per_sec, compact_->compaction->output_level(),
+ stats.num_input_files_in_non_output_levels,
+ stats.num_input_files_in_output_level, stats.num_output_files,
+ stats.bytes_read_non_output_levels / 1048576.0,
+ stats.bytes_read_output_level / 1048576.0,
+ stats.bytes_written / 1048576.0, read_write_amp, write_amp,
+ status.ToString().c_str(), stats.num_input_records,
+ stats.num_dropped_records,
+ CompressionTypeToString(compact_->compaction->output_compression())
+ .c_str());
+
+ UpdateCompactionJobStats(stats);
+
+ auto stream = event_logger_->LogToBuffer(log_buffer_);
+ stream << "job" << job_id_ << "event"
+ << "compaction_finished"
+ << "compaction_time_micros" << stats.micros
+ << "compaction_time_cpu_micros" << stats.cpu_micros << "output_level"
+ << compact_->compaction->output_level() << "num_output_files"
+ << compact_->NumOutputFiles() << "total_output_size"
+ << compact_->total_bytes << "num_input_records"
+ << stats.num_input_records << "num_output_records"
+ << compact_->num_output_records << "num_subcompactions"
+ << compact_->sub_compact_states.size() << "output_compression"
+ << CompressionTypeToString(compact_->compaction->output_compression());
+
+ if (compaction_job_stats_ != nullptr) {
+ stream << "num_single_delete_mismatches"
+ << compaction_job_stats_->num_single_del_mismatch;
+ stream << "num_single_delete_fallthrough"
+ << compaction_job_stats_->num_single_del_fallthru;
+ }
+
+ if (measure_io_stats_ && compaction_job_stats_ != nullptr) {
+ stream << "file_write_nanos" << compaction_job_stats_->file_write_nanos;
+ stream << "file_range_sync_nanos"
+ << compaction_job_stats_->file_range_sync_nanos;
+ stream << "file_fsync_nanos" << compaction_job_stats_->file_fsync_nanos;
+ stream << "file_prepare_write_nanos"
+ << compaction_job_stats_->file_prepare_write_nanos;
+ }
+
+ stream << "lsm_state";
+ stream.StartArray();
+ for (int level = 0; level < vstorage->num_levels(); ++level) {
+ stream << vstorage->NumLevelFiles(level);
+ }
+ stream.EndArray();
+
+ CleanupCompaction();
+ return status;
+}
+
+void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
+ assert(sub_compact != nullptr);
+
+ uint64_t prev_cpu_micros = env_->NowCPUNanos() / 1000;
+
+ ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+
+  // Create the compaction filter and fail the compaction if
+  // IgnoreSnapshots() returns false, because that is not supported anymore.
+ const CompactionFilter* compaction_filter =
+ cfd->ioptions()->compaction_filter;
+ std::unique_ptr<CompactionFilter> compaction_filter_from_factory = nullptr;
+ if (compaction_filter == nullptr) {
+ compaction_filter_from_factory =
+ sub_compact->compaction->CreateCompactionFilter();
+ compaction_filter = compaction_filter_from_factory.get();
+ }
+ if (compaction_filter != nullptr && !compaction_filter->IgnoreSnapshots()) {
+ sub_compact->status = Status::NotSupported(
+ "CompactionFilter::IgnoreSnapshots() = false is not supported "
+ "anymore.");
+ return;
+ }
+
+ CompactionRangeDelAggregator range_del_agg(&cfd->internal_comparator(),
+ existing_snapshots_);
+
+ // Although the v2 aggregator is what the level iterator(s) know about,
+ // the AddTombstones calls will be propagated down to the v1 aggregator.
+ std::unique_ptr<InternalIterator> input(versions_->MakeInputIterator(
+ sub_compact->compaction, &range_del_agg, file_options_for_read_));
+
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_PROCESS_KV);
+
+ // I/O measurement variables
+ PerfLevel prev_perf_level = PerfLevel::kEnableTime;
+ const uint64_t kRecordStatsEvery = 1000;
+ uint64_t prev_write_nanos = 0;
+ uint64_t prev_fsync_nanos = 0;
+ uint64_t prev_range_sync_nanos = 0;
+ uint64_t prev_prepare_write_nanos = 0;
+ uint64_t prev_cpu_write_nanos = 0;
+ uint64_t prev_cpu_read_nanos = 0;
+ if (measure_io_stats_) {
+ prev_perf_level = GetPerfLevel();
+ SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
+ prev_write_nanos = IOSTATS(write_nanos);
+ prev_fsync_nanos = IOSTATS(fsync_nanos);
+ prev_range_sync_nanos = IOSTATS(range_sync_nanos);
+ prev_prepare_write_nanos = IOSTATS(prepare_write_nanos);
+ prev_cpu_write_nanos = IOSTATS(cpu_write_nanos);
+ prev_cpu_read_nanos = IOSTATS(cpu_read_nanos);
+ }
+
+ MergeHelper merge(
+ env_, cfd->user_comparator(), cfd->ioptions()->merge_operator,
+ compaction_filter, db_options_.info_log.get(),
+ false /* internal key corruption is expected */,
+ existing_snapshots_.empty() ? 0 : existing_snapshots_.back(),
+ snapshot_checker_, compact_->compaction->level(),
+ db_options_.statistics.get());
+
+ TEST_SYNC_POINT("CompactionJob::Run():Inprogress");
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionJob::Run():PausingManualCompaction:1",
+ reinterpret_cast<void*>(
+ const_cast<std::atomic<bool>*>(manual_compaction_paused_)));
+
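+  // Position the raw input iterator at the start of this subcompaction's key
+  // range, or at the first key if the range is unbounded on the left.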
+ Slice* start = sub_compact->start;
+ Slice* end = sub_compact->end;
+ if (start != nullptr) {
+ IterKey start_iter;
+ start_iter.SetInternalKey(*start, kMaxSequenceNumber, kValueTypeForSeek);
+ input->Seek(start_iter.GetInternalKey());
+ } else {
+ input->SeekToFirst();
+ }
+
+ Status status;
+ sub_compact->c_iter.reset(new CompactionIterator(
+ input.get(), cfd->user_comparator(), &merge, versions_->LastSequence(),
+ &existing_snapshots_, earliest_write_conflict_snapshot_,
+ snapshot_checker_, env_, ShouldReportDetailedTime(env_, stats_), false,
+ &range_del_agg, sub_compact->compaction, compaction_filter,
+ shutting_down_, preserve_deletes_seqnum_, manual_compaction_paused_,
+ db_options_.info_log));
+ auto c_iter = sub_compact->c_iter.get();
+ c_iter->SeekToFirst();
+ if (c_iter->Valid() && sub_compact->compaction->output_level() != 0) {
+ // ShouldStopBefore() maintains state based on keys processed so far. The
+ // compaction loop always calls it on the "next" key, thus won't tell it the
+ // first key. So we do that here.
+ sub_compact->ShouldStopBefore(c_iter->key(),
+ sub_compact->current_output_file_size);
+ }
+ const auto& c_iter_stats = c_iter->iter_stats();
+
+ while (status.ok() && !cfd->IsDropped() && c_iter->Valid()) {
+ // Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid()
+ // returns true.
+ const Slice& key = c_iter->key();
+ const Slice& value = c_iter->value();
+
+    // If an end key (exclusive) is specified, check whether the current key
+    // is >= it and exit if so, because the iterator is then out of its range.
+ if (end != nullptr &&
+ cfd->user_comparator()->Compare(c_iter->user_key(), *end) >= 0) {
+ break;
+ }
+ if (c_iter_stats.num_input_records % kRecordStatsEvery ==
+ kRecordStatsEvery - 1) {
+ RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats);
+ c_iter->ResetRecordCounts();
+ RecordCompactionIOStats();
+ }
+
+ // Open output file if necessary
+ if (sub_compact->builder == nullptr) {
+ status = OpenCompactionOutputFile(sub_compact);
+ if (!status.ok()) {
+ break;
+ }
+ }
+ assert(sub_compact->builder != nullptr);
+ assert(sub_compact->current_output() != nullptr);
+ sub_compact->builder->Add(key, value);
+ sub_compact->current_output_file_size = sub_compact->builder->FileSize();
+ const ParsedInternalKey& ikey = c_iter->ikey();
+ sub_compact->current_output()->meta.UpdateBoundaries(
+ key, value, ikey.sequence, ikey.type);
+ sub_compact->num_output_records++;
+
+    // Close the output file if it is big enough. Two possibilities determine
+    // that it is time to close it: (1) the current key should be this file's
+    // last key, or (2) the next key should not be in this file.
+ //
+ // TODO(aekmekji): determine if file should be closed earlier than this
+ // during subcompactions (i.e. if output size, estimated by input size, is
+ // going to be 1.2MB and max_output_file_size = 1MB, prefer to have 0.6MB
+ // and 0.6MB instead of 1MB and 0.2MB)
+ bool output_file_ended = false;
+ Status input_status;
+ if (sub_compact->compaction->output_level() != 0 &&
+ sub_compact->current_output_file_size >=
+ sub_compact->compaction->max_output_file_size()) {
+ // (1) this key terminates the file. For historical reasons, the iterator
+ // status before advancing will be given to FinishCompactionOutputFile().
+ input_status = input->status();
+ output_file_ended = true;
+ }
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionJob::Run():PausingManualCompaction:2",
+ reinterpret_cast<void*>(
+ const_cast<std::atomic<bool>*>(manual_compaction_paused_)));
+ c_iter->Next();
+ if (c_iter->status().IsManualCompactionPaused()) {
+ break;
+ }
+ if (!output_file_ended && c_iter->Valid() &&
+ sub_compact->compaction->output_level() != 0 &&
+ sub_compact->ShouldStopBefore(c_iter->key(),
+ sub_compact->current_output_file_size) &&
+ sub_compact->builder != nullptr) {
+ // (2) this key belongs to the next file. For historical reasons, the
+ // iterator status after advancing will be given to
+ // FinishCompactionOutputFile().
+ input_status = input->status();
+ output_file_ended = true;
+ }
+ if (output_file_ended) {
+ const Slice* next_key = nullptr;
+ if (c_iter->Valid()) {
+ next_key = &c_iter->key();
+ }
+ CompactionIterationStats range_del_out_stats;
+ status =
+ FinishCompactionOutputFile(input_status, sub_compact, &range_del_agg,
+ &range_del_out_stats, next_key);
+ RecordDroppedKeys(range_del_out_stats,
+ &sub_compact->compaction_job_stats);
+ }
+ }
+
+ sub_compact->compaction_job_stats.num_input_deletion_records =
+ c_iter_stats.num_input_deletion_records;
+ sub_compact->compaction_job_stats.num_corrupt_keys =
+ c_iter_stats.num_input_corrupt_records;
+ sub_compact->compaction_job_stats.num_single_del_fallthru =
+ c_iter_stats.num_single_del_fallthru;
+ sub_compact->compaction_job_stats.num_single_del_mismatch =
+ c_iter_stats.num_single_del_mismatch;
+ sub_compact->compaction_job_stats.total_input_raw_key_bytes +=
+ c_iter_stats.total_input_raw_key_bytes;
+ sub_compact->compaction_job_stats.total_input_raw_value_bytes +=
+ c_iter_stats.total_input_raw_value_bytes;
+
+ RecordTick(stats_, FILTER_OPERATION_TOTAL_TIME,
+ c_iter_stats.total_filter_time);
+ RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats);
+ RecordCompactionIOStats();
+
+ if (status.ok() && cfd->IsDropped()) {
+ status =
+ Status::ColumnFamilyDropped("Column family dropped during compaction");
+ }
+ if ((status.ok() || status.IsColumnFamilyDropped()) &&
+ shutting_down_->load(std::memory_order_relaxed)) {
+ status = Status::ShutdownInProgress("Database shutdown");
+ }
+ if ((status.ok() || status.IsColumnFamilyDropped()) &&
+ (manual_compaction_paused_ &&
+ manual_compaction_paused_->load(std::memory_order_relaxed))) {
+ status = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+ if (status.ok()) {
+ status = input->status();
+ }
+ if (status.ok()) {
+ status = c_iter->status();
+ }
+
+ if (status.ok() && sub_compact->builder == nullptr &&
+ sub_compact->outputs.size() == 0 && !range_del_agg.IsEmpty()) {
+ // handle subcompaction containing only range deletions
+ status = OpenCompactionOutputFile(sub_compact);
+ }
+
+ // Call FinishCompactionOutputFile() even if status is not ok: it needs to
+ // close the output file.
+ if (sub_compact->builder != nullptr) {
+ CompactionIterationStats range_del_out_stats;
+ Status s = FinishCompactionOutputFile(status, sub_compact, &range_del_agg,
+ &range_del_out_stats);
+ if (status.ok()) {
+ status = s;
+ }
+ RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats);
+ }
+
+ sub_compact->compaction_job_stats.cpu_micros =
+ env_->NowCPUNanos() / 1000 - prev_cpu_micros;
+
+ if (measure_io_stats_) {
+ sub_compact->compaction_job_stats.file_write_nanos +=
+ IOSTATS(write_nanos) - prev_write_nanos;
+ sub_compact->compaction_job_stats.file_fsync_nanos +=
+ IOSTATS(fsync_nanos) - prev_fsync_nanos;
+ sub_compact->compaction_job_stats.file_range_sync_nanos +=
+ IOSTATS(range_sync_nanos) - prev_range_sync_nanos;
+ sub_compact->compaction_job_stats.file_prepare_write_nanos +=
+ IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos;
+ sub_compact->compaction_job_stats.cpu_micros -=
+ (IOSTATS(cpu_write_nanos) - prev_cpu_write_nanos +
+ IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos) /
+ 1000;
+ if (prev_perf_level != PerfLevel::kEnableTimeAndCPUTimeExceptForMutex) {
+ SetPerfLevel(prev_perf_level);
+ }
+ }
+
+ sub_compact->c_iter.reset();
+ input.reset();
+ sub_compact->status = status;
+}
+
+void CompactionJob::RecordDroppedKeys(
+ const CompactionIterationStats& c_iter_stats,
+ CompactionJobStats* compaction_job_stats) {
+ if (c_iter_stats.num_record_drop_user > 0) {
+ RecordTick(stats_, COMPACTION_KEY_DROP_USER,
+ c_iter_stats.num_record_drop_user);
+ }
+ if (c_iter_stats.num_record_drop_hidden > 0) {
+ RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY,
+ c_iter_stats.num_record_drop_hidden);
+ if (compaction_job_stats) {
+ compaction_job_stats->num_records_replaced +=
+ c_iter_stats.num_record_drop_hidden;
+ }
+ }
+ if (c_iter_stats.num_record_drop_obsolete > 0) {
+ RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE,
+ c_iter_stats.num_record_drop_obsolete);
+ if (compaction_job_stats) {
+ compaction_job_stats->num_expired_deletion_records +=
+ c_iter_stats.num_record_drop_obsolete;
+ }
+ }
+ if (c_iter_stats.num_record_drop_range_del > 0) {
+ RecordTick(stats_, COMPACTION_KEY_DROP_RANGE_DEL,
+ c_iter_stats.num_record_drop_range_del);
+ }
+ if (c_iter_stats.num_range_del_drop_obsolete > 0) {
+ RecordTick(stats_, COMPACTION_RANGE_DEL_DROP_OBSOLETE,
+ c_iter_stats.num_range_del_drop_obsolete);
+ }
+ if (c_iter_stats.num_optimized_del_drop_obsolete > 0) {
+ RecordTick(stats_, COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE,
+ c_iter_stats.num_optimized_del_drop_obsolete);
+ }
+}
+
+Status CompactionJob::FinishCompactionOutputFile(
+ const Status& input_status, SubcompactionState* sub_compact,
+ CompactionRangeDelAggregator* range_del_agg,
+ CompactionIterationStats* range_del_out_stats,
+ const Slice* next_table_min_key /* = nullptr */) {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_SYNC_FILE);
+ assert(sub_compact != nullptr);
+ assert(sub_compact->outfile);
+ assert(sub_compact->builder != nullptr);
+ assert(sub_compact->current_output() != nullptr);
+
+ uint64_t output_number = sub_compact->current_output()->meta.fd.GetNumber();
+ assert(output_number != 0);
+
+ ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+ const Comparator* ucmp = cfd->user_comparator();
+
+ // Check for iterator errors
+ Status s = input_status;
+ auto meta = &sub_compact->current_output()->meta;
+ assert(meta != nullptr);
+ if (s.ok()) {
+ Slice lower_bound_guard, upper_bound_guard;
+ std::string smallest_user_key;
+ const Slice *lower_bound, *upper_bound;
+ bool lower_bound_from_sub_compact = false;
+ if (sub_compact->outputs.size() == 1) {
+ // For the first output table, include range tombstones before the min key
+ // but after the subcompaction boundary.
+ lower_bound = sub_compact->start;
+ lower_bound_from_sub_compact = true;
+ } else if (meta->smallest.size() > 0) {
+ // For subsequent output tables, only include range tombstones from min
+ // key onwards since the previous file was extended to contain range
+ // tombstones falling before min key.
+ smallest_user_key = meta->smallest.user_key().ToString(false /*hex*/);
+ lower_bound_guard = Slice(smallest_user_key);
+ lower_bound = &lower_bound_guard;
+ } else {
+ lower_bound = nullptr;
+ }
+ if (next_table_min_key != nullptr) {
+      // This may be the last file in the subcompaction in some cases, so we
+      // need to compare the end key of the subcompaction with the next file's
+      // start key. When the end key is chosen by the subcompaction, we know
+      // that it must be the biggest key in the output file. Therefore, it is
+      // safe to use the smaller of the two keys as the upper bound of the
+      // output file, to ensure that there is no overlap between different
+      // output files.
+ upper_bound_guard = ExtractUserKey(*next_table_min_key);
+ if (sub_compact->end != nullptr &&
+ ucmp->Compare(upper_bound_guard, *sub_compact->end) >= 0) {
+ upper_bound = sub_compact->end;
+ } else {
+ upper_bound = &upper_bound_guard;
+ }
+ } else {
+ // This is the last file in the subcompaction, so extend until the
+ // subcompaction ends.
+ upper_bound = sub_compact->end;
+ }
+ auto earliest_snapshot = kMaxSequenceNumber;
+ if (existing_snapshots_.size() > 0) {
+ earliest_snapshot = existing_snapshots_[0];
+ }
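+    // True iff the largest key added to this file so far shares its user key
+    // with the chosen upper bound.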
+ bool has_overlapping_endpoints;
+ if (upper_bound != nullptr && meta->largest.size() > 0) {
+ has_overlapping_endpoints =
+ ucmp->Compare(meta->largest.user_key(), *upper_bound) == 0;
+ } else {
+ has_overlapping_endpoints = false;
+ }
+
+    // The end key of the subcompaction must be greater than or equal to the
+    // upper bound. If the end of the subcompaction is null or the upper bound
+    // is null, this file is the last file in the compaction, so there will be
+    // no overlap between this file and the others.
+ assert(sub_compact->end == nullptr ||
+ upper_bound == nullptr ||
+ ucmp->Compare(*upper_bound , *sub_compact->end) <= 0);
+ auto it = range_del_agg->NewIterator(lower_bound, upper_bound,
+ has_overlapping_endpoints);
+ // Position the range tombstone output iterator. There may be tombstone
+ // fragments that are entirely out of range, so make sure that we do not
+ // include those.
+ if (lower_bound != nullptr) {
+ it->Seek(*lower_bound);
+ } else {
+ it->SeekToFirst();
+ }
+ for (; it->Valid(); it->Next()) {
+ auto tombstone = it->Tombstone();
+ if (upper_bound != nullptr) {
+ int cmp = ucmp->Compare(*upper_bound, tombstone.start_key_);
+ if ((has_overlapping_endpoints && cmp < 0) ||
+ (!has_overlapping_endpoints && cmp <= 0)) {
+ // Tombstones starting after upper_bound only need to be included in
+ // the next table. If the current SST ends before upper_bound, i.e.,
+ // `has_overlapping_endpoints == false`, we can also skip over range
+ // tombstones that start exactly at upper_bound. Such range tombstones
+ // will be included in the next file and are not relevant to the point
+ // keys or endpoints of the current file.
+ break;
+ }
+ }
+
+ if (bottommost_level_ && tombstone.seq_ <= earliest_snapshot) {
+ // TODO(andrewkr): tombstones that span multiple output files are
+ // counted for each compaction output file, so lots of double counting.
+ range_del_out_stats->num_range_del_drop_obsolete++;
+ range_del_out_stats->num_record_drop_obsolete++;
+ continue;
+ }
+
+ auto kv = tombstone.Serialize();
+ assert(lower_bound == nullptr ||
+ ucmp->Compare(*lower_bound, kv.second) < 0);
+ sub_compact->builder->Add(kv.first.Encode(), kv.second);
+ InternalKey smallest_candidate = std::move(kv.first);
+ if (lower_bound != nullptr &&
+ ucmp->Compare(smallest_candidate.user_key(), *lower_bound) <= 0) {
+ // Pretend the smallest key has the same user key as lower_bound
+ // (the max key in the previous table or subcompaction) in order for
+ // files to appear key-space partitioned.
+ //
+ // When lower_bound is chosen by a subcompaction, we know that
+ // subcompactions over smaller keys cannot contain any keys at
+ // lower_bound. We also know that smaller subcompactions exist, because
+          // otherwise the subcompaction would be unbounded on the left. As a
+ // result, we know that no other files on the output level will contain
+ // actual keys at lower_bound (an output file may have a largest key of
+ // lower_bound@kMaxSequenceNumber, but this only indicates a large range
+ // tombstone was truncated). Therefore, it is safe to use the
+ // tombstone's sequence number, to ensure that keys at lower_bound at
+ // lower levels are covered by truncated tombstones.
+ //
+ // If lower_bound was chosen by the smallest data key in the file,
+ // choose lowest seqnum so this file's smallest internal key comes after
+ // the previous file's largest. The fake seqnum is OK because the read
+ // path's file-picking code only considers user key.
+ smallest_candidate = InternalKey(
+ *lower_bound, lower_bound_from_sub_compact ? tombstone.seq_ : 0,
+ kTypeRangeDeletion);
+ }
+ InternalKey largest_candidate = tombstone.SerializeEndKey();
+ if (upper_bound != nullptr &&
+ ucmp->Compare(*upper_bound, largest_candidate.user_key()) <= 0) {
+ // Pretend the largest key has the same user key as upper_bound (the
+ // min key in the following table or subcompaction) in order for files
+ // to appear key-space partitioned.
+ //
+ // Choose highest seqnum so this file's largest internal key comes
+ // before the next file's/subcompaction's smallest. The fake seqnum is
+ // OK because the read path's file-picking code only considers the user
+ // key portion.
+ //
+ // Note Seek() also creates InternalKey with (user_key,
+ // kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of
+ // kTypeRangeDeletion (0xF), so the range tombstone comes before the
+ // Seek() key in InternalKey's ordering. So Seek() will look in the
+ // next file for the user key.
+ largest_candidate =
+ InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion);
+ }
+#ifndef NDEBUG
+ SequenceNumber smallest_ikey_seqnum = kMaxSequenceNumber;
+ if (meta->smallest.size() > 0) {
+ smallest_ikey_seqnum = GetInternalKeySeqno(meta->smallest.Encode());
+ }
+#endif
+ meta->UpdateBoundariesForRange(smallest_candidate, largest_candidate,
+ tombstone.seq_,
+ cfd->internal_comparator());
+
+ // The smallest key in a file is used for range tombstone truncation, so
+ // it cannot have a seqnum of 0 (unless the smallest data key in a file
+ // has a seqnum of 0). Otherwise, the truncated tombstone may expose
+ // deleted keys at lower levels.
+ assert(smallest_ikey_seqnum == 0 ||
+ ExtractInternalKeyFooter(meta->smallest.Encode()) !=
+ PackSequenceAndType(0, kTypeRangeDeletion));
+ }
+ meta->marked_for_compaction = sub_compact->builder->NeedCompact();
+ }
+ const uint64_t current_entries = sub_compact->builder->NumEntries();
+ if (s.ok()) {
+ s = sub_compact->builder->Finish();
+ } else {
+ sub_compact->builder->Abandon();
+ }
+ const uint64_t current_bytes = sub_compact->builder->FileSize();
+ if (s.ok()) {
+ // Add the checksum information to file metadata.
+ meta->file_checksum = sub_compact->builder->GetFileChecksum();
+ meta->file_checksum_func_name =
+ sub_compact->builder->GetFileChecksumFuncName();
+
+ meta->fd.file_size = current_bytes;
+ }
+ sub_compact->current_output()->finished = true;
+ sub_compact->total_bytes += current_bytes;
+
+ // Finish and check for file errors
+ if (s.ok()) {
+ StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS);
+ s = sub_compact->outfile->Sync(db_options_.use_fsync);
+ }
+ if (s.ok()) {
+ s = sub_compact->outfile->Close();
+ }
+ sub_compact->outfile.reset();
+
+ TableProperties tp;
+ if (s.ok()) {
+ tp = sub_compact->builder->GetTableProperties();
+ }
+
+ if (s.ok() && current_entries == 0 && tp.num_range_deletions == 0) {
+    // If there is nothing to output, there is no need to generate an SST
+    // file. This happens when the output level is the bottom level and, at
+    // the same time, the sub_compact produced no output.
+ std::string fname =
+ TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths,
+ meta->fd.GetNumber(), meta->fd.GetPathId());
+ env_->DeleteFile(fname);
+
+ // Also need to remove the file from outputs, or it will be added to the
+ // VersionEdit.
+ assert(!sub_compact->outputs.empty());
+ sub_compact->outputs.pop_back();
+ meta = nullptr;
+ }
+
+ if (s.ok() && (current_entries > 0 || tp.num_range_deletions > 0)) {
+ // Output to event logger and fire events.
+ sub_compact->current_output()->table_properties =
+ std::make_shared<TableProperties>(tp);
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "[%s] [JOB %d] Generated table #%" PRIu64 ": %" PRIu64
+ " keys, %" PRIu64 " bytes%s",
+ cfd->GetName().c_str(), job_id_, output_number,
+ current_entries, current_bytes,
+ meta->marked_for_compaction ? " (need compaction)" : "");
+ }
+ std::string fname;
+ FileDescriptor output_fd;
+ uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
+ if (meta != nullptr) {
+ fname =
+ TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths,
+ meta->fd.GetNumber(), meta->fd.GetPathId());
+ output_fd = meta->fd;
+ oldest_blob_file_number = meta->oldest_blob_file_number;
+ } else {
+ fname = "(nil)";
+ }
+ EventHelpers::LogAndNotifyTableFileCreationFinished(
+ event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname,
+ job_id_, output_fd, oldest_blob_file_number, tp,
+ TableFileCreationReason::kCompaction, s);
+
+#ifndef ROCKSDB_LITE
+ // Report new file to SstFileManagerImpl
+ auto sfm =
+ static_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
+ if (sfm && meta != nullptr && meta->fd.GetPathId() == 0) {
+ sfm->OnAddFile(fname);
+ if (sfm->IsMaxAllowedSpaceReached()) {
+ // TODO(ajkr): should we return OK() if max space was reached by the final
+ // compaction output file (similarly to how flush works when full)?
+ s = Status::SpaceLimit("Max allowed space was reached");
+ TEST_SYNC_POINT(
+ "CompactionJob::FinishCompactionOutputFile:"
+ "MaxAllowedSpaceReached");
+ InstrumentedMutexLock l(db_mutex_);
+ db_error_handler_->SetBGError(s, BackgroundErrorReason::kCompaction);
+ }
+ }
+#endif
+
+ sub_compact->builder.reset();
+ sub_compact->current_output_file_size = 0;
+ return s;
+}
+
+Status CompactionJob::InstallCompactionResults(
+ const MutableCFOptions& mutable_cf_options) {
+ db_mutex_->AssertHeld();
+
+ auto* compaction = compact_->compaction;
+ // paranoia: verify that the files that we started with
+ // still exist in the current version and in the same original level.
+ // This ensures that a concurrent compaction did not erroneously
+ // pick the same files to compact_.
+ if (!versions_->VerifyCompactionFileConsistency(compaction)) {
+ Compaction::InputLevelSummaryBuffer inputs_summary;
+
+ ROCKS_LOG_ERROR(db_options_.info_log, "[%s] [JOB %d] Compaction %s aborted",
+ compaction->column_family_data()->GetName().c_str(),
+ job_id_, compaction->InputLevelSummary(&inputs_summary));
+ return Status::Corruption("Compaction input files inconsistent");
+ }
+
+ {
+ Compaction::InputLevelSummaryBuffer inputs_summary;
+ ROCKS_LOG_INFO(
+ db_options_.info_log, "[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes",
+ compaction->column_family_data()->GetName().c_str(), job_id_,
+ compaction->InputLevelSummary(&inputs_summary), compact_->total_bytes);
+ }
+
+ // Add compaction inputs
+ compaction->AddInputDeletions(compact_->compaction->edit());
+
+ for (const auto& sub_compact : compact_->sub_compact_states) {
+ for (const auto& out : sub_compact.outputs) {
+ compaction->edit()->AddFile(compaction->output_level(), out.meta);
+ }
+ }
+ return versions_->LogAndApply(compaction->column_family_data(),
+ mutable_cf_options, compaction->edit(),
+ db_mutex_, db_directory_);
+}
+
+void CompactionJob::RecordCompactionIOStats() {
+ RecordTick(stats_, COMPACT_READ_BYTES, IOSTATS(bytes_read));
+ ThreadStatusUtil::IncreaseThreadOperationProperty(
+ ThreadStatus::COMPACTION_BYTES_READ, IOSTATS(bytes_read));
+ IOSTATS_RESET(bytes_read);
+ RecordTick(stats_, COMPACT_WRITE_BYTES, IOSTATS(bytes_written));
+ ThreadStatusUtil::IncreaseThreadOperationProperty(
+ ThreadStatus::COMPACTION_BYTES_WRITTEN, IOSTATS(bytes_written));
+ IOSTATS_RESET(bytes_written);
+}
+
+Status CompactionJob::OpenCompactionOutputFile(
+ SubcompactionState* sub_compact) {
+ assert(sub_compact != nullptr);
+ assert(sub_compact->builder == nullptr);
+ // no need to lock because VersionSet::next_file_number_ is atomic
+ uint64_t file_number = versions_->NewFileNumber();
+ std::string fname =
+ TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths,
+ file_number, sub_compact->compaction->output_path_id());
+ // Fire events.
+ ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+#ifndef ROCKSDB_LITE
+ EventHelpers::NotifyTableFileCreationStarted(
+ cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname, job_id_,
+ TableFileCreationReason::kCompaction);
+#endif // !ROCKSDB_LITE
+ // Make the output file
+ std::unique_ptr<FSWritableFile> writable_file;
+#ifndef NDEBUG
+ bool syncpoint_arg = file_options_.use_direct_writes;
+ TEST_SYNC_POINT_CALLBACK("CompactionJob::OpenCompactionOutputFile",
+ &syncpoint_arg);
+#endif
+ Status s = NewWritableFile(fs_, fname, &writable_file, file_options_);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(
+ db_options_.info_log,
+ "[%s] [JOB %d] OpenCompactionOutputFiles for table #%" PRIu64
+ " fails at NewWritableFile with status %s",
+ sub_compact->compaction->column_family_data()->GetName().c_str(),
+ job_id_, file_number, s.ToString().c_str());
+ LogFlush(db_options_.info_log);
+ EventHelpers::LogAndNotifyTableFileCreationFinished(
+ event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(),
+ fname, job_id_, FileDescriptor(), kInvalidBlobFileNumber,
+ TableProperties(), TableFileCreationReason::kCompaction, s);
+ return s;
+ }
+
+  // Try to figure out the output file's oldest ancestor time.
+ int64_t temp_current_time = 0;
+ auto get_time_status = env_->GetCurrentTime(&temp_current_time);
+ // Safe to proceed even if GetCurrentTime fails. So, log and proceed.
+ if (!get_time_status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Failed to get current time. Status: %s",
+ get_time_status.ToString().c_str());
+ }
+ uint64_t current_time = static_cast<uint64_t>(temp_current_time);
+ uint64_t oldest_ancester_time =
+ sub_compact->compaction->MinInputFileOldestAncesterTime();
+ if (oldest_ancester_time == port::kMaxUint64) {
+ oldest_ancester_time = current_time;
+ }
+
+ // Initialize a SubcompactionState::Output and add it to sub_compact->outputs
+ {
+ SubcompactionState::Output out;
+ out.meta.fd = FileDescriptor(file_number,
+ sub_compact->compaction->output_path_id(), 0);
+ out.meta.oldest_ancester_time = oldest_ancester_time;
+ out.meta.file_creation_time = current_time;
+ out.finished = false;
+ sub_compact->outputs.push_back(out);
+ }
+
+ writable_file->SetIOPriority(Env::IOPriority::IO_LOW);
+ writable_file->SetWriteLifeTimeHint(write_hint_);
+ writable_file->SetPreallocationBlockSize(static_cast<size_t>(
+ sub_compact->compaction->OutputFilePreallocationSize()));
+ const auto& listeners =
+ sub_compact->compaction->immutable_cf_options()->listeners;
+ sub_compact->outfile.reset(
+ new WritableFileWriter(std::move(writable_file), fname, file_options_,
+ env_, db_options_.statistics.get(), listeners,
+ db_options_.sst_file_checksum_func.get()));
+
+  // If the column family is configured to only optimize filters for hits,
+  // we can skip creating filters if this is the bottommost level, where
+  // the data is going to be found.
+ bool skip_filters =
+ cfd->ioptions()->optimize_filters_for_hits && bottommost_level_;
+
+ sub_compact->builder.reset(NewTableBuilder(
+ *cfd->ioptions(), *(sub_compact->compaction->mutable_cf_options()),
+ cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(),
+ cfd->GetID(), cfd->GetName(), sub_compact->outfile.get(),
+ sub_compact->compaction->output_compression(),
+ 0 /*sample_for_compression */,
+ sub_compact->compaction->output_compression_opts(),
+ sub_compact->compaction->output_level(), skip_filters,
+ oldest_ancester_time, 0 /* oldest_key_time */,
+ sub_compact->compaction->max_output_file_size(), current_time));
+ LogFlush(db_options_.info_log);
+ return s;
+}
+
+void CompactionJob::CleanupCompaction() {
+ for (SubcompactionState& sub_compact : compact_->sub_compact_states) {
+ const auto& sub_status = sub_compact.status;
+
+ if (sub_compact.builder != nullptr) {
+ // May happen if we get a shutdown call in the middle of compaction
+ sub_compact.builder->Abandon();
+ sub_compact.builder.reset();
+ } else {
+ assert(!sub_status.ok() || sub_compact.outfile == nullptr);
+ }
+ for (const auto& out : sub_compact.outputs) {
+      // If this file was inserted into the table cache, then remove it
+      // here because this compaction was not committed.
+ if (!sub_status.ok()) {
+ TableCache::Evict(table_cache_.get(), out.meta.fd.GetNumber());
+ }
+ }
+ }
+ delete compact_;
+ compact_ = nullptr;
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) {
+ assert(prefix_length > 0);
+ size_t length = src.size() > prefix_length ? prefix_length : src.size();
+ dst->assign(src.data(), length);
+}
+} // namespace
+
+#endif // !ROCKSDB_LITE
+
+void CompactionJob::UpdateCompactionStats() {
+ Compaction* compaction = compact_->compaction;
+ compaction_stats_.num_input_files_in_non_output_levels = 0;
+ compaction_stats_.num_input_files_in_output_level = 0;
+ for (int input_level = 0;
+ input_level < static_cast<int>(compaction->num_input_levels());
+ ++input_level) {
+ if (compaction->level(input_level) != compaction->output_level()) {
+ UpdateCompactionInputStatsHelper(
+ &compaction_stats_.num_input_files_in_non_output_levels,
+ &compaction_stats_.bytes_read_non_output_levels, input_level);
+ } else {
+ UpdateCompactionInputStatsHelper(
+ &compaction_stats_.num_input_files_in_output_level,
+ &compaction_stats_.bytes_read_output_level, input_level);
+ }
+ }
+
+ uint64_t num_output_records = 0;
+
+ for (const auto& sub_compact : compact_->sub_compact_states) {
+ size_t num_output_files = sub_compact.outputs.size();
+ if (sub_compact.builder != nullptr) {
+ // An error occurred so ignore the last output.
+ assert(num_output_files > 0);
+ --num_output_files;
+ }
+ compaction_stats_.num_output_files += static_cast<int>(num_output_files);
+
+ num_output_records += sub_compact.num_output_records;
+
+ for (const auto& out : sub_compact.outputs) {
+ compaction_stats_.bytes_written += out.meta.fd.file_size;
+ }
+ }
+
+ if (compaction_stats_.num_input_records > num_output_records) {
+ compaction_stats_.num_dropped_records =
+ compaction_stats_.num_input_records - num_output_records;
+ }
+}
+
+void CompactionJob::UpdateCompactionInputStatsHelper(int* num_files,
+ uint64_t* bytes_read,
+ int input_level) {
+ const Compaction* compaction = compact_->compaction;
+ auto num_input_files = compaction->num_input_files(input_level);
+ *num_files += static_cast<int>(num_input_files);
+
+ for (size_t i = 0; i < num_input_files; ++i) {
+ const auto* file_meta = compaction->input(input_level, i);
+ *bytes_read += file_meta->fd.GetFileSize();
+ compaction_stats_.num_input_records +=
+ static_cast<uint64_t>(file_meta->num_entries);
+ }
+}
+
+void CompactionJob::UpdateCompactionJobStats(
+ const InternalStats::CompactionStats& stats) const {
+#ifndef ROCKSDB_LITE
+ if (compaction_job_stats_) {
+ compaction_job_stats_->elapsed_micros = stats.micros;
+
+ // input information
+ compaction_job_stats_->total_input_bytes =
+ stats.bytes_read_non_output_levels + stats.bytes_read_output_level;
+ compaction_job_stats_->num_input_records = stats.num_input_records;
+ compaction_job_stats_->num_input_files =
+ stats.num_input_files_in_non_output_levels +
+ stats.num_input_files_in_output_level;
+ compaction_job_stats_->num_input_files_at_output_level =
+ stats.num_input_files_in_output_level;
+
+ // output information
+ compaction_job_stats_->total_output_bytes = stats.bytes_written;
+ compaction_job_stats_->num_output_records = compact_->num_output_records;
+ compaction_job_stats_->num_output_files = stats.num_output_files;
+
+ if (compact_->NumOutputFiles() > 0U) {
+ CopyPrefix(compact_->SmallestUserKey(),
+ CompactionJobStats::kMaxPrefixLength,
+ &compaction_job_stats_->smallest_output_key_prefix);
+ CopyPrefix(compact_->LargestUserKey(),
+ CompactionJobStats::kMaxPrefixLength,
+ &compaction_job_stats_->largest_output_key_prefix);
+ }
+ }
+#else
+ (void)stats;
+#endif // !ROCKSDB_LITE
+}
+
+void CompactionJob::LogCompaction() {
+ Compaction* compaction = compact_->compaction;
+ ColumnFamilyData* cfd = compaction->column_family_data();
+
+ // Let's check if anything will get logged. Don't prepare all the info if
+ // we're not logging
+ if (db_options_.info_log_level <= InfoLogLevel::INFO_LEVEL) {
+ Compaction::InputLevelSummaryBuffer inputs_summary;
+ ROCKS_LOG_INFO(
+ db_options_.info_log, "[%s] [JOB %d] Compacting %s, score %.2f",
+ cfd->GetName().c_str(), job_id_,
+ compaction->InputLevelSummary(&inputs_summary), compaction->score());
+ char scratch[2345];
+ compaction->Summary(scratch, sizeof(scratch));
+ ROCKS_LOG_INFO(db_options_.info_log, "[%s] Compaction start summary: %s\n",
+ cfd->GetName().c_str(), scratch);
+ // build event logger report
+ auto stream = event_logger_->Log();
+ stream << "job" << job_id_ << "event"
+ << "compaction_started"
+ << "compaction_reason"
+ << GetCompactionReasonString(compaction->compaction_reason());
+ for (size_t i = 0; i < compaction->num_input_levels(); ++i) {
+ stream << ("files_L" + ToString(compaction->level(i)));
+ stream.StartArray();
+ for (auto f : *compaction->inputs(i)) {
+ stream << f->fd.GetNumber();
+ }
+ stream.EndArray();
+ }
+ stream << "score" << compaction->score() << "input_data_size"
+ << compaction->CalculateTotalInputSize();
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_job.h b/src/rocksdb/db/compaction/compaction_job.h
new file mode 100644
index 000000000..c15f502a1
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job.h
@@ -0,0 +1,198 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <functional>
+#include <limits>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/dbformat.h"
+#include "db/flush_scheduler.h"
+#include "db/internal_stats.h"
+#include "db/job_context.h"
+#include "db/log_writer.h"
+#include "db/memtable_list.h"
+#include "db/range_del_aggregator.h"
+#include "db/version_edit.h"
+#include "db/write_controller.h"
+#include "db/write_thread.h"
+#include "logging/event_logger.h"
+#include "options/cf_options.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/compaction_job_stats.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/transaction_log.h"
+#include "table/scoped_arena_iterator.h"
+#include "util/autovector.h"
+#include "util/stop_watch.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class ErrorHandler;
+class MemTable;
+class SnapshotChecker;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+
+// CompactionJob is responsible for executing the compaction. Each (manual or
+// automated) compaction corresponds to a CompactionJob object, and usually
+// goes through the stages of `Prepare()`->`Run()`->`Install()`. CompactionJob
+// will divide the compaction into subcompactions and execute them in parallel
+// if needed.
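+//
+// A rough usage sketch (illustrative only; see DBImpl for the actual call
+// sites and the per-method locking requirements documented below):
+//
+//   CompactionJob job(job_id, compaction, db_options, file_options, ...);
+//   job.Prepare();                            // db mutex held
+//   Status s = job.Run();                     // db mutex NOT held
+//   s = job.Install(mutable_cf_options);      // db mutex held again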
+class CompactionJob {
+ public:
+ CompactionJob(int job_id, Compaction* compaction,
+ const ImmutableDBOptions& db_options,
+ const FileOptions& file_options, VersionSet* versions,
+ const std::atomic<bool>* shutting_down,
+ const SequenceNumber preserve_deletes_seqnum,
+ LogBuffer* log_buffer, Directory* db_directory,
+ Directory* output_directory, Statistics* stats,
+ InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
+ std::vector<SequenceNumber> existing_snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ const SnapshotChecker* snapshot_checker,
+ std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
+ bool paranoid_file_checks, bool measure_io_stats,
+ const std::string& dbname,
+ CompactionJobStats* compaction_job_stats,
+ Env::Priority thread_pri,
+ const std::atomic<bool>* manual_compaction_paused = nullptr);
+
+ ~CompactionJob();
+
+ // no copy/move
+ CompactionJob(CompactionJob&& job) = delete;
+ CompactionJob(const CompactionJob& job) = delete;
+ CompactionJob& operator=(const CompactionJob& job) = delete;
+
+ // REQUIRED: mutex held
+ // Prepare for the compaction by setting up boundaries for each subcompaction
+ void Prepare();
+  // REQUIRED: mutex not held
+  // Launch threads for each subcompaction and wait for them to finish. After
+  // that, verify that the output tables are usable and finally do
+  // bookkeeping to unify subcompaction results.
+ Status Run();
+
+ // REQUIRED: mutex held
+ // Add compaction input/output to the current version
+ Status Install(const MutableCFOptions& mutable_cf_options);
+
+ private:
+ struct SubcompactionState;
+
+ void AggregateStatistics();
+
+ // Generates a histogram representing potential divisions of key ranges from
+ // the input. It adds the starting and/or ending keys of certain input files
+ // to the working set and then finds the approximate size of data in between
+ // each consecutive pair of slices. Then it divides these ranges into
+ // consecutive groups such that each group has a similar size.
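+  // Roughly: N boundary keys split the compaction's key range into N + 1
+  // contiguous subcompaction ranges of comparable estimated data size.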
+ void GenSubcompactionBoundaries();
+
+ // update the thread status for starting a compaction.
+ void ReportStartedCompaction(Compaction* compaction);
+ void AllocateCompactionOutputFileNumbers();
+ // Call compaction filter. Then iterate through input and compact the
+ // kv-pairs
+ void ProcessKeyValueCompaction(SubcompactionState* sub_compact);
+
+ Status FinishCompactionOutputFile(
+ const Status& input_status, SubcompactionState* sub_compact,
+ CompactionRangeDelAggregator* range_del_agg,
+ CompactionIterationStats* range_del_out_stats,
+ const Slice* next_table_min_key = nullptr);
+ Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options);
+ void RecordCompactionIOStats();
+ Status OpenCompactionOutputFile(SubcompactionState* sub_compact);
+ void CleanupCompaction();
+ void UpdateCompactionJobStats(
+ const InternalStats::CompactionStats& stats) const;
+ void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats,
+ CompactionJobStats* compaction_job_stats = nullptr);
+
+ void UpdateCompactionStats();
+ void UpdateCompactionInputStatsHelper(
+ int* num_files, uint64_t* bytes_read, int input_level);
+
+ void LogCompaction();
+
+ int job_id_;
+
+ // CompactionJob state
+ struct CompactionState;
+ CompactionState* compact_;
+ CompactionJobStats* compaction_job_stats_;
+ InternalStats::CompactionStats compaction_stats_;
+
+ // DBImpl state
+ const std::string& dbname_;
+ const ImmutableDBOptions& db_options_;
+ const FileOptions file_options_;
+
+ Env* env_;
+ FileSystem* fs_;
+  // FileOptions optimized for compaction table reads
+ FileOptions file_options_for_read_;
+ VersionSet* versions_;
+ const std::atomic<bool>* shutting_down_;
+ const std::atomic<bool>* manual_compaction_paused_;
+ const SequenceNumber preserve_deletes_seqnum_;
+ LogBuffer* log_buffer_;
+ Directory* db_directory_;
+ Directory* output_directory_;
+ Statistics* stats_;
+ InstrumentedMutex* db_mutex_;
+ ErrorHandler* db_error_handler_;
+  // If there were two snapshots with seq numbers s1 and
+  // s2, with s1 < s2, and if we find two instances of a key k1 that lie
+  // entirely within s1 and s2, then the earlier version of k1 can be safely
+  // deleted because that version is not visible in any snapshot.
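+  // For example, with snapshots at s1 = 5 and s2 = 10, a version of k1 at
+  // seq 6 that is shadowed by a newer version at seq 8 is not visible to
+  // either snapshot and can be dropped.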
+ std::vector<SequenceNumber> existing_snapshots_;
+
+ // This is the earliest snapshot that could be used for write-conflict
+ // checking by a transaction. For any user-key newer than this snapshot, we
+ // should make sure not to remove evidence that a write occurred.
+ SequenceNumber earliest_write_conflict_snapshot_;
+
+ const SnapshotChecker* const snapshot_checker_;
+
+ std::shared_ptr<Cache> table_cache_;
+
+ EventLogger* event_logger_;
+
+  // Is this compaction creating a file in the bottommost level?
+ bool bottommost_level_;
+ bool paranoid_file_checks_;
+ bool measure_io_stats_;
+ // Stores the Slices that designate the boundaries for each subcompaction
+ std::vector<Slice> boundaries_;
+ // Stores the approx size of keys covered in the range of each subcompaction
+ std::vector<uint64_t> sizes_;
+ Env::WriteLifeTimeHint write_hint_;
+ Env::Priority thread_pri_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_job_stats_test.cc b/src/rocksdb/db/compaction/compaction_job_stats_test.cc
new file mode 100644
index 000000000..51a665797
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job_stats_test.cc
@@ -0,0 +1,1043 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include <cinttypes>
+#include <iostream>
+#include <mutex>
+#include <queue>
+#include <set>
+#include <thread>
+#include <unordered_set>
+#include <utility>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "memtable/hash_linklist_rep.h"
+#include "monitoring/statistics.h"
+#include "monitoring/thread_status_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/thread_status.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/mock_table.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/scoped_arena_iterator.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/compression.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+#include "util/rate_limiter.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+#if !defined(IOS_CROSS_COMPILE)
+#ifndef ROCKSDB_LITE
+namespace ROCKSDB_NAMESPACE {
+
+static std::string RandomString(Random* rnd, int len, double ratio) {
+ std::string r;
+ test::CompressibleString(rnd, ratio, len, &r);
+ return r;
+}
+
+std::string Key(uint64_t key, int length) {
+ const int kBufSize = 1000;
+ char buf[kBufSize];
+ if (length > kBufSize) {
+ length = kBufSize;
+ }
+ snprintf(buf, kBufSize, "%0*" PRIu64, length, key);
+ return std::string(buf);
+}
+
+class CompactionJobStatsTest : public testing::Test,
+ public testing::WithParamInterface<bool> {
+ public:
+ std::string dbname_;
+ std::string alternative_wal_dir_;
+ Env* env_;
+ DB* db_;
+ std::vector<ColumnFamilyHandle*> handles_;
+ uint32_t max_subcompactions_;
+
+ Options last_options_;
+
+ CompactionJobStatsTest() : env_(Env::Default()) {
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ dbname_ = test::PerThreadDBPath("compaction_job_stats_test");
+ alternative_wal_dir_ = dbname_ + "/wal";
+ Options options;
+ options.create_if_missing = true;
+ max_subcompactions_ = GetParam();
+ options.max_subcompactions = max_subcompactions_;
+ auto delete_options = options;
+ delete_options.wal_dir = alternative_wal_dir_;
+ EXPECT_OK(DestroyDB(dbname_, delete_options));
+    // Destroy it again in case the alternative WAL dir was not used.
+ EXPECT_OK(DestroyDB(dbname_, options));
+ db_ = nullptr;
+ Reopen(options);
+ }
+
+ ~CompactionJobStatsTest() override {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ Close();
+ Options options;
+ options.db_paths.emplace_back(dbname_, 0);
+ options.db_paths.emplace_back(dbname_ + "_2", 0);
+ options.db_paths.emplace_back(dbname_ + "_3", 0);
+ options.db_paths.emplace_back(dbname_ + "_4", 0);
+ EXPECT_OK(DestroyDB(dbname_, options));
+ }
+
+ // Required if inheriting from testing::WithParamInterface<>
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ DBImpl* dbfull() {
+ return reinterpret_cast<DBImpl*>(db_);
+ }
+
+ void CreateColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ ColumnFamilyOptions cf_opts(options);
+ size_t cfi = handles_.size();
+ handles_.resize(cfi + cfs.size());
+ for (auto cf : cfs) {
+ ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]));
+ }
+ }
+
+ void CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+ const Options& options) {
+ CreateColumnFamilies(cfs, options);
+ std::vector<std::string> cfs_plus_default = cfs;
+ cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
+ ReopenWithColumnFamilies(cfs_plus_default, options);
+ }
+
+ void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const std::vector<Options>& options) {
+ ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+ }
+
+ void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+ }
+
+ Status TryReopenWithColumnFamilies(
+ const std::vector<std::string>& cfs,
+ const std::vector<Options>& options) {
+ Close();
+ EXPECT_EQ(cfs.size(), options.size());
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (size_t i = 0; i < cfs.size(); ++i) {
+ column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i]));
+ }
+ DBOptions db_opts = DBOptions(options[0]);
+ return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+ }
+
+ Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ Close();
+ std::vector<Options> v_opts(cfs.size(), options);
+ return TryReopenWithColumnFamilies(cfs, v_opts);
+ }
+
+ void Reopen(const Options& options) {
+ ASSERT_OK(TryReopen(options));
+ }
+
+ void Close() {
+ for (auto h : handles_) {
+ delete h;
+ }
+ handles_.clear();
+ delete db_;
+ db_ = nullptr;
+ }
+
+ void DestroyAndReopen(const Options& options) {
+ // Destroy using last options
+ Destroy(last_options_);
+ ASSERT_OK(TryReopen(options));
+ }
+
+ void Destroy(const Options& options) {
+ Close();
+ ASSERT_OK(DestroyDB(dbname_, options));
+ }
+
+ Status ReadOnlyReopen(const Options& options) {
+ return DB::OpenForReadOnly(options, dbname_, &db_);
+ }
+
+ Status TryReopen(const Options& options) {
+ Close();
+ last_options_ = options;
+ return DB::Open(options, dbname_, &db_);
+ }
+
+ Status Flush(int cf = 0) {
+ if (cf == 0) {
+ return db_->Flush(FlushOptions());
+ } else {
+ return db_->Flush(FlushOptions(), handles_[cf]);
+ }
+ }
+
+ Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions()) {
+ return db_->Put(wo, k, v);
+ }
+
+ Status Put(int cf, const Slice& k, const Slice& v,
+ WriteOptions wo = WriteOptions()) {
+ return db_->Put(wo, handles_[cf], k, v);
+ }
+
+ Status Delete(const std::string& k) {
+ return db_->Delete(WriteOptions(), k);
+ }
+
+ Status Delete(int cf, const std::string& k) {
+ return db_->Delete(WriteOptions(), handles_[cf], k);
+ }
+
+ std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ std::string result;
+ Status s = db_->Get(options, k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+
+ std::string Get(int cf, const std::string& k,
+ const Snapshot* snapshot = nullptr) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ std::string result;
+ Status s = db_->Get(options, handles_[cf], k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+
+ int NumTableFilesAtLevel(int level, int cf = 0) {
+ std::string property;
+ if (cf == 0) {
+ // default cfd
+ EXPECT_TRUE(db_->GetProperty(
+ "rocksdb.num-files-at-level" + NumberToString(level), &property));
+ } else {
+ EXPECT_TRUE(db_->GetProperty(
+ handles_[cf], "rocksdb.num-files-at-level" + NumberToString(level),
+ &property));
+ }
+ return atoi(property.c_str());
+ }
+
+ // Return spread of files per level
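+  // e.g. "3,2" means 3 files in L0 and 2 files in L1; trailing levels with
+  // zero files are trimmed from the result.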
+ std::string FilesPerLevel(int cf = 0) {
+ int num_levels =
+ (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]);
+ std::string result;
+ size_t last_non_zero_offset = 0;
+ for (int level = 0; level < num_levels; level++) {
+ int f = NumTableFilesAtLevel(level, cf);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+ result += buf;
+ if (f > 0) {
+ last_non_zero_offset = result.size();
+ }
+ }
+ result.resize(last_non_zero_offset);
+ return result;
+ }
+
+ uint64_t Size(const Slice& start, const Slice& limit, int cf = 0) {
+ Range r(start, limit);
+ uint64_t size;
+ if (cf == 0) {
+ db_->GetApproximateSizes(&r, 1, &size);
+ } else {
+ db_->GetApproximateSizes(handles_[1], &r, 1, &size);
+ }
+ return size;
+ }
+
+ void Compact(int cf, const Slice& start, const Slice& limit,
+ uint32_t target_path_id) {
+ CompactRangeOptions compact_options;
+ compact_options.target_path_id = target_path_id;
+ ASSERT_OK(db_->CompactRange(compact_options, handles_[cf], &start, &limit));
+ }
+
+ void Compact(int cf, const Slice& start, const Slice& limit) {
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), handles_[cf], &start, &limit));
+ }
+
+ void Compact(const Slice& start, const Slice& limit) {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &limit));
+ }
+
+ void TEST_Compact(int level, int cf, const Slice& start, const Slice& limit) {
+ ASSERT_OK(dbfull()->TEST_CompactRange(level, &start, &limit, handles_[cf],
+ true /* disallow trivial move */));
+ }
+
+ // Do n memtable compactions, each of which produces an sstable
+ // covering the range [small,large].
+ void MakeTables(int n, const std::string& small, const std::string& large,
+ int cf = 0) {
+ for (int i = 0; i < n; i++) {
+ ASSERT_OK(Put(cf, small, "begin"));
+ ASSERT_OK(Put(cf, large, "end"));
+ ASSERT_OK(Flush(cf));
+ }
+ }
+
+ static void SetDeletionCompactionStats(
+ CompactionJobStats *stats, uint64_t input_deletions,
+ uint64_t expired_deletions, uint64_t records_replaced) {
+ stats->num_input_deletion_records = input_deletions;
+ stats->num_expired_deletion_records = expired_deletions;
+ stats->num_records_replaced = records_replaced;
+ }
+
+ void MakeTableWithKeyValues(
+ Random* rnd, uint64_t smallest, uint64_t largest,
+ int key_size, int value_size, uint64_t interval,
+ double ratio, int cf = 0) {
+ for (auto key = smallest; key < largest; key += interval) {
+ ASSERT_OK(Put(cf, Slice(Key(key, key_size)),
+ Slice(RandomString(rnd, value_size, ratio))));
+ }
+ ASSERT_OK(Flush(cf));
+ }
+
+  // This function assumes that two rounds of keys have been inserted into
+  // the database, as done in DeletionStatsTest.
+ void SelectivelyDeleteKeys(uint64_t smallest, uint64_t largest,
+ uint64_t interval, int deletion_interval, int key_size,
+ uint64_t cutoff_key_num, CompactionJobStats* stats, int cf = 0) {
+
+    // interval needs to be >= 2 so that we can insert deletion entries that
+    // are not intended to delete an actual key, by using an offset of 1
+    // from an existing key.
+ ASSERT_GE(interval, 2);
+
+ uint64_t ctr = 1;
+ uint32_t deletions_made = 0;
+ uint32_t num_deleted = 0;
+ uint32_t num_expired = 0;
+ for (auto key = smallest; key <= largest; key += interval, ctr++) {
+ if (ctr % deletion_interval == 0) {
+ ASSERT_OK(Delete(cf, Key(key, key_size)));
+ deletions_made++;
+ num_deleted++;
+
+ if (key > cutoff_key_num) {
+ num_expired++;
+ }
+ }
+ }
+
+ // Insert some deletions for keys that don't exist that
+ // are both in and out of the key range
+ ASSERT_OK(Delete(cf, Key(smallest+1, key_size)));
+ deletions_made++;
+
+ ASSERT_OK(Delete(cf, Key(smallest-1, key_size)));
+ deletions_made++;
+ num_expired++;
+
+ ASSERT_OK(Delete(cf, Key(smallest-9, key_size)));
+ deletions_made++;
+ num_expired++;
+
+ ASSERT_OK(Flush(cf));
+ SetDeletionCompactionStats(stats, deletions_made, num_expired,
+ num_deleted);
+ }
+};
+
+// An EventListener which helps verify the compaction results in
+// test CompactionJobStatsTest.
+class CompactionJobStatsChecker : public EventListener {
+ public:
+ CompactionJobStatsChecker()
+ : compression_enabled_(false), verify_next_comp_io_stats_(false) {}
+
+ size_t NumberOfUnverifiedStats() { return expected_stats_.size(); }
+
+ void set_verify_next_comp_io_stats(bool v) { verify_next_comp_io_stats_ = v; }
+
+  // Once a compaction has completed, this function will verify the returned
+  // CompactionJobInfo against the oldest CompactionJobInfo added earlier
+  // to "expected_stats_" that has not yet been used for verification.
+ void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
+ if (verify_next_comp_io_stats_) {
+ ASSERT_GT(ci.stats.file_write_nanos, 0);
+ ASSERT_GT(ci.stats.file_range_sync_nanos, 0);
+ ASSERT_GT(ci.stats.file_fsync_nanos, 0);
+ ASSERT_GT(ci.stats.file_prepare_write_nanos, 0);
+ verify_next_comp_io_stats_ = false;
+ }
+
+ std::lock_guard<std::mutex> lock(mutex_);
+ if (expected_stats_.size()) {
+ Verify(ci.stats, expected_stats_.front());
+ expected_stats_.pop();
+ }
+ }
+
+  // A helper function which verifies whether two CompactionJobStats
+  // match. The verification of all compaction stats is done with
+  // ASSERT_EQ, except for the total input / output bytes, for which we
+  // use ASSERT_GE and ASSERT_LE with a reasonable bias:
+  // 10% in the uncompressed case and 20% when compression is used.
+ virtual void Verify(const CompactionJobStats& current_stats,
+ const CompactionJobStats& stats) {
+ // time
+ ASSERT_GT(current_stats.elapsed_micros, 0U);
+
+ ASSERT_EQ(current_stats.num_input_records,
+ stats.num_input_records);
+ ASSERT_EQ(current_stats.num_input_files,
+ stats.num_input_files);
+ ASSERT_EQ(current_stats.num_input_files_at_output_level,
+ stats.num_input_files_at_output_level);
+
+ ASSERT_EQ(current_stats.num_output_records,
+ stats.num_output_records);
+ ASSERT_EQ(current_stats.num_output_files,
+ stats.num_output_files);
+
+ ASSERT_EQ(current_stats.is_manual_compaction,
+ stats.is_manual_compaction);
+
+ // file size
+ double kFileSizeBias = compression_enabled_ ? 0.20 : 0.10;
+ ASSERT_GE(current_stats.total_input_bytes * (1.00 + kFileSizeBias),
+ stats.total_input_bytes);
+ ASSERT_LE(current_stats.total_input_bytes,
+ stats.total_input_bytes * (1.00 + kFileSizeBias));
+ ASSERT_GE(current_stats.total_output_bytes * (1.00 + kFileSizeBias),
+ stats.total_output_bytes);
+ ASSERT_LE(current_stats.total_output_bytes,
+ stats.total_output_bytes * (1.00 + kFileSizeBias));
+ ASSERT_EQ(current_stats.total_input_raw_key_bytes,
+ stats.total_input_raw_key_bytes);
+ ASSERT_EQ(current_stats.total_input_raw_value_bytes,
+ stats.total_input_raw_value_bytes);
+
+ ASSERT_EQ(current_stats.num_records_replaced,
+ stats.num_records_replaced);
+
+ ASSERT_EQ(current_stats.num_corrupt_keys,
+ stats.num_corrupt_keys);
+
+ ASSERT_EQ(
+ std::string(current_stats.smallest_output_key_prefix),
+ std::string(stats.smallest_output_key_prefix));
+ ASSERT_EQ(
+ std::string(current_stats.largest_output_key_prefix),
+ std::string(stats.largest_output_key_prefix));
+ }
+
+  // Add expected compaction stats, which will be used to
+ // verify the CompactionJobStats returned by the OnCompactionCompleted()
+ // callback.
+ void AddExpectedStats(const CompactionJobStats& stats) {
+ std::lock_guard<std::mutex> lock(mutex_);
+ expected_stats_.push(stats);
+ }
+
+ void EnableCompression(bool flag) {
+ compression_enabled_ = flag;
+ }
+
+ bool verify_next_comp_io_stats() const { return verify_next_comp_io_stats_; }
+
+ private:
+ std::mutex mutex_;
+ std::queue<CompactionJobStats> expected_stats_;
+ bool compression_enabled_;
+ bool verify_next_comp_io_stats_;
+};
+
+// An EventListener which helps verify the compaction statistics in
+// the test DeletionStatsTest.
+class CompactionJobDeletionStatsChecker : public CompactionJobStatsChecker {
+ public:
+ // Verifies whether two CompactionJobStats match.
+ void Verify(const CompactionJobStats& current_stats,
+ const CompactionJobStats& stats) override {
+ ASSERT_EQ(
+ current_stats.num_input_deletion_records,
+ stats.num_input_deletion_records);
+ ASSERT_EQ(
+ current_stats.num_expired_deletion_records,
+ stats.num_expired_deletion_records);
+ ASSERT_EQ(
+ current_stats.num_records_replaced,
+ stats.num_records_replaced);
+
+ ASSERT_EQ(current_stats.num_corrupt_keys,
+ stats.num_corrupt_keys);
+ }
+};
+
+namespace {
+
+uint64_t EstimatedFileSize(
+ uint64_t num_records, size_t key_size, size_t value_size,
+ double compression_ratio = 1.0,
+ size_t block_size = 4096,
+ int bloom_bits_per_key = 10) {
+ const size_t kPerKeyOverhead = 8;
+ const size_t kFooterSize = 512;
+
+ uint64_t data_size =
+ static_cast<uint64_t>(
+ num_records * (key_size + value_size * compression_ratio +
+ kPerKeyOverhead));
+
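+  // Rough example: 100 records with 10-byte keys and 1000-byte uncompressed
+  // values give data_size = 100 * (10 + 1000 + 8) = 101800 bytes; adding the
+  // 512-byte footer, ~125 filter bytes and ~447 index bytes yields ~102.9 KB.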
+ return data_size + kFooterSize
+ + num_records * bloom_bits_per_key / 8 // filter block
+ + data_size * (key_size + 8) / block_size; // index block
+}
+
+namespace {
+
+void CopyPrefix(
+ const Slice& src, size_t prefix_length, std::string* dst) {
+ assert(prefix_length > 0);
+ size_t length = src.size() > prefix_length ? prefix_length : src.size();
+ dst->assign(src.data(), length);
+}
+
+} // namespace
+
+CompactionJobStats NewManualCompactionJobStats(
+ const std::string& smallest_key, const std::string& largest_key,
+ size_t num_input_files, size_t num_input_files_at_output_level,
+ uint64_t num_input_records, size_t key_size, size_t value_size,
+ size_t num_output_files, uint64_t num_output_records,
+ double compression_ratio, uint64_t num_records_replaced,
+ bool is_manual = true) {
+ CompactionJobStats stats;
+ stats.Reset();
+
+ stats.num_input_records = num_input_records;
+ stats.num_input_files = num_input_files;
+ stats.num_input_files_at_output_level = num_input_files_at_output_level;
+
+ stats.num_output_records = num_output_records;
+ stats.num_output_files = num_output_files;
+
+ stats.total_input_bytes =
+ EstimatedFileSize(
+ num_input_records / num_input_files,
+ key_size, value_size, compression_ratio) * num_input_files;
+ stats.total_output_bytes =
+ EstimatedFileSize(
+ num_output_records / num_output_files,
+ key_size, value_size, compression_ratio) * num_output_files;
+ stats.total_input_raw_key_bytes =
+ num_input_records * (key_size + 8);
+ stats.total_input_raw_value_bytes =
+ num_input_records * value_size;
+
+ stats.is_manual_compaction = is_manual;
+
+ stats.num_records_replaced = num_records_replaced;
+
+ CopyPrefix(smallest_key,
+ CompactionJobStats::kMaxPrefixLength,
+ &stats.smallest_output_key_prefix);
+ CopyPrefix(largest_key,
+ CompactionJobStats::kMaxPrefixLength,
+ &stats.largest_output_key_prefix);
+
+ return stats;
+}
+
+CompressionType GetAnyCompression() {
+ if (Snappy_Supported()) {
+ return kSnappyCompression;
+ } else if (Zlib_Supported()) {
+ return kZlibCompression;
+ } else if (BZip2_Supported()) {
+ return kBZip2Compression;
+ } else if (LZ4_Supported()) {
+ return kLZ4Compression;
+ } else if (XPRESS_Supported()) {
+ return kXpressCompression;
+ }
+
+ return kNoCompression;
+}
+
+} // namespace
+
+TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) {
+ Random rnd(301);
+ const int kBufSize = 100;
+ char buf[kBufSize];
+ uint64_t key_base = 100000000l;
+  // Note: key_base must be a multiple of num_keys_per_L0_file
+ int num_keys_per_L0_file = 100;
+ const int kTestScale = 8;
+ const int kKeySize = 10;
+ const int kValueSize = 1000;
+ const double kCompressionRatio = 0.5;
+ double compression_ratio = 1.0;
+ uint64_t key_interval = key_base / num_keys_per_L0_file;
+
+ // Whenever a compaction completes, this listener will try to
+ // verify whether the returned CompactionJobStats matches
+ // what we expect. The expected CompactionJobStats is added
+ // via AddExpectedStats().
+ auto* stats_checker = new CompactionJobStatsChecker();
+ Options options;
+ options.listeners.emplace_back(stats_checker);
+ options.create_if_missing = true;
+  // Just enough settings to hold off auto-compaction.
+ options.level0_file_num_compaction_trigger = kTestScale + 1;
+ options.num_levels = 3;
+ options.compression = kNoCompression;
+ options.max_subcompactions = max_subcompactions_;
+ options.bytes_per_sync = 512 * 1024;
+
+ options.report_bg_io_stats = true;
+ for (int test = 0; test < 2; ++test) {
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // 1st Phase: generate "num_L0_files" L0 files.
+ int num_L0_files = 0;
+ for (uint64_t start_key = key_base;
+ start_key <= key_base * kTestScale;
+ start_key += key_base) {
+ MakeTableWithKeyValues(
+ &rnd, start_key, start_key + key_base - 1,
+ kKeySize, kValueSize, key_interval,
+ compression_ratio, 1);
+ snprintf(buf, kBufSize, "%d", ++num_L0_files);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+ }
+ ASSERT_EQ(ToString(num_L0_files), FilesPerLevel(1));
+
+ // 2nd Phase: perform L0 -> L1 compaction.
+ int L0_compaction_count = 6;
+ int count = 1;
+ std::string smallest_key;
+ std::string largest_key;
+ for (uint64_t start_key = key_base;
+ start_key <= key_base * L0_compaction_count;
+ start_key += key_base, count++) {
+ smallest_key = Key(start_key, 10);
+ largest_key = Key(start_key + key_base - key_interval, 10);
+ stats_checker->AddExpectedStats(
+ NewManualCompactionJobStats(
+ smallest_key, largest_key,
+ 1, 0, num_keys_per_L0_file,
+ kKeySize, kValueSize,
+ 1, num_keys_per_L0_file,
+ compression_ratio, 0));
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+ TEST_Compact(0, 1, smallest_key, largest_key);
+ snprintf(buf, kBufSize, "%d,%d", num_L0_files - count, count);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+ }
+
+ // compact two files into one in the last L0 -> L1 compaction
+ int num_remaining_L0 = num_L0_files - L0_compaction_count;
+ smallest_key = Key(key_base * (L0_compaction_count + 1), 10);
+ largest_key = Key(key_base * (kTestScale + 1) - key_interval, 10);
+ stats_checker->AddExpectedStats(
+ NewManualCompactionJobStats(
+ smallest_key, largest_key,
+ num_remaining_L0,
+ 0, num_keys_per_L0_file * num_remaining_L0,
+ kKeySize, kValueSize,
+ 1, num_keys_per_L0_file * num_remaining_L0,
+ compression_ratio, 0));
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+ TEST_Compact(0, 1, smallest_key, largest_key);
+
+ int num_L1_files = num_L0_files - num_remaining_L0 + 1;
+ num_L0_files = 0;
+ snprintf(buf, kBufSize, "%d,%d", num_L0_files, num_L1_files);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+
+ // 3rd Phase: generate sparse L0 files (wider key-range, same num of keys)
+ int sparseness = 2;
+ for (uint64_t start_key = key_base;
+ start_key <= key_base * kTestScale;
+ start_key += key_base * sparseness) {
+ MakeTableWithKeyValues(
+ &rnd, start_key, start_key + key_base * sparseness - 1,
+ kKeySize, kValueSize,
+ key_base * sparseness / num_keys_per_L0_file,
+ compression_ratio, 1);
+ snprintf(buf, kBufSize, "%d,%d", ++num_L0_files, num_L1_files);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+ }
+
+ // 4th Phase: perform L0 -> L1 compaction again, expect higher write amp
+ // When subcompactions are enabled, the number of output files increases
+ // by 1 because multiple threads are consuming the input and generating
+    // output files without coordinating to see if the output could fit into
+    // a smaller number of files, as it does when the compaction runs
+    // sequentially.
+ int num_output_files = options.max_subcompactions > 1 ? 2 : 1;
+ for (uint64_t start_key = key_base;
+ num_L0_files > 1;
+ start_key += key_base * sparseness) {
+ smallest_key = Key(start_key, 10);
+ largest_key =
+ Key(start_key + key_base * sparseness - key_interval, 10);
+ stats_checker->AddExpectedStats(
+ NewManualCompactionJobStats(
+ smallest_key, largest_key,
+ 3, 2, num_keys_per_L0_file * 3,
+ kKeySize, kValueSize,
+ num_output_files,
+ num_keys_per_L0_file * 2, // 1/3 of the data will be updated.
+ compression_ratio,
+ num_keys_per_L0_file));
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+ Compact(1, smallest_key, largest_key);
+ if (options.max_subcompactions == 1) {
+ --num_L1_files;
+ }
+ snprintf(buf, kBufSize, "%d,%d", --num_L0_files, num_L1_files);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+ }
+
+    // 5th Phase: Do a full compaction, which involves two sub-compactions.
+    // Here we expect to have 1 L0 file and 4 L1 files.
+    // In the first sub-compaction, we expect an L0 compaction.
+ smallest_key = Key(key_base, 10);
+ largest_key = Key(key_base * (kTestScale + 1) - key_interval, 10);
+ stats_checker->AddExpectedStats(
+ NewManualCompactionJobStats(
+ Key(key_base * (kTestScale + 1 - sparseness), 10), largest_key,
+ 2, 1, num_keys_per_L0_file * 3,
+ kKeySize, kValueSize,
+ 1, num_keys_per_L0_file * 2,
+ compression_ratio,
+ num_keys_per_L0_file));
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+ Compact(1, smallest_key, largest_key);
+
+ num_L1_files = options.max_subcompactions > 1 ? 7 : 4;
+ char L1_buf[4];
+ snprintf(L1_buf, sizeof(L1_buf), "0,%d", num_L1_files);
+ std::string L1_files(L1_buf);
+ ASSERT_EQ(L1_files, FilesPerLevel(1));
+ options.compression = GetAnyCompression();
+ if (options.compression == kNoCompression) {
+ break;
+ }
+ stats_checker->EnableCompression(true);
+ compression_ratio = kCompressionRatio;
+
+ for (int i = 0; i < 5; i++) {
+ ASSERT_OK(Put(1, Slice(Key(key_base + i, 10)),
+ Slice(RandomString(&rnd, 512 * 1024, 1))));
+ }
+
+ ASSERT_OK(Flush(1));
+ reinterpret_cast<DBImpl*>(db_)->TEST_WaitForCompact();
+
+ stats_checker->set_verify_next_comp_io_stats(true);
+ std::atomic<bool> first_prepare_write(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::Append:BeforePrepareWrite", [&](void* /*arg*/) {
+ if (first_prepare_write.load()) {
+ options.env->SleepForMicroseconds(3);
+ first_prepare_write.store(false);
+ }
+ });
+
+ std::atomic<bool> first_flush(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::Flush:BeforeAppend", [&](void* /*arg*/) {
+ if (first_flush.load()) {
+ options.env->SleepForMicroseconds(3);
+ first_flush.store(false);
+ }
+ });
+
+ std::atomic<bool> first_sync(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::SyncInternal:0", [&](void* /*arg*/) {
+ if (first_sync.load()) {
+ options.env->SleepForMicroseconds(3);
+ first_sync.store(false);
+ }
+ });
+
+ std::atomic<bool> first_range_sync(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::RangeSync:0", [&](void* /*arg*/) {
+ if (first_range_sync.load()) {
+ options.env->SleepForMicroseconds(3);
+ first_range_sync.store(false);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Compact(1, smallest_key, largest_key);
+
+ ASSERT_TRUE(!stats_checker->verify_next_comp_io_stats());
+ ASSERT_TRUE(!first_prepare_write.load());
+ ASSERT_TRUE(!first_flush.load());
+ ASSERT_TRUE(!first_sync.load());
+ ASSERT_TRUE(!first_range_sync.load());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U);
+}
+
+TEST_P(CompactionJobStatsTest, DeletionStatsTest) {
+ Random rnd(301);
+ uint64_t key_base = 100000l;
+  // Note: key_base must be a multiple of num_keys_per_L0_file
+ int num_keys_per_L0_file = 20;
+ const int kTestScale = 8; // make sure this is even
+ const int kKeySize = 10;
+ const int kValueSize = 100;
+ double compression_ratio = 1.0;
+ uint64_t key_interval = key_base / num_keys_per_L0_file;
+ uint64_t largest_key_num = key_base * (kTestScale + 1) - key_interval;
+ uint64_t cutoff_key_num = key_base * (kTestScale / 2 + 1) - key_interval;
+ const std::string smallest_key = Key(key_base - 10, kKeySize);
+ const std::string largest_key = Key(largest_key_num + 10, kKeySize);
+
+ // Whenever a compaction completes, this listener will try to
+ // verify whether the returned CompactionJobStats matches
+ // what we expect.
+ auto* stats_checker = new CompactionJobDeletionStatsChecker();
+ Options options;
+ options.listeners.emplace_back(stats_checker);
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = kTestScale+1;
+ options.num_levels = 3;
+ options.compression = kNoCompression;
+ options.max_bytes_for_level_multiplier = 2;
+ options.max_subcompactions = max_subcompactions_;
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Stage 1: Generate several L0 files and then send them to L2 by
+ // using CompactRangeOptions and CompactRange(). These files will
+ // have a strict subset of the keys from the full key-range
+ for (uint64_t start_key = key_base;
+ start_key <= key_base * kTestScale / 2;
+ start_key += key_base) {
+ MakeTableWithKeyValues(
+ &rnd, start_key, start_key + key_base - 1,
+ kKeySize, kValueSize, key_interval,
+ compression_ratio, 1);
+ }
+
+ CompactRangeOptions cr_options;
+ cr_options.change_level = true;
+ cr_options.target_level = 2;
+ db_->CompactRange(cr_options, handles_[1], nullptr, nullptr);
+ ASSERT_GT(NumTableFilesAtLevel(2, 1), 0);
+
+ // Stage 2: Generate files including keys from the entire key range
+ for (uint64_t start_key = key_base;
+ start_key <= key_base * kTestScale;
+ start_key += key_base) {
+ MakeTableWithKeyValues(
+ &rnd, start_key, start_key + key_base - 1,
+ kKeySize, kValueSize, key_interval,
+ compression_ratio, 1);
+ }
+
+ // Send these L0 files to L1
+ TEST_Compact(0, 1, smallest_key, largest_key);
+ ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
+
+  // Add a new record and flush so now there is an L0 file
+ // with a value too (not just deletions from the next step)
+ ASSERT_OK(Put(1, Key(key_base-6, kKeySize), "test"));
+ ASSERT_OK(Flush(1));
+
+ // Stage 3: Generate L0 files with some deletions so now
+ // there are files with the same key range in L0, L1, and L2
+ int deletion_interval = 3;
+ CompactionJobStats first_compaction_stats;
+ SelectivelyDeleteKeys(key_base, largest_key_num,
+ key_interval, deletion_interval, kKeySize, cutoff_key_num,
+ &first_compaction_stats, 1);
+
+ stats_checker->AddExpectedStats(first_compaction_stats);
+
+ // Stage 4: Trigger compaction and verify the stats
+ TEST_Compact(0, 1, smallest_key, largest_key);
+}
+
+namespace {
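+// Returns the lowest set bit of num_flushes as the expected number of input
+// units (newly flushed files) of the next universal compaction, or 0 when
+// that bit is 1 and no multi-file compaction is expected.
+// For example: 6 -> 2, 4 -> 4, 5 -> 0.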
+int GetUniversalCompactionInputUnits(uint32_t num_flushes) {
+ uint32_t compaction_input_units;
+ for (compaction_input_units = 1;
+ num_flushes >= compaction_input_units;
+ compaction_input_units *= 2) {
+ if ((num_flushes & compaction_input_units) != 0) {
+ return compaction_input_units > 1 ? compaction_input_units : 0;
+ }
+ }
+ return 0;
+}
+} // namespace
+
+TEST_P(CompactionJobStatsTest, UniversalCompactionTest) {
+ Random rnd(301);
+ uint64_t key_base = 100000000l;
+  // Note: key_base must be a multiple of num_keys_per_table
+ int num_keys_per_table = 100;
+ const uint32_t kTestScale = 6;
+ const int kKeySize = 10;
+ const int kValueSize = 900;
+ double compression_ratio = 1.0;
+ uint64_t key_interval = key_base / num_keys_per_table;
+
+ auto* stats_checker = new CompactionJobStatsChecker();
+ Options options;
+ options.listeners.emplace_back(stats_checker);
+ options.create_if_missing = true;
+ options.num_levels = 3;
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 2;
+ options.target_file_size_base = num_keys_per_table * 1000;
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.size_ratio = 1;
+ options.compaction_options_universal.max_size_amplification_percent = 1000;
+ options.max_subcompactions = max_subcompactions_;
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Generates the expected CompactionJobStats for each compaction
+ for (uint32_t num_flushes = 2; num_flushes <= kTestScale; num_flushes++) {
+    // Here we treat one newly flushed file as a unit.
+ //
+ // For example, if a newly flushed file is 100k, and a compaction has
+ // 4 input units, then this compaction inputs 400k.
+ uint32_t num_input_units = GetUniversalCompactionInputUnits(num_flushes);
+ if (num_input_units == 0) {
+ continue;
+ }
+ // The following statement determines the expected smallest key
+ // based on whether it is a full compaction. A full compaction only
+    // happens when the number of flushes equals the number of compaction
+ // input runs.
+ uint64_t smallest_key =
+ (num_flushes == num_input_units) ?
+ key_base : key_base * (num_flushes - 1);
+
+ stats_checker->AddExpectedStats(
+ NewManualCompactionJobStats(
+ Key(smallest_key, 10),
+ Key(smallest_key + key_base * num_input_units - key_interval, 10),
+ num_input_units,
+ num_input_units > 2 ? num_input_units / 2 : 0,
+ num_keys_per_table * num_input_units,
+ kKeySize, kValueSize,
+ num_input_units,
+ num_keys_per_table * num_input_units,
+ 1.0, 0, false));
+ dbfull()->TEST_WaitForCompact();
+ }
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 3U);
+
+ for (uint64_t start_key = key_base;
+ start_key <= key_base * kTestScale;
+ start_key += key_base) {
+ MakeTableWithKeyValues(
+ &rnd, start_key, start_key + key_base - 1,
+ kKeySize, kValueSize, key_interval,
+ compression_ratio, 1);
+ reinterpret_cast<DBImpl*>(db_)->TEST_WaitForCompact();
+ }
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U);
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionJobStatsTest, CompactionJobStatsTest,
+ ::testing::Values(1, 4));
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED, not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
+
+#else
+
+int main(int /*argc*/, char** /*argv*/) { return 0; }
+#endif // !defined(IOS_CROSS_COMPILE)
diff --git a/src/rocksdb/db/compaction/compaction_job_test.cc b/src/rocksdb/db/compaction/compaction_job_test.cc
new file mode 100644
index 000000000..e7b46ef97
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job_test.cc
@@ -0,0 +1,1082 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <array>
+#include <cinttypes>
+#include <map>
+#include <string>
+#include <tuple>
+
+#include "db/blob_index.h"
+#include "db/column_family.h"
+#include "db/compaction/compaction_job.h"
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "db/version_set.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/cache.h"
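+// Formats `key` as a zero-padded decimal string of the given length
+// (e.g. Key(42, 10) == "0000000042"); the length is capped at 1000.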
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/mock_table.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+void VerifyInitializationOfCompactionJobStats(
+ const CompactionJobStats& compaction_job_stats) {
+#if !defined(IOS_CROSS_COMPILE)
+ ASSERT_EQ(compaction_job_stats.elapsed_micros, 0U);
+
+ ASSERT_EQ(compaction_job_stats.num_input_records, 0U);
+ ASSERT_EQ(compaction_job_stats.num_input_files, 0U);
+ ASSERT_EQ(compaction_job_stats.num_input_files_at_output_level, 0U);
+
+ ASSERT_EQ(compaction_job_stats.num_output_records, 0U);
+ ASSERT_EQ(compaction_job_stats.num_output_files, 0U);
+
+ ASSERT_EQ(compaction_job_stats.is_manual_compaction, true);
+
+ ASSERT_EQ(compaction_job_stats.total_input_bytes, 0U);
+ ASSERT_EQ(compaction_job_stats.total_output_bytes, 0U);
+
+ ASSERT_EQ(compaction_job_stats.total_input_raw_key_bytes, 0U);
+ ASSERT_EQ(compaction_job_stats.total_input_raw_value_bytes, 0U);
+
+ ASSERT_EQ(compaction_job_stats.smallest_output_key_prefix[0], 0);
+ ASSERT_EQ(compaction_job_stats.largest_output_key_prefix[0], 0);
+
+ ASSERT_EQ(compaction_job_stats.num_records_replaced, 0U);
+
+ ASSERT_EQ(compaction_job_stats.num_input_deletion_records, 0U);
+ ASSERT_EQ(compaction_job_stats.num_expired_deletion_records, 0U);
+
+ ASSERT_EQ(compaction_job_stats.num_corrupt_keys, 0U);
+#endif // !defined(IOS_CROSS_COMPILE)
+}
+
+} // namespace
+
+// TODO(icanadi) Make it simpler once we mock out VersionSet
+class CompactionJobTest : public testing::Test {
+ public:
+ CompactionJobTest()
+ : env_(Env::Default()),
+ fs_(std::make_shared<LegacyFileSystemWrapper>(env_)),
+ dbname_(test::PerThreadDBPath("compaction_job_test")),
+ db_options_(),
+ mutable_cf_options_(cf_options_),
+ table_cache_(NewLRUCache(50000, 16)),
+ write_buffer_manager_(db_options_.db_write_buffer_size),
+ versions_(new VersionSet(dbname_, &db_options_, env_options_,
+ table_cache_.get(), &write_buffer_manager_,
+ &write_controller_,
+ /*block_cache_tracer=*/nullptr)),
+ shutting_down_(false),
+ preserve_deletes_seqnum_(0),
+ mock_table_factory_(new mock::MockTableFactory()),
+ error_handler_(nullptr, db_options_, &mutex_) {
+ EXPECT_OK(env_->CreateDirIfMissing(dbname_));
+ db_options_.env = env_;
+ db_options_.fs = fs_;
+ db_options_.db_paths.emplace_back(dbname_,
+ std::numeric_limits<uint64_t>::max());
+ }
+
+ std::string GenerateFileName(uint64_t file_number) {
+ FileMetaData meta;
+ std::vector<DbPath> db_paths;
+ db_paths.emplace_back(dbname_, std::numeric_limits<uint64_t>::max());
+ meta.fd = FileDescriptor(file_number, 0, 0);
+ return TableFileName(db_paths, meta.fd.GetNumber(), meta.fd.GetPathId());
+ }
+
+ static std::string KeyStr(const std::string& user_key,
+ const SequenceNumber seq_num, const ValueType t) {
+ return InternalKey(user_key, seq_num, t).Encode().ToString();
+ }
+
+ static std::string BlobStr(uint64_t blob_file_number, uint64_t offset,
+ uint64_t size) {
+ std::string blob_index;
+ BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size,
+ kNoCompression);
+ return blob_index;
+ }
+
+ static std::string BlobStrTTL(uint64_t blob_file_number, uint64_t offset,
+ uint64_t size, uint64_t expiration) {
+ std::string blob_index;
+ BlobIndex::EncodeBlobTTL(&blob_index, expiration, blob_file_number, offset,
+ size, kNoCompression);
+ return blob_index;
+ }
+
+ static std::string BlobStrInlinedTTL(const Slice& value,
+ uint64_t expiration) {
+ std::string blob_index;
+ BlobIndex::EncodeInlinedTTL(&blob_index, expiration, value);
+ return blob_index;
+ }
+
+ void AddMockFile(const stl_wrappers::KVMap& contents, int level = 0) {
+ assert(contents.size() > 0);
+
+ bool first_key = true;
+ std::string smallest, largest;
+ InternalKey smallest_key, largest_key;
+ SequenceNumber smallest_seqno = kMaxSequenceNumber;
+ SequenceNumber largest_seqno = 0;
+ uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
+ for (auto kv : contents) {
+ ParsedInternalKey key;
+ std::string skey;
+ std::string value;
+ std::tie(skey, value) = kv;
+ bool parsed = ParseInternalKey(skey, &key);
+
+ smallest_seqno = std::min(smallest_seqno, key.sequence);
+ largest_seqno = std::max(largest_seqno, key.sequence);
+
+ if (first_key ||
+ cfd_->user_comparator()->Compare(key.user_key, smallest) < 0) {
+ smallest.assign(key.user_key.data(), key.user_key.size());
+ smallest_key.DecodeFrom(skey);
+ }
+ if (first_key ||
+ cfd_->user_comparator()->Compare(key.user_key, largest) > 0) {
+ largest.assign(key.user_key.data(), key.user_key.size());
+ largest_key.DecodeFrom(skey);
+ }
+
+ first_key = false;
+
+ if (parsed && key.type == kTypeBlobIndex) {
+ BlobIndex blob_index;
+ const Status s = blob_index.DecodeFrom(value);
+ if (!s.ok()) {
+ continue;
+ }
+
+ if (blob_index.IsInlined() || blob_index.HasTTL() ||
+ blob_index.file_number() == kInvalidBlobFileNumber) {
+ continue;
+ }
+
+ if (oldest_blob_file_number == kInvalidBlobFileNumber ||
+ oldest_blob_file_number > blob_index.file_number()) {
+ oldest_blob_file_number = blob_index.file_number();
+ }
+ }
+ }
+
+ uint64_t file_number = versions_->NewFileNumber();
+ EXPECT_OK(mock_table_factory_->CreateMockTable(
+ env_, GenerateFileName(file_number), std::move(contents)));
+
+ VersionEdit edit;
+ edit.AddFile(level, file_number, 0, 10, smallest_key, largest_key,
+ smallest_seqno, largest_seqno, false, oldest_blob_file_number,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+
+ mutex_.Lock();
+ versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
+ mutable_cf_options_, &edit, &mutex_);
+ mutex_.Unlock();
+ }
+
+ void SetLastSequence(const SequenceNumber sequence_number) {
+ versions_->SetLastAllocatedSequence(sequence_number + 1);
+ versions_->SetLastPublishedSequence(sequence_number + 1);
+ versions_->SetLastSequence(sequence_number + 1);
+ }
+
+  // Returns the expected result after compaction.
+ stl_wrappers::KVMap CreateTwoFiles(bool gen_corrupted_keys) {
+ auto expected_results = mock::MakeMockFile();
+ const int kKeysPerFile = 10000;
+ const int kCorruptKeysPerFile = 200;
+ const int kMatchingKeys = kKeysPerFile / 2;
+ SequenceNumber sequence_number = 0;
+
+ auto corrupt_id = [&](int id) {
+ return gen_corrupted_keys && id > 0 && id <= kCorruptKeysPerFile;
+ };
+
+ for (int i = 0; i < 2; ++i) {
+ auto contents = mock::MakeMockFile();
+ for (int k = 0; k < kKeysPerFile; ++k) {
+ auto key = ToString(i * kMatchingKeys + k);
+ auto value = ToString(i * kKeysPerFile + k);
+ InternalKey internal_key(key, ++sequence_number, kTypeValue);
+
+        // This is what the key will look like once it's written to the
+        // bottommost file.
+ InternalKey bottommost_internal_key(
+ key, 0, kTypeValue);
+
+ if (corrupt_id(k)) {
+ test::CorruptKeyType(&internal_key);
+ test::CorruptKeyType(&bottommost_internal_key);
+ }
+ contents.insert({ internal_key.Encode().ToString(), value });
+ if (i == 1 || k < kMatchingKeys || corrupt_id(k - kMatchingKeys)) {
+ expected_results.insert(
+ { bottommost_internal_key.Encode().ToString(), value });
+ }
+ }
+
+ AddMockFile(contents);
+ }
+
+ SetLastSequence(sequence_number);
+
+ return expected_results;
+ }
+
+ void NewDB() {
+ DestroyDB(dbname_, Options());
+ EXPECT_OK(env_->CreateDirIfMissing(dbname_));
+ versions_.reset(new VersionSet(dbname_, &db_options_, env_options_,
+ table_cache_.get(), &write_buffer_manager_,
+ &write_controller_,
+ /*block_cache_tracer=*/nullptr));
+ compaction_job_stats_.Reset();
+ SetIdentityFile(env_, dbname_);
+
+ VersionEdit new_db;
+ if (db_options_.write_dbid_to_manifest) {
+ DBImpl* impl = new DBImpl(DBOptions(), dbname_);
+ std::string db_id;
+ impl->GetDbIdentityFromIdentityFile(&db_id);
+ new_db.SetDBId(db_id);
+ }
+ new_db.SetLogNumber(0);
+ new_db.SetNextFile(2);
+ new_db.SetLastSequence(0);
+
+ const std::string manifest = DescriptorFileName(dbname_, 1);
+ std::unique_ptr<WritableFile> file;
+ Status s = env_->NewWritableFile(
+ manifest, &file, env_->OptimizeForManifestWrite(env_options_));
+ ASSERT_OK(s);
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ NewLegacyWritableFileWrapper(std::move(file)), manifest, env_options_));
+ {
+ log::Writer log(std::move(file_writer), 0, false);
+ std::string record;
+ new_db.EncodeTo(&record);
+ s = log.AddRecord(record);
+ }
+ ASSERT_OK(s);
+ // Make "CURRENT" file that points to the new manifest file.
+ s = SetCurrentFile(env_, dbname_, 1, nullptr);
+
+ std::vector<ColumnFamilyDescriptor> column_families;
+ cf_options_.table_factory = mock_table_factory_;
+ cf_options_.merge_operator = merge_op_;
+ cf_options_.compaction_filter = compaction_filter_.get();
+ column_families.emplace_back(kDefaultColumnFamilyName, cf_options_);
+
+ EXPECT_OK(versions_->Recover(column_families, false));
+ cfd_ = versions_->GetColumnFamilySet()->GetDefault();
+ }
+
+ void RunCompaction(
+ const std::vector<std::vector<FileMetaData*>>& input_files,
+ const stl_wrappers::KVMap& expected_results,
+ const std::vector<SequenceNumber>& snapshots = {},
+ SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber,
+ int output_level = 1, bool verify = true,
+ uint64_t expected_oldest_blob_file_number = kInvalidBlobFileNumber) {
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+
+ size_t num_input_files = 0;
+ std::vector<CompactionInputFiles> compaction_input_files;
+ for (size_t level = 0; level < input_files.size(); level++) {
+ auto level_files = input_files[level];
+ CompactionInputFiles compaction_level;
+ compaction_level.level = static_cast<int>(level);
+ compaction_level.files.insert(compaction_level.files.end(),
+ level_files.begin(), level_files.end());
+ compaction_input_files.push_back(compaction_level);
+ num_input_files += level_files.size();
+ }
+
+ Compaction compaction(cfd->current()->storage_info(), *cfd->ioptions(),
+ *cfd->GetLatestMutableCFOptions(),
+ compaction_input_files, output_level, 1024 * 1024,
+ 10 * 1024 * 1024, 0, kNoCompression,
+ cfd->ioptions()->compression_opts, 0, {}, true);
+ compaction.SetInputVersion(cfd->current());
+
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get());
+ mutex_.Lock();
+ EventLogger event_logger(db_options_.info_log.get());
+ // TODO(yiwu) add a mock snapshot checker and add test for it.
+ SnapshotChecker* snapshot_checker = nullptr;
+ CompactionJob compaction_job(
+ 0, &compaction, db_options_, env_options_, versions_.get(),
+ &shutting_down_, preserve_deletes_seqnum_, &log_buffer, nullptr,
+ nullptr, nullptr, &mutex_, &error_handler_, snapshots,
+ earliest_write_conflict_snapshot, snapshot_checker, table_cache_,
+ &event_logger, false, false, dbname_, &compaction_job_stats_,
+ Env::Priority::USER);
+ VerifyInitializationOfCompactionJobStats(compaction_job_stats_);
+
+ compaction_job.Prepare();
+ mutex_.Unlock();
+ Status s;
+ s = compaction_job.Run();
+ ASSERT_OK(s);
+ mutex_.Lock();
+ ASSERT_OK(compaction_job.Install(*cfd->GetLatestMutableCFOptions()));
+ mutex_.Unlock();
+
+ if (verify) {
+ ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U);
+ ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files);
+
+ if (expected_results.empty()) {
+ ASSERT_EQ(compaction_job_stats_.num_output_files, 0U);
+ } else {
+ ASSERT_EQ(compaction_job_stats_.num_output_files, 1U);
+ mock_table_factory_->AssertLatestFile(expected_results);
+
+ auto output_files =
+ cfd->current()->storage_info()->LevelFiles(output_level);
+ ASSERT_EQ(output_files.size(), 1);
+ ASSERT_EQ(output_files[0]->oldest_blob_file_number,
+ expected_oldest_blob_file_number);
+ }
+ }
+ }
+
+ Env* env_;
+ std::shared_ptr<FileSystem> fs_;
+ std::string dbname_;
+ EnvOptions env_options_;
+ ImmutableDBOptions db_options_;
+ ColumnFamilyOptions cf_options_;
+ MutableCFOptions mutable_cf_options_;
+ std::shared_ptr<Cache> table_cache_;
+ WriteController write_controller_;
+ WriteBufferManager write_buffer_manager_;
+ std::unique_ptr<VersionSet> versions_;
+ InstrumentedMutex mutex_;
+ std::atomic<bool> shutting_down_;
+ SequenceNumber preserve_deletes_seqnum_;
+ std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
+ CompactionJobStats compaction_job_stats_;
+ ColumnFamilyData* cfd_;
+ std::unique_ptr<CompactionFilter> compaction_filter_;
+ std::shared_ptr<MergeOperator> merge_op_;
+ ErrorHandler error_handler_;
+};
+
+TEST_F(CompactionJobTest, Simple) {
+ NewDB();
+
+ auto expected_results = CreateTwoFiles(false);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ auto files = cfd->current()->storage_info()->LevelFiles(0);
+ ASSERT_EQ(2U, files.size());
+ RunCompaction({ files }, expected_results);
+}
+
+TEST_F(CompactionJobTest, SimpleCorrupted) {
+ NewDB();
+
+ auto expected_results = CreateTwoFiles(true);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ auto files = cfd->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
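+  // CreateTwoFiles(true) corrupts kCorruptKeysPerFile (200) keys per file,
+  // so the two input files contribute 400 corrupt keys in total.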
+ ASSERT_EQ(compaction_job_stats_.num_corrupt_keys, 400U);
+}
+
+TEST_F(CompactionJobTest, SimpleDeletion) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({{KeyStr("c", 4U, kTypeDeletion), ""},
+ {KeyStr("c", 3U, kTypeValue), "val"}});
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("b", 2U, kTypeValue), "val"},
+ {KeyStr("b", 1U, kTypeValue), "val"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("b", 0U, kTypeValue), "val"}});
+
+ SetLastSequence(4U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, OutputNothing) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"}});
+
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 2U, kTypeDeletion), ""}});
+
+ AddMockFile(file2);
+
+ auto expected_results = mock::MakeMockFile();
+
+ SetLastSequence(4U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, SimpleOverwrite) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 3U, kTypeValue), "val2"},
+ {KeyStr("b", 4U, kTypeValue), "val3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"},
+ {KeyStr("b", 2U, kTypeValue), "val"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "val2"},
+ {KeyStr("b", 0U, kTypeValue), "val3"}});
+
+ SetLastSequence(4U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, SimpleNonLastLevel) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeValue), "val2"},
+ {KeyStr("b", 6U, kTypeValue), "val3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 3U, kTypeValue), "val"},
+ {KeyStr("b", 4U, kTypeValue), "val"}});
+ AddMockFile(file2, 1);
+
+ auto file3 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"},
+ {KeyStr("b", 2U, kTypeValue), "val"}});
+ AddMockFile(file3, 2);
+
+ // Because level 1 is not the last level, the sequence numbers of a and b
+ // cannot be set to 0
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 5U, kTypeValue), "val2"},
+ {KeyStr("b", 6U, kTypeValue), "val3"}});
+
+ SetLastSequence(6U);
+ auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0);
+ auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1);
+ RunCompaction({lvl0_files, lvl1_files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, SimpleMerge) {
+ merge_op_ = MergeOperators::CreateStringAppendOperator();
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeMerge), "5"},
+ {KeyStr("a", 4U, kTypeMerge), "4"},
+ {KeyStr("a", 3U, kTypeValue), "3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile(
+ {{KeyStr("b", 2U, kTypeMerge), "2"}, {KeyStr("b", 1U, kTypeValue), "1"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "3,4,5"},
+ {KeyStr("b", 0U, kTypeValue), "1,2"}});
+
+ SetLastSequence(5U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, NonAssocMerge) {
+ merge_op_ = MergeOperators::CreateStringAppendTESTOperator();
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeMerge), "5"},
+ {KeyStr("a", 4U, kTypeMerge), "4"},
+ {KeyStr("a", 3U, kTypeMerge), "3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile(
+ {{KeyStr("b", 2U, kTypeMerge), "2"}, {KeyStr("b", 1U, kTypeMerge), "1"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "3,4,5"},
+ {KeyStr("b", 0U, kTypeValue), "1,2"}});
+
+ SetLastSequence(5U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
+}
+
+// Filters merge operands with value 10.
+TEST_F(CompactionJobTest, MergeOperandFilter) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ compaction_filter_.reset(new test::FilterNumber(10U));
+ NewDB();
+
+ auto file1 = mock::MakeMockFile(
+ {{KeyStr("a", 5U, kTypeMerge), test::EncodeInt(5U)},
+ {KeyStr("a", 4U, kTypeMerge), test::EncodeInt(10U)}, // Filtered
+ {KeyStr("a", 3U, kTypeMerge), test::EncodeInt(3U)}});
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("b", 2U, kTypeMerge), test::EncodeInt(2U)},
+ {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)} // Filtered
+ });
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), test::EncodeInt(8U)},
+ {KeyStr("b", 0U, kTypeValue), test::EncodeInt(2U)}});
+
+ SetLastSequence(5U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, FilterSomeMergeOperands) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ compaction_filter_.reset(new test::FilterNumber(10U));
+ NewDB();
+
+ auto file1 = mock::MakeMockFile(
+ {{KeyStr("a", 5U, kTypeMerge), test::EncodeInt(5U)},
+ {KeyStr("a", 4U, kTypeMerge), test::EncodeInt(10U)}, // Filtered
+ {KeyStr("a", 3U, kTypeValue), test::EncodeInt(5U)},
+ {KeyStr("d", 8U, kTypeMerge), test::EncodeInt(10U)}});
+ AddMockFile(file1);
+
+ auto file2 =
+ mock::MakeMockFile({{KeyStr("b", 2U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("c", 2U, kTypeMerge), test::EncodeInt(3U)},
+ {KeyStr("c", 1U, kTypeValue), test::EncodeInt(7U)},
+ {KeyStr("d", 1U, kTypeValue), test::EncodeInt(6U)}});
+ AddMockFile(file2);
+
+ auto file3 =
+ mock::MakeMockFile({{KeyStr("a", 1U, kTypeMerge), test::EncodeInt(3U)}});
+ AddMockFile(file3, 2);
+
+ auto expected_results = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeValue), test::EncodeInt(10U)},
+ {KeyStr("c", 2U, kTypeValue), test::EncodeInt(10U)},
+ {KeyStr("d", 1U, kTypeValue), test::EncodeInt(6U)}
+ // b does not appear because the operands are filtered
+ });
+
+ SetLastSequence(5U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
+}
+
+// Test where all operands/merge results are filtered out.
+TEST_F(CompactionJobTest, FilterAllMergeOperands) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ compaction_filter_.reset(new test::FilterNumber(10U));
+ NewDB();
+
+ auto file1 =
+ mock::MakeMockFile({{KeyStr("a", 11U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("a", 10U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("a", 9U, kTypeMerge), test::EncodeInt(10U)}});
+ AddMockFile(file1);
+
+ auto file2 =
+ mock::MakeMockFile({{KeyStr("b", 8U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 7U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 6U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 5U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 4U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 3U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 2U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("c", 2U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("c", 1U, kTypeMerge), test::EncodeInt(10U)}});
+ AddMockFile(file2);
+
+ auto file3 =
+ mock::MakeMockFile({{KeyStr("a", 2U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)}});
+ AddMockFile(file3, 2);
+
+ SetLastSequence(11U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+
+ stl_wrappers::KVMap empty_map;
+ RunCompaction({files}, empty_map);
+}
+
+TEST_F(CompactionJobTest, SimpleSingleDelete) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeDeletion), ""},
+ {KeyStr("b", 6U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 3U, kTypeValue), "val"},
+ {KeyStr("b", 4U, kTypeValue), "val"}});
+ AddMockFile(file2);
+
+ auto file3 = mock::MakeMockFile({
+ {KeyStr("a", 1U, kTypeValue), "val"},
+ });
+ AddMockFile(file3, 2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 5U, kTypeDeletion), ""}});
+
+ SetLastSequence(6U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, SingleDeleteSnapshots) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("a", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("b", 21U, kTypeSingleDeletion), ""},
+ {KeyStr("c", 22U, kTypeSingleDeletion), ""},
+ {KeyStr("d", 9U, kTypeSingleDeletion), ""},
+ {KeyStr("f", 21U, kTypeSingleDeletion), ""},
+ {KeyStr("j", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("j", 9U, kTypeSingleDeletion), ""},
+ {KeyStr("k", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("k", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("l", 3U, kTypeSingleDeletion), ""},
+ {KeyStr("l", 2U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("0", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("a", 11U, kTypeValue), "val1"},
+ {KeyStr("b", 11U, kTypeValue), "val2"},
+ {KeyStr("c", 21U, kTypeValue), "val3"},
+ {KeyStr("d", 8U, kTypeValue), "val4"},
+ {KeyStr("e", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("f", 1U, kTypeValue), "val1"},
+ {KeyStr("g", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("h", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("m", 12U, kTypeValue), "val1"},
+ {KeyStr("m", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("m", 8U, kTypeValue), "val2"},
+ });
+ AddMockFile(file2);
+
+ auto file3 = mock::MakeMockFile({
+ {KeyStr("A", 1U, kTypeValue), "val"},
+ {KeyStr("e", 1U, kTypeValue), "val"},
+ });
+ AddMockFile(file3, 2);
+
+ auto expected_results = mock::MakeMockFile({
+ {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("a", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("a", 11U, kTypeValue), ""},
+ {KeyStr("b", 21U, kTypeSingleDeletion), ""},
+ {KeyStr("b", 11U, kTypeValue), "val2"},
+ {KeyStr("c", 22U, kTypeSingleDeletion), ""},
+ {KeyStr("c", 21U, kTypeValue), ""},
+ {KeyStr("e", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("f", 21U, kTypeSingleDeletion), ""},
+ {KeyStr("f", 1U, kTypeValue), "val1"},
+ {KeyStr("g", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("j", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("k", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("m", 12U, kTypeValue), "val1"},
+ {KeyStr("m", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("m", 8U, kTypeValue), "val2"},
+ });
+
+ SetLastSequence(22U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results, {10U, 20U}, 10U);
+}
+
+TEST_F(CompactionJobTest, EarliestWriteConflictSnapshot) {
+ NewDB();
+
+ // Test multiple snapshots where the earliest snapshot is not a
+  // write-conflict snapshot.
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("A", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 23U, kTypeValue), "val"},
+ {KeyStr("B", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 23U, kTypeValue), "val"},
+ {KeyStr("D", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 32U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 31U, kTypeValue), "val"},
+ {KeyStr("G", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 23U, kTypeValue), "val2"},
+ {KeyStr("H", 31U, kTypeValue), "val"},
+ {KeyStr("H", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 23U, kTypeValue), "val"},
+ {KeyStr("I", 35U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 34U, kTypeValue), "val2"},
+ {KeyStr("I", 33U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 32U, kTypeValue), "val3"},
+ {KeyStr("I", 31U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 34U, kTypeValue), "val"},
+ {KeyStr("J", 33U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 25U, kTypeValue), "val2"},
+ {KeyStr("J", 24U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("A", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 13U, kTypeValue), "val2"},
+ {KeyStr("C", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("C", 13U, kTypeValue), "val"},
+ {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("F", 4U, kTypeSingleDeletion), ""},
+ {KeyStr("F", 3U, kTypeValue), "val"},
+ {KeyStr("G", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 13U, kTypeValue), "val3"},
+ {KeyStr("H", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 13U, kTypeValue), "val2"},
+ {KeyStr("I", 13U, kTypeValue), "val4"},
+ {KeyStr("I", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 11U, kTypeValue), "val5"},
+ {KeyStr("J", 15U, kTypeValue), "val3"},
+ {KeyStr("J", 14U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file2);
+
+ auto expected_results = mock::MakeMockFile({
+ {KeyStr("A", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 23U, kTypeValue), ""},
+ {KeyStr("B", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 23U, kTypeValue), ""},
+ {KeyStr("D", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 32U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 31U, kTypeValue), ""},
+ {KeyStr("H", 31U, kTypeValue), "val"},
+ {KeyStr("I", 35U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 34U, kTypeValue), ""},
+ {KeyStr("I", 31U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 13U, kTypeValue), "val4"},
+ {KeyStr("J", 34U, kTypeValue), "val"},
+ {KeyStr("J", 33U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 25U, kTypeValue), "val2"},
+ {KeyStr("J", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 15U, kTypeValue), "val3"},
+ {KeyStr("J", 14U, kTypeSingleDeletion), ""},
+ });
+
+ SetLastSequence(24U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results, {10U, 20U, 30U}, 20U);
+}
+
+TEST_F(CompactionJobTest, SingleDeleteZeroSeq) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("A", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("dummy", 5U, kTypeValue), "val2"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("A", 0U, kTypeValue), "val"},
+ });
+ AddMockFile(file2);
+
+ auto expected_results = mock::MakeMockFile({
+ {KeyStr("dummy", 0U, kTypeValue), "val2"},
+ });
+
+ SetLastSequence(22U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results, {});
+}
+
+TEST_F(CompactionJobTest, MultiSingleDelete) {
+  // Tests several scenarios involving multiple single delete/put pairs:
+ //
+ // A: Put Snapshot SDel Put SDel -> Put Snapshot SDel
+ // B: Snapshot Put SDel Put SDel Snapshot -> Snapshot SDel Snapshot
+ // C: SDel Put SDel Snapshot Put -> Snapshot Put
+ // D: (Put) SDel Snapshot Put SDel -> (Put) SDel Snapshot SDel
+ // E: Put SDel Snapshot Put SDel -> Snapshot SDel
+  // F: Put SDel Put SDel Snapshot -> removed
+ // G: Snapshot SDel Put SDel Put -> Snapshot Put SDel
+  // H: (Put) Put SDel Put SDel Snapshot -> removed
+ // I: (Put) Snapshot Put SDel Put SDel -> SDel
+ // J: Put Put SDel Put SDel SDel Snapshot Put Put SDel SDel Put
+ // -> Snapshot Put
+ // K: SDel SDel Put SDel Put Put Snapshot SDel Put SDel SDel Put SDel
+ // -> Snapshot Put Snapshot SDel
+ // L: SDel Put Del Put SDel Snapshot Del Put Del SDel Put SDel
+ // -> Snapshot SDel
+ // M: (Put) SDel Put Del Put SDel Snapshot Put Del SDel Put SDel Del
+ // -> SDel Snapshot Del
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("A", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 13U, kTypeValue), "val5"},
+ {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 13U, kTypeValue), "val2"},
+ {KeyStr("C", 14U, kTypeValue), "val3"},
+ {KeyStr("D", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("D", 11U, kTypeValue), "val4"},
+ {KeyStr("G", 15U, kTypeValue), "val"},
+ {KeyStr("G", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 13U, kTypeValue), "val"},
+ {KeyStr("I", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 13U, kTypeValue), "val"},
+ {KeyStr("J", 15U, kTypeValue), "val"},
+ {KeyStr("J", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 13U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 12U, kTypeValue), "val"},
+ {KeyStr("J", 11U, kTypeValue), "val"},
+ {KeyStr("K", 16U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 15U, kTypeValue), "val1"},
+ {KeyStr("K", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 13U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 12U, kTypeValue), "val2"},
+ {KeyStr("K", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 16U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 15U, kTypeValue), "val"},
+ {KeyStr("L", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 13U, kTypeDeletion), ""},
+ {KeyStr("L", 12U, kTypeValue), "val"},
+ {KeyStr("L", 11U, kTypeDeletion), ""},
+ {KeyStr("M", 16U, kTypeDeletion), ""},
+ {KeyStr("M", 15U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 14U, kTypeValue), "val"},
+ {KeyStr("M", 13U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 12U, kTypeDeletion), ""},
+ {KeyStr("M", 11U, kTypeValue), "val"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("A", 10U, kTypeValue), "val"},
+ {KeyStr("B", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 11U, kTypeValue), "val2"},
+ {KeyStr("C", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("C", 9U, kTypeValue), "val6"},
+ {KeyStr("C", 8U, kTypeSingleDeletion), ""},
+ {KeyStr("D", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 11U, kTypeValue), "val"},
+ {KeyStr("E", 5U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 4U, kTypeValue), "val"},
+ {KeyStr("F", 6U, kTypeSingleDeletion), ""},
+ {KeyStr("F", 5U, kTypeValue), "val"},
+ {KeyStr("F", 4U, kTypeSingleDeletion), ""},
+ {KeyStr("F", 3U, kTypeValue), "val"},
+ {KeyStr("G", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 6U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 5U, kTypeValue), "val"},
+ {KeyStr("H", 4U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 3U, kTypeValue), "val"},
+ {KeyStr("I", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 11U, kTypeValue), "val"},
+ {KeyStr("J", 6U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 5U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 4U, kTypeValue), "val"},
+ {KeyStr("J", 3U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 2U, kTypeValue), "val"},
+ {KeyStr("K", 8U, kTypeValue), "val3"},
+ {KeyStr("K", 7U, kTypeValue), "val4"},
+ {KeyStr("K", 6U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 5U, kTypeValue), "val5"},
+ {KeyStr("K", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 1U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 5U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 4U, kTypeValue), "val"},
+ {KeyStr("L", 3U, kTypeDeletion), ""},
+ {KeyStr("L", 2U, kTypeValue), "val"},
+ {KeyStr("L", 1U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 7U, kTypeValue), "val"},
+ {KeyStr("M", 5U, kTypeDeletion), ""},
+ {KeyStr("M", 4U, kTypeValue), "val"},
+ {KeyStr("M", 3U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file2);
+
+ auto file3 = mock::MakeMockFile({
+ {KeyStr("D", 1U, kTypeValue), "val"},
+ {KeyStr("H", 1U, kTypeValue), "val"},
+ {KeyStr("I", 2U, kTypeValue), "val"},
+ });
+ AddMockFile(file3, 2);
+
+ auto file4 = mock::MakeMockFile({
+ {KeyStr("M", 1U, kTypeValue), "val"},
+ });
+ AddMockFile(file4, 2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("A", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 13U, kTypeValue), ""},
+ {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 10U, kTypeValue), "val"},
+ {KeyStr("B", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 13U, kTypeValue), ""},
+ {KeyStr("C", 14U, kTypeValue), "val3"},
+ {KeyStr("D", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("D", 11U, kTypeValue), ""},
+ {KeyStr("D", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 11U, kTypeValue), ""},
+ {KeyStr("G", 15U, kTypeValue), "val"},
+ {KeyStr("G", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 13U, kTypeValue), ""},
+ {KeyStr("J", 15U, kTypeValue), "val"},
+ {KeyStr("K", 16U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 15U, kTypeValue), ""},
+ {KeyStr("K", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 8U, kTypeValue), "val3"},
+ {KeyStr("L", 16U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 15U, kTypeValue), ""},
+ {KeyStr("M", 16U, kTypeDeletion), ""},
+ {KeyStr("M", 3U, kTypeSingleDeletion), ""}});
+
+ SetLastSequence(22U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results, {10U}, 10U);
+}
+
+// This test documents the behavior where a corrupt key follows a deletion or a
+// single deletion and the (single) deletion gets removed while the corrupt key
+// gets written out. TODO(noetzli): We probably want a better way to treat
+// corrupt keys.
+TEST_F(CompactionJobTest, CorruptionAfterDeletion) {
+ NewDB();
+
+ auto file1 =
+ mock::MakeMockFile({{test::KeyStr("A", 6U, kTypeValue), "val3"},
+ {test::KeyStr("a", 5U, kTypeDeletion), ""},
+ {test::KeyStr("a", 4U, kTypeValue, true), "val"}});
+ AddMockFile(file1);
+
+ auto file2 =
+ mock::MakeMockFile({{test::KeyStr("b", 3U, kTypeSingleDeletion), ""},
+ {test::KeyStr("b", 2U, kTypeValue, true), "val"},
+ {test::KeyStr("c", 1U, kTypeValue), "val2"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{test::KeyStr("A", 0U, kTypeValue), "val3"},
+ {test::KeyStr("a", 0U, kTypeValue, true), "val"},
+ {test::KeyStr("b", 0U, kTypeValue, true), "val"},
+ {test::KeyStr("c", 0U, kTypeValue), "val2"}});
+
+ SetLastSequence(6U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, OldestBlobFileNumber) {
+ NewDB();
+
+ // Note: blob1 is inlined TTL, so it will not be considered for the purposes
+ // of identifying the oldest referenced blob file. Similarly, blob6 will be
+ // ignored because it has TTL and hence refers to a TTL blob file.
+ const stl_wrappers::KVMap::value_type blob1(
+ KeyStr("a", 1U, kTypeBlobIndex), BlobStrInlinedTTL("foo", 1234567890ULL));
+ const stl_wrappers::KVMap::value_type blob2(KeyStr("b", 2U, kTypeBlobIndex),
+ BlobStr(59, 123456, 999));
+ const stl_wrappers::KVMap::value_type blob3(KeyStr("c", 3U, kTypeBlobIndex),
+ BlobStr(138, 1000, 1 << 8));
+ auto file1 = mock::MakeMockFile({blob1, blob2, blob3});
+ AddMockFile(file1);
+
+ const stl_wrappers::KVMap::value_type blob4(KeyStr("d", 4U, kTypeBlobIndex),
+ BlobStr(199, 3 << 10, 1 << 20));
+ const stl_wrappers::KVMap::value_type blob5(KeyStr("e", 5U, kTypeBlobIndex),
+ BlobStr(19, 6789, 333));
+ const stl_wrappers::KVMap::value_type blob6(
+ KeyStr("f", 6U, kTypeBlobIndex),
+ BlobStrTTL(5, 2048, 1 << 7, 1234567890ULL));
+ auto file2 = mock::MakeMockFile({blob4, blob5, blob6});
+ AddMockFile(file2);
+
+ const stl_wrappers::KVMap::value_type expected_blob1(
+ KeyStr("a", 0U, kTypeBlobIndex), blob1.second);
+ const stl_wrappers::KVMap::value_type expected_blob2(
+ KeyStr("b", 0U, kTypeBlobIndex), blob2.second);
+ const stl_wrappers::KVMap::value_type expected_blob3(
+ KeyStr("c", 0U, kTypeBlobIndex), blob3.second);
+ const stl_wrappers::KVMap::value_type expected_blob4(
+ KeyStr("d", 0U, kTypeBlobIndex), blob4.second);
+ const stl_wrappers::KVMap::value_type expected_blob5(
+ KeyStr("e", 0U, kTypeBlobIndex), blob5.second);
+ const stl_wrappers::KVMap::value_type expected_blob6(
+ KeyStr("f", 0U, kTypeBlobIndex), blob6.second);
+ auto expected_results =
+ mock::MakeMockFile({expected_blob1, expected_blob2, expected_blob3,
+ expected_blob4, expected_blob5, expected_blob6});
+
+ SetLastSequence(6U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
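+  // Among the remaining blob references, blob2, blob3, blob4 and blob5 point
+  // to blob files 59, 138, 199 and 19 respectively, so the oldest referenced
+  // blob file is number 19.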
+ RunCompaction({files}, expected_results, std::vector<SequenceNumber>(),
+ kMaxSequenceNumber, /* output_level */ 1, /* verify */ true,
+ /* expected_oldest_blob_file_number */ 19);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as CompactionJobStats is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_picker.cc b/src/rocksdb/db/compaction/compaction_picker.cc
new file mode 100644
index 000000000..4355d4b91
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker.cc
@@ -0,0 +1,1131 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker.h"
+
+#include <cinttypes>
+#include <limits>
+#include <queue>
+#include <string>
+#include <utility>
+#include <vector>
+#include "db/column_family.h"
+#include "file/filename.h"
+#include "logging/log_buffer.h"
+#include "monitoring/statistics.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+uint64_t TotalCompensatedFileSize(const std::vector<FileMetaData*>& files) {
+ uint64_t sum = 0;
+ for (size_t i = 0; i < files.size() && files[i]; i++) {
+ sum += files[i]->compensated_file_size;
+ }
+ return sum;
+}
+} // anonymous namespace
+
+bool FindIntraL0Compaction(const std::vector<FileMetaData*>& level_files,
+ size_t min_files_to_compact,
+ uint64_t max_compact_bytes_per_del_file,
+ uint64_t max_compaction_bytes,
+ CompactionInputFiles* comp_inputs,
+ SequenceNumber earliest_mem_seqno) {
+  // Do not pick an ingested file when there is at least one unflushed
+  // memtable whose seqno range overlaps with the SST.
+ TEST_SYNC_POINT("FindIntraL0Compaction");
+ size_t start = 0;
+ for (; start < level_files.size(); start++) {
+ if (level_files[start]->being_compacted) {
+ return false;
+ }
+    // If there is no data in the memtable, the earliest sequence number
+    // would be the largest sequence number in the last memtable.
+    // Because all files are sorted in descending order by largest_seqno, we
+    // only need to check the first one.
+ if (level_files[start]->fd.largest_seqno <= earliest_mem_seqno) {
+ break;
+ }
+ }
+ if (start >= level_files.size()) {
+ return false;
+ }
+ size_t compact_bytes = static_cast<size_t>(level_files[start]->fd.file_size);
+ uint64_t compensated_compact_bytes =
+ level_files[start]->compensated_file_size;
+ size_t compact_bytes_per_del_file = port::kMaxSizet;
+ // Compaction range will be [start, limit).
+ size_t limit;
+ // Pull in files until the amount of compaction work per deleted file begins
+ // increasing or maximum total compaction size is reached.
+ size_t new_compact_bytes_per_del_file = 0;
+ for (limit = start + 1; limit < level_files.size(); ++limit) {
+ compact_bytes += static_cast<size_t>(level_files[limit]->fd.file_size);
+ compensated_compact_bytes += level_files[limit]->compensated_file_size;
+ new_compact_bytes_per_del_file = compact_bytes / (limit - start);
+ if (level_files[limit]->being_compacted ||
+ new_compact_bytes_per_del_file > compact_bytes_per_del_file ||
+ compensated_compact_bytes > max_compaction_bytes) {
+ break;
+ }
+ compact_bytes_per_del_file = new_compact_bytes_per_del_file;
+ }
+
+ if ((limit - start) >= min_files_to_compact &&
+ compact_bytes_per_del_file < max_compact_bytes_per_del_file) {
+ assert(comp_inputs != nullptr);
+ comp_inputs->level = 0;
+ for (size_t i = start; i < limit; ++i) {
+ comp_inputs->files.push_back(level_files[i]);
+ }
+ return true;
+ }
+ return false;
+}
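+
+// A minimal usage sketch for the helper above. The thresholds are purely
+// illustrative (not taken from any option defaults), and it assumes an L0
+// VersionStorageInfo* named vstorage is in scope.
+//
+//   CompactionInputFiles intra_l0_inputs;
+//   if (FindIntraL0Compaction(vstorage->LevelFiles(0),
+//                             /*min_files_to_compact=*/4,
+//                             /*max_compact_bytes_per_del_file=*/64u << 20,
+//                             /*max_compaction_bytes=*/256u << 20,
+//                             &intra_l0_inputs,
+//                             /*earliest_mem_seqno=*/kMaxSequenceNumber)) {
+//     // intra_l0_inputs.files now holds the chosen run of L0 files.
+//   }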
+
+// Determine compression type, based on user options, level of the output
+// file and whether compression is disabled.
+// If enable_compression is false, then compression is always disabled no
+// matter what the values of the other two parameters are.
+// Otherwise, the compression type is determined based on options and level.
+CompressionType GetCompressionType(const ImmutableCFOptions& ioptions,
+ const VersionStorageInfo* vstorage,
+ const MutableCFOptions& mutable_cf_options,
+ int level, int base_level,
+ const bool enable_compression) {
+ if (!enable_compression) {
+ // disable compression
+ return kNoCompression;
+ }
+
+ // If bottommost_compression is set and we are compacting to the
+ // bottommost level then we should use it.
+ if (ioptions.bottommost_compression != kDisableCompressionOption &&
+ level >= (vstorage->num_non_empty_levels() - 1)) {
+ return ioptions.bottommost_compression;
+ }
+ // If the user has specified a different compression level for each level,
+ // then pick the compression for that level.
+ if (!ioptions.compression_per_level.empty()) {
+ assert(level == 0 || level >= base_level);
+ int idx = (level == 0) ? 0 : level - base_level + 1;
+
+ const int n = static_cast<int>(ioptions.compression_per_level.size()) - 1;
+ // It is possible for level_ to be -1; in that case, we use level
+ // 0's compression. This occurs mostly in backwards compatibility
+ // situations when the builder doesn't know what level the file
+ // belongs to. Likewise, if level is beyond the end of the
+ // specified compression levels, use the last value.
+ return ioptions.compression_per_level[std::max(0, std::min(idx, n))];
+ } else {
+ return mutable_cf_options.compression;
+ }
+}
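+
+// Illustration of the per-level index mapping above (assumed configuration,
+// for exposition only): with base_level == 2 and compression_per_level ==
+// {kNoCompression, kSnappyCompression, kZSTD}, level 0 maps to entry 0, the
+// base level (level 2) maps to entry 1, level 3 maps to entry 2, and any
+// deeper level is clamped to the last entry.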
+
+CompressionOptions GetCompressionOptions(const ImmutableCFOptions& ioptions,
+ const VersionStorageInfo* vstorage,
+ int level,
+ const bool enable_compression) {
+ if (!enable_compression) {
+ return ioptions.compression_opts;
+ }
+  // If bottommost_compression is set and we are compacting to the
+  // bottommost level, then we should use the specified compression options
+  // for the bottommost_compression.
+ if (ioptions.bottommost_compression != kDisableCompressionOption &&
+ level >= (vstorage->num_non_empty_levels() - 1) &&
+ ioptions.bottommost_compression_opts.enabled) {
+ return ioptions.bottommost_compression_opts;
+ }
+ return ioptions.compression_opts;
+}
+
+CompactionPicker::CompactionPicker(const ImmutableCFOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : ioptions_(ioptions), icmp_(icmp) {}
+
+CompactionPicker::~CompactionPicker() {}
+
+// Delete this compaction from the list of running compactions.
+void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) {
+ UnregisterCompaction(c);
+ if (!status.ok()) {
+ c->ResetNextCompactionIndex();
+ }
+}
+
+void CompactionPicker::GetRange(const CompactionInputFiles& inputs,
+ InternalKey* smallest,
+ InternalKey* largest) const {
+ const int level = inputs.level;
+ assert(!inputs.empty());
+ smallest->Clear();
+ largest->Clear();
+
+ if (level == 0) {
+ for (size_t i = 0; i < inputs.size(); i++) {
+ FileMetaData* f = inputs[i];
+ if (i == 0) {
+ *smallest = f->smallest;
+ *largest = f->largest;
+ } else {
+ if (icmp_->Compare(f->smallest, *smallest) < 0) {
+ *smallest = f->smallest;
+ }
+ if (icmp_->Compare(f->largest, *largest) > 0) {
+ *largest = f->largest;
+ }
+ }
+ }
+ } else {
+ *smallest = inputs[0]->smallest;
+ *largest = inputs[inputs.size() - 1]->largest;
+ }
+}
+
+void CompactionPicker::GetRange(const CompactionInputFiles& inputs1,
+ const CompactionInputFiles& inputs2,
+ InternalKey* smallest,
+ InternalKey* largest) const {
+ assert(!inputs1.empty() || !inputs2.empty());
+ if (inputs1.empty()) {
+ GetRange(inputs2, smallest, largest);
+ } else if (inputs2.empty()) {
+ GetRange(inputs1, smallest, largest);
+ } else {
+ InternalKey smallest1, smallest2, largest1, largest2;
+ GetRange(inputs1, &smallest1, &largest1);
+ GetRange(inputs2, &smallest2, &largest2);
+ *smallest =
+ icmp_->Compare(smallest1, smallest2) < 0 ? smallest1 : smallest2;
+ *largest = icmp_->Compare(largest1, largest2) < 0 ? largest2 : largest1;
+ }
+}
+
+void CompactionPicker::GetRange(const std::vector<CompactionInputFiles>& inputs,
+ InternalKey* smallest,
+ InternalKey* largest) const {
+ InternalKey current_smallest;
+ InternalKey current_largest;
+ bool initialized = false;
+ for (const auto& in : inputs) {
+ if (in.empty()) {
+ continue;
+ }
+ GetRange(in, &current_smallest, &current_largest);
+ if (!initialized) {
+ *smallest = current_smallest;
+ *largest = current_largest;
+ initialized = true;
+ } else {
+ if (icmp_->Compare(current_smallest, *smallest) < 0) {
+ *smallest = current_smallest;
+ }
+ if (icmp_->Compare(current_largest, *largest) > 0) {
+ *largest = current_largest;
+ }
+ }
+ }
+ assert(initialized);
+}
+
+bool CompactionPicker::ExpandInputsToCleanCut(const std::string& /*cf_name*/,
+ VersionStorageInfo* vstorage,
+ CompactionInputFiles* inputs,
+ InternalKey** next_smallest) {
+  // This isn't a good compaction
+ assert(!inputs->empty());
+
+ const int level = inputs->level;
+ // GetOverlappingInputs will always do the right thing for level-0.
+ // So we don't need to do any expansion if level == 0.
+ if (level == 0) {
+ return true;
+ }
+
+ InternalKey smallest, largest;
+
+ // Keep expanding inputs until we are sure that there is a "clean cut"
+ // boundary between the files in input and the surrounding files.
+ // This will ensure that no parts of a key are lost during compaction.
+ int hint_index = -1;
+ size_t old_size;
+ do {
+ old_size = inputs->size();
+ GetRange(*inputs, &smallest, &largest);
+ inputs->clear();
+ vstorage->GetOverlappingInputs(level, &smallest, &largest, &inputs->files,
+ hint_index, &hint_index, true,
+ next_smallest);
+ } while (inputs->size() > old_size);
+
+  // We started off with non-empty inputs and the previous loop only grew
+  // them, so inputs should still be non-empty here.
+ assert(!inputs->empty());
+
+ // If, after the expansion, there are files that are already under
+ // compaction, then we must drop/cancel this compaction.
+ if (AreFilesInCompaction(inputs->files)) {
+ return false;
+ }
+ return true;
+}
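+
+// Illustrative example of the "clean cut" requirement above (hypothetical
+// files): if entries for user key "k" are split across two adjacent files at
+// this level, compacting one of those files without the other could lose
+// parts of that key, so the loop keeps widening the input range until no
+// user key straddles the boundary of the picked files.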
+
+bool CompactionPicker::RangeOverlapWithCompaction(
+ const Slice& smallest_user_key, const Slice& largest_user_key,
+ int level) const {
+ const Comparator* ucmp = icmp_->user_comparator();
+ for (Compaction* c : compactions_in_progress_) {
+ if (c->output_level() == level &&
+ ucmp->Compare(smallest_user_key, c->GetLargestUserKey()) <= 0 &&
+ ucmp->Compare(largest_user_key, c->GetSmallestUserKey()) >= 0) {
+ // Overlap
+ return true;
+ }
+ }
+ // Did not overlap with any running compaction in level `level`
+ return false;
+}
+
+bool CompactionPicker::FilesRangeOverlapWithCompaction(
+ const std::vector<CompactionInputFiles>& inputs, int level) const {
+ bool is_empty = true;
+ for (auto& in : inputs) {
+ if (!in.empty()) {
+ is_empty = false;
+ break;
+ }
+ }
+ if (is_empty) {
+ // No files in inputs
+ return false;
+ }
+
+ InternalKey smallest, largest;
+ GetRange(inputs, &smallest, &largest);
+ return RangeOverlapWithCompaction(smallest.user_key(), largest.user_key(),
+ level);
+}
+
+// Returns true if any one of the specified files is being compacted
+bool CompactionPicker::AreFilesInCompaction(
+ const std::vector<FileMetaData*>& files) {
+ for (size_t i = 0; i < files.size(); i++) {
+ if (files[i]->being_compacted) {
+ return true;
+ }
+ }
+ return false;
+}
+
+Compaction* CompactionPicker::CompactFiles(
+ const CompactionOptions& compact_options,
+ const std::vector<CompactionInputFiles>& input_files, int output_level,
+ VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options,
+ uint32_t output_path_id) {
+ assert(input_files.size());
+ // This compaction output should not overlap with a running compaction as
+ // `SanitizeCompactionInputFiles` should've checked earlier and db mutex
+ // shouldn't have been released since.
+ assert(!FilesRangeOverlapWithCompaction(input_files, output_level));
+
+ CompressionType compression_type;
+ if (compact_options.compression == kDisableCompressionOption) {
+ int base_level;
+ if (ioptions_.compaction_style == kCompactionStyleLevel) {
+ base_level = vstorage->base_level();
+ } else {
+ base_level = 1;
+ }
+ compression_type =
+ GetCompressionType(ioptions_, vstorage, mutable_cf_options,
+ output_level, base_level);
+ } else {
+ // TODO(ajkr): `CompactionOptions` offers configurable `CompressionType`
+ // without configurable `CompressionOptions`, which is inconsistent.
+ compression_type = compact_options.compression;
+ }
+ auto c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, input_files, output_level,
+ compact_options.output_file_size_limit,
+ mutable_cf_options.max_compaction_bytes, output_path_id, compression_type,
+ GetCompressionOptions(ioptions_, vstorage, output_level),
+ compact_options.max_subcompactions,
+ /* grandparents */ {}, true);
+ RegisterCompaction(c);
+ return c;
+}
+
+Status CompactionPicker::GetCompactionInputsFromFileNumbers(
+ std::vector<CompactionInputFiles>* input_files,
+ std::unordered_set<uint64_t>* input_set, const VersionStorageInfo* vstorage,
+ const CompactionOptions& /*compact_options*/) const {
+ if (input_set->size() == 0U) {
+ return Status::InvalidArgument(
+ "Compaction must include at least one file.");
+ }
+ assert(input_files);
+
+ std::vector<CompactionInputFiles> matched_input_files;
+ matched_input_files.resize(vstorage->num_levels());
+ int first_non_empty_level = -1;
+ int last_non_empty_level = -1;
+ // TODO(yhchiang): use a lazy-initialized mapping from
+ // file_number to FileMetaData in Version.
+ for (int level = 0; level < vstorage->num_levels(); ++level) {
+ for (auto file : vstorage->LevelFiles(level)) {
+ auto iter = input_set->find(file->fd.GetNumber());
+ if (iter != input_set->end()) {
+ matched_input_files[level].files.push_back(file);
+ input_set->erase(iter);
+ last_non_empty_level = level;
+ if (first_non_empty_level == -1) {
+ first_non_empty_level = level;
+ }
+ }
+ }
+ }
+
+ if (!input_set->empty()) {
+ std::string message(
+ "Cannot find matched SST files for the following file numbers:");
+ for (auto fn : *input_set) {
+ message += " ";
+ message += ToString(fn);
+ }
+ return Status::InvalidArgument(message);
+ }
+
+ for (int level = first_non_empty_level; level <= last_non_empty_level;
+ ++level) {
+ matched_input_files[level].level = level;
+ input_files->emplace_back(std::move(matched_input_files[level]));
+ }
+
+ return Status::OK();
+}
+
+// Returns true if any one of the parent files is being compacted
+bool CompactionPicker::IsRangeInCompaction(VersionStorageInfo* vstorage,
+ const InternalKey* smallest,
+ const InternalKey* largest,
+ int level, int* level_index) {
+ std::vector<FileMetaData*> inputs;
+ assert(level < NumberLevels());
+
+ vstorage->GetOverlappingInputs(level, smallest, largest, &inputs,
+ level_index ? *level_index : 0, level_index);
+ return AreFilesInCompaction(inputs);
+}
+
+// Populates the set of inputs of all other levels that overlap with the
+// start level.
+// For now, we assume all levels except the start and output levels are empty.
+// Will also attempt to expand "start level" if that doesn't expand
+// "output level" or cause "level" to include a file for compaction that has an
+// overlapping user-key with another file.
+// REQUIRES: input_level and output_level are different
+// REQUIRES: inputs->empty() == false
+// Returns false if files on the parent level are currently being compacted,
+// which means that we can't compact them.
+bool CompactionPicker::SetupOtherInputs(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, CompactionInputFiles* inputs,
+ CompactionInputFiles* output_level_inputs, int* parent_index,
+ int base_index) {
+ assert(!inputs->empty());
+ assert(output_level_inputs->empty());
+ const int input_level = inputs->level;
+ const int output_level = output_level_inputs->level;
+ if (input_level == output_level) {
+ // no possibility of conflict
+ return true;
+ }
+
+ // For now, we only support merging two levels, start level and output level.
+ // We need to assert other levels are empty.
+ for (int l = input_level + 1; l < output_level; l++) {
+ assert(vstorage->NumLevelFiles(l) == 0);
+ }
+
+ InternalKey smallest, largest;
+
+ // Get the range one last time.
+ GetRange(*inputs, &smallest, &largest);
+
+  // Populate the set of next-level files (output_level_inputs) to include
+  // in the compaction.
+ vstorage->GetOverlappingInputs(output_level, &smallest, &largest,
+ &output_level_inputs->files, *parent_index,
+ parent_index);
+ if (AreFilesInCompaction(output_level_inputs->files)) {
+ return false;
+ }
+ if (!output_level_inputs->empty()) {
+ if (!ExpandInputsToCleanCut(cf_name, vstorage, output_level_inputs)) {
+ return false;
+ }
+ }
+
+ // See if we can further grow the number of inputs in "level" without
+ // changing the number of "level+1" files we pick up. We also choose NOT
+ // to expand if this would cause "level" to include some entries for some
+ // user key, while excluding other entries for the same user key. This
+ // can happen when one user key spans multiple files.
+ if (!output_level_inputs->empty()) {
+ const uint64_t limit = mutable_cf_options.max_compaction_bytes;
+ const uint64_t output_level_inputs_size =
+ TotalCompensatedFileSize(output_level_inputs->files);
+ const uint64_t inputs_size = TotalCompensatedFileSize(inputs->files);
+ bool expand_inputs = false;
+
+ CompactionInputFiles expanded_inputs;
+ expanded_inputs.level = input_level;
+ // Get closed interval of output level
+ InternalKey all_start, all_limit;
+ GetRange(*inputs, *output_level_inputs, &all_start, &all_limit);
+ bool try_overlapping_inputs = true;
+ vstorage->GetOverlappingInputs(input_level, &all_start, &all_limit,
+ &expanded_inputs.files, base_index, nullptr);
+ uint64_t expanded_inputs_size =
+ TotalCompensatedFileSize(expanded_inputs.files);
+ if (!ExpandInputsToCleanCut(cf_name, vstorage, &expanded_inputs)) {
+ try_overlapping_inputs = false;
+ }
+ if (try_overlapping_inputs && expanded_inputs.size() > inputs->size() &&
+ output_level_inputs_size + expanded_inputs_size < limit &&
+ !AreFilesInCompaction(expanded_inputs.files)) {
+ InternalKey new_start, new_limit;
+ GetRange(expanded_inputs, &new_start, &new_limit);
+ CompactionInputFiles expanded_output_level_inputs;
+ expanded_output_level_inputs.level = output_level;
+ vstorage->GetOverlappingInputs(output_level, &new_start, &new_limit,
+ &expanded_output_level_inputs.files,
+ *parent_index, parent_index);
+ assert(!expanded_output_level_inputs.empty());
+ if (!AreFilesInCompaction(expanded_output_level_inputs.files) &&
+ ExpandInputsToCleanCut(cf_name, vstorage,
+ &expanded_output_level_inputs) &&
+ expanded_output_level_inputs.size() == output_level_inputs->size()) {
+ expand_inputs = true;
+ }
+ }
+ if (!expand_inputs) {
+ vstorage->GetCleanInputsWithinInterval(input_level, &all_start,
+ &all_limit, &expanded_inputs.files,
+ base_index, nullptr);
+ expanded_inputs_size = TotalCompensatedFileSize(expanded_inputs.files);
+ if (expanded_inputs.size() > inputs->size() &&
+ output_level_inputs_size + expanded_inputs_size < limit &&
+ !AreFilesInCompaction(expanded_inputs.files)) {
+ expand_inputs = true;
+ }
+ }
+ if (expand_inputs) {
+ ROCKS_LOG_INFO(ioptions_.info_log,
+ "[%s] Expanding@%d %" ROCKSDB_PRIszt "+%" ROCKSDB_PRIszt
+ "(%" PRIu64 "+%" PRIu64 " bytes) to %" ROCKSDB_PRIszt
+ "+%" ROCKSDB_PRIszt " (%" PRIu64 "+%" PRIu64 " bytes)\n",
+ cf_name.c_str(), input_level, inputs->size(),
+ output_level_inputs->size(), inputs_size,
+ output_level_inputs_size, expanded_inputs.size(),
+ output_level_inputs->size(), expanded_inputs_size,
+ output_level_inputs_size);
+ inputs->files = expanded_inputs.files;
+ }
+ }
+ return true;
+}
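+
+// Sketch of the expansion rule above with hypothetical key ranges: if the
+// start-level inputs cover [b..d] and the chosen output-level inputs cover
+// [a..f], another start-level file covering [e..f] may be pulled in, because
+// re-querying the output level for the widened range still yields the same
+// output-level files; a file that would drag in extra output-level files or
+// push the total past max_compaction_bytes is not added.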
+
+void CompactionPicker::GetGrandparents(
+ VersionStorageInfo* vstorage, const CompactionInputFiles& inputs,
+ const CompactionInputFiles& output_level_inputs,
+ std::vector<FileMetaData*>* grandparents) {
+ InternalKey start, limit;
+ GetRange(inputs, output_level_inputs, &start, &limit);
+ // Compute the set of grandparent files that overlap this compaction
+ // (parent == level+1; grandparent == level+2)
+ if (output_level_inputs.level + 1 < NumberLevels()) {
+ vstorage->GetOverlappingInputs(output_level_inputs.level + 1, &start,
+ &limit, grandparents);
+ }
+}
+
+Compaction* CompactionPicker::CompactRange(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, int input_level, int output_level,
+ const CompactRangeOptions& compact_range_options, const InternalKey* begin,
+ const InternalKey* end, InternalKey** compaction_end, bool* manual_conflict,
+ uint64_t max_file_num_to_ignore) {
+ // CompactionPickerFIFO has its own implementation of compact range
+ assert(ioptions_.compaction_style != kCompactionStyleFIFO);
+
+ if (input_level == ColumnFamilyData::kCompactAllLevels) {
+ assert(ioptions_.compaction_style == kCompactionStyleUniversal);
+
+ // Universal compaction with more than one level always compacts all the
+ // files together to the last level.
+ assert(vstorage->num_levels() > 1);
+    // DBImpl::CompactRange() sets the output level to be the last level
+ if (ioptions_.allow_ingest_behind) {
+ assert(output_level == vstorage->num_levels() - 2);
+ } else {
+ assert(output_level == vstorage->num_levels() - 1);
+ }
+    // DBImpl::RunManualCompaction will make the full key range for universal
+    // compaction
+ assert(begin == nullptr);
+ assert(end == nullptr);
+ *compaction_end = nullptr;
+
+ int start_level = 0;
+ for (; start_level < vstorage->num_levels() &&
+ vstorage->NumLevelFiles(start_level) == 0;
+ start_level++) {
+ }
+ if (start_level == vstorage->num_levels()) {
+ return nullptr;
+ }
+
+ if ((start_level == 0) && (!level0_compactions_in_progress_.empty())) {
+ *manual_conflict = true;
+ // Only one level 0 compaction allowed
+ return nullptr;
+ }
+
+ std::vector<CompactionInputFiles> inputs(vstorage->num_levels() -
+ start_level);
+ for (int level = start_level; level < vstorage->num_levels(); level++) {
+ inputs[level - start_level].level = level;
+ auto& files = inputs[level - start_level].files;
+ for (FileMetaData* f : vstorage->LevelFiles(level)) {
+ files.push_back(f);
+ }
+ if (AreFilesInCompaction(files)) {
+ *manual_conflict = true;
+ return nullptr;
+ }
+ }
+
+    // Two non-exclusive manual compactions could run at the same time,
+    // producing overlapping outputs in the same level.
+ if (FilesRangeOverlapWithCompaction(inputs, output_level)) {
+      // This compaction output could potentially conflict with the output
+      // of a currently running compaction, so we cannot run it.
+ *manual_conflict = true;
+ return nullptr;
+ }
+
+ Compaction* c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, std::move(inputs),
+ output_level,
+ MaxFileSizeForLevel(mutable_cf_options, output_level,
+ ioptions_.compaction_style),
+ /* max_compaction_bytes */ LLONG_MAX,
+ compact_range_options.target_path_id,
+ GetCompressionType(ioptions_, vstorage, mutable_cf_options,
+ output_level, 1),
+ GetCompressionOptions(ioptions_, vstorage, output_level),
+ compact_range_options.max_subcompactions, /* grandparents */ {},
+ /* is manual */ true);
+ RegisterCompaction(c);
+ return c;
+ }
+
+ CompactionInputFiles inputs;
+ inputs.level = input_level;
+ bool covering_the_whole_range = true;
+
+ // All files are 'overlapping' in universal style compaction.
+ // We have to compact the entire range in one shot.
+ if (ioptions_.compaction_style == kCompactionStyleUniversal) {
+ begin = nullptr;
+ end = nullptr;
+ }
+
+ vstorage->GetOverlappingInputs(input_level, begin, end, &inputs.files);
+ if (inputs.empty()) {
+ return nullptr;
+ }
+
+ if ((input_level == 0) && (!level0_compactions_in_progress_.empty())) {
+ // Only one level 0 compaction allowed
+ TEST_SYNC_POINT("CompactionPicker::CompactRange:Conflict");
+ *manual_conflict = true;
+ return nullptr;
+ }
+
+ // Avoid compacting too much in one shot in case the range is large.
+ // But we cannot do this for level-0 since level-0 files can overlap
+ // and we must not pick one file and drop another older file if the
+ // two files overlap.
+ if (input_level > 0) {
+ const uint64_t limit = mutable_cf_options.max_compaction_bytes;
+ uint64_t total = 0;
+ for (size_t i = 0; i + 1 < inputs.size(); ++i) {
+ uint64_t s = inputs[i]->compensated_file_size;
+ total += s;
+ if (total >= limit) {
+ covering_the_whole_range = false;
+ inputs.files.resize(i + 1);
+ break;
+ }
+ }
+ }
+ assert(compact_range_options.target_path_id <
+ static_cast<uint32_t>(ioptions_.cf_paths.size()));
+
+  // For bottommost-level compaction only, use max_file_num_to_ignore to
+  // filter out files that are created during the current compaction.
+ if (compact_range_options.bottommost_level_compaction ==
+ BottommostLevelCompaction::kForceOptimized &&
+ max_file_num_to_ignore != port::kMaxUint64) {
+ assert(input_level == output_level);
+    // inputs_shrunk holds a contiguous subset of input files which were all
+    // created before the current manual compaction
+ std::vector<FileMetaData*> inputs_shrunk;
+ size_t skip_input_index = inputs.size();
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ if (inputs[i]->fd.GetNumber() < max_file_num_to_ignore) {
+ inputs_shrunk.push_back(inputs[i]);
+ } else if (!inputs_shrunk.empty()) {
+        // inputs[i] was created during the current manual compaction and
+        // needs to be skipped
+ skip_input_index = i;
+ break;
+ }
+ }
+ if (inputs_shrunk.empty()) {
+ return nullptr;
+ }
+ if (inputs.size() != inputs_shrunk.size()) {
+ inputs.files.swap(inputs_shrunk);
+ }
+    // Set covering_the_whole_range to false if there is any file that needs
+    // to be compacted in the range of inputs[skip_input_index+1, inputs.size())
+ for (size_t i = skip_input_index + 1; i < inputs.size(); ++i) {
+ if (inputs[i]->fd.GetNumber() < max_file_num_to_ignore) {
+ covering_the_whole_range = false;
+ }
+ }
+ }
+
+ InternalKey key_storage;
+ InternalKey* next_smallest = &key_storage;
+ if (ExpandInputsToCleanCut(cf_name, vstorage, &inputs, &next_smallest) ==
+ false) {
+    // Manual compaction is now multi-threaded, so it can happen that
+    // ExpandInputsToCleanCut fails; we handle it higher up in
+    // RunManualCompaction.
+ *manual_conflict = true;
+ return nullptr;
+ }
+
+ if (covering_the_whole_range || !next_smallest) {
+ *compaction_end = nullptr;
+ } else {
+ **compaction_end = *next_smallest;
+ }
+
+ CompactionInputFiles output_level_inputs;
+ if (output_level == ColumnFamilyData::kCompactToBaseLevel) {
+ assert(input_level == 0);
+ output_level = vstorage->base_level();
+ assert(output_level > 0);
+ }
+ output_level_inputs.level = output_level;
+ if (input_level != output_level) {
+ int parent_index = -1;
+ if (!SetupOtherInputs(cf_name, mutable_cf_options, vstorage, &inputs,
+ &output_level_inputs, &parent_index, -1)) {
+      // Manual compaction is now multi-threaded, so it can happen that
+      // SetupOtherInputs fails; we handle it higher up in
+      // RunManualCompaction.
+ *manual_conflict = true;
+ return nullptr;
+ }
+ }
+
+ std::vector<CompactionInputFiles> compaction_inputs({inputs});
+ if (!output_level_inputs.empty()) {
+ compaction_inputs.push_back(output_level_inputs);
+ }
+ for (size_t i = 0; i < compaction_inputs.size(); i++) {
+ if (AreFilesInCompaction(compaction_inputs[i].files)) {
+ *manual_conflict = true;
+ return nullptr;
+ }
+ }
+
+  // Two non-exclusive manual compactions could run at the same time,
+  // producing overlapping outputs in the same level.
+ if (FilesRangeOverlapWithCompaction(compaction_inputs, output_level)) {
+    // This compaction output could potentially conflict with the output
+    // of a currently running compaction, so we cannot run it.
+ *manual_conflict = true;
+ return nullptr;
+ }
+
+ std::vector<FileMetaData*> grandparents;
+ GetGrandparents(vstorage, inputs, output_level_inputs, &grandparents);
+ Compaction* compaction = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, std::move(compaction_inputs),
+ output_level,
+ MaxFileSizeForLevel(mutable_cf_options, output_level,
+ ioptions_.compaction_style, vstorage->base_level(),
+ ioptions_.level_compaction_dynamic_level_bytes),
+ mutable_cf_options.max_compaction_bytes,
+ compact_range_options.target_path_id,
+ GetCompressionType(ioptions_, vstorage, mutable_cf_options, output_level,
+ vstorage->base_level()),
+ GetCompressionOptions(ioptions_, vstorage, output_level),
+ compact_range_options.max_subcompactions, std::move(grandparents),
+ /* is manual compaction */ true);
+
+ TEST_SYNC_POINT_CALLBACK("CompactionPicker::CompactRange:Return", compaction);
+ RegisterCompaction(compaction);
+
+ // Creating a compaction influences the compaction score because the score
+ // takes running compactions into account (by skipping files that are already
+ // being compacted). Since we just changed compaction score, we recalculate it
+ // here
+ vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options);
+
+ return compaction;
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+// Test whether two files have overlapping key-ranges.
+bool HaveOverlappingKeyRanges(const Comparator* c, const SstFileMetaData& a,
+ const SstFileMetaData& b) {
+ if (c->Compare(a.smallestkey, b.smallestkey) >= 0) {
+ if (c->Compare(a.smallestkey, b.largestkey) <= 0) {
+ // b.smallestkey <= a.smallestkey <= b.largestkey
+ return true;
+ }
+ } else if (c->Compare(a.largestkey, b.smallestkey) >= 0) {
+ // a.smallestkey < b.smallestkey <= a.largestkey
+ return true;
+ }
+ if (c->Compare(a.largestkey, b.largestkey) <= 0) {
+ if (c->Compare(a.largestkey, b.smallestkey) >= 0) {
+ // b.smallestkey <= a.largestkey <= b.largestkey
+ return true;
+ }
+ } else if (c->Compare(a.smallestkey, b.largestkey) <= 0) {
+ // a.smallestkey <= b.largestkey < a.largestkey
+ return true;
+ }
+ return false;
+}
+} // namespace
+
+Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels(
+ std::unordered_set<uint64_t>* input_files,
+ const ColumnFamilyMetaData& cf_meta, const int output_level) const {
+ auto& levels = cf_meta.levels;
+ auto comparator = icmp_->user_comparator();
+
+ // TODO(yhchiang): add is_adjustable to CompactionOptions
+
+ // the smallest and largest key of the current compaction input
+ std::string smallestkey;
+ std::string largestkey;
+ // a flag for initializing smallest and largest key
+ bool is_first = false;
+ const int kNotFound = -1;
+
+ // For each level, it does the following things:
+ // 1. Find the first and the last compaction input files
+ // in the current level.
+ // 2. Include all files between the first and the last
+ // compaction input files.
+ // 3. Update the compaction key-range.
+ // 4. For all remaining levels, include files that have
+ // overlapping key-range with the compaction key-range.
+ for (int l = 0; l <= output_level; ++l) {
+ auto& current_files = levels[l].files;
+ int first_included = static_cast<int>(current_files.size());
+ int last_included = kNotFound;
+
+ // identify the first and the last compaction input files
+ // in the current level.
+ for (size_t f = 0; f < current_files.size(); ++f) {
+ if (input_files->find(TableFileNameToNumber(current_files[f].name)) !=
+ input_files->end()) {
+ first_included = std::min(first_included, static_cast<int>(f));
+ last_included = std::max(last_included, static_cast<int>(f));
+ if (is_first == false) {
+ smallestkey = current_files[f].smallestkey;
+ largestkey = current_files[f].largestkey;
+ is_first = true;
+ }
+ }
+ }
+ if (last_included == kNotFound) {
+ continue;
+ }
+
+ if (l != 0) {
+ // expand the compaction input of the current level if it
+ // has an overlapping key-range with other non-compaction input
+ // files in the same level.
+ while (first_included > 0) {
+ if (comparator->Compare(current_files[first_included - 1].largestkey,
+ current_files[first_included].smallestkey) <
+ 0) {
+ break;
+ }
+ first_included--;
+ }
+
+ while (last_included < static_cast<int>(current_files.size()) - 1) {
+ if (comparator->Compare(current_files[last_included + 1].smallestkey,
+ current_files[last_included].largestkey) > 0) {
+ break;
+ }
+ last_included++;
+ }
+ } else if (output_level > 0) {
+ last_included = static_cast<int>(current_files.size() - 1);
+ }
+
+ // include all files between the first and the last compaction input files.
+ for (int f = first_included; f <= last_included; ++f) {
+ if (current_files[f].being_compacted) {
+ return Status::Aborted("Necessary compaction input file " +
+ current_files[f].name +
+ " is currently being compacted.");
+ }
+ input_files->insert(TableFileNameToNumber(current_files[f].name));
+ }
+
+ // update smallest and largest key
+ if (l == 0) {
+ for (int f = first_included; f <= last_included; ++f) {
+ if (comparator->Compare(smallestkey, current_files[f].smallestkey) >
+ 0) {
+ smallestkey = current_files[f].smallestkey;
+ }
+ if (comparator->Compare(largestkey, current_files[f].largestkey) < 0) {
+ largestkey = current_files[f].largestkey;
+ }
+ }
+ } else {
+ if (comparator->Compare(smallestkey,
+ current_files[first_included].smallestkey) > 0) {
+ smallestkey = current_files[first_included].smallestkey;
+ }
+ if (comparator->Compare(largestkey,
+ current_files[last_included].largestkey) < 0) {
+ largestkey = current_files[last_included].largestkey;
+ }
+ }
+
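+ // Wrap the accumulated key range in a dummy SstFileMetaData so the
+ // HaveOverlappingKeyRanges() helper above can be reused to test files in
+ // the lower levels against it.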
+ SstFileMetaData aggregated_file_meta;
+ aggregated_file_meta.smallestkey = smallestkey;
+ aggregated_file_meta.largestkey = largestkey;
+
+ // For all lower levels, include all overlapping files.
+ // We need to add overlapping files from the current level too because even
+ // if there are no input_files in level l, we would still need to add files
+ // which overlap with the range containing the input_files in levels 0 to l.
+ // Level 0 doesn't need to be handled this way because its files are sorted
+ // by time and not by key.
+ for (int m = std::max(l, 1); m <= output_level; ++m) {
+ for (auto& next_lv_file : levels[m].files) {
+ if (HaveOverlappingKeyRanges(comparator, aggregated_file_meta,
+ next_lv_file)) {
+ if (next_lv_file.being_compacted) {
+ return Status::Aborted(
+ "File " + next_lv_file.name +
+ " that has an overlapping key range with one of the compaction "
+ "input files is currently being compacted.");
+ }
+ input_files->insert(TableFileNameToNumber(next_lv_file.name));
+ }
+ }
+ }
+ }
+ if (RangeOverlapWithCompaction(smallestkey, largestkey, output_level)) {
+ return Status::Aborted(
+ "A running compaction is writing to the same output level in an "
+ "overlapping key range");
+ }
+ return Status::OK();
+}
+
+Status CompactionPicker::SanitizeCompactionInputFiles(
+ std::unordered_set<uint64_t>* input_files,
+ const ColumnFamilyMetaData& cf_meta, const int output_level) const {
+ assert(static_cast<int>(cf_meta.levels.size()) - 1 ==
+ cf_meta.levels[cf_meta.levels.size() - 1].level);
+ if (output_level >= static_cast<int>(cf_meta.levels.size())) {
+ return Status::InvalidArgument(
+ "Output level for column family " + cf_meta.name +
+ " must be between [0, " +
+ ToString(cf_meta.levels[cf_meta.levels.size() - 1].level) + "].");
+ }
+
+ if (output_level > MaxOutputLevel()) {
+ return Status::InvalidArgument(
+ "Exceed the maximum output level defined by "
+ "the current compaction algorithm --- " +
+ ToString(MaxOutputLevel()));
+ }
+
+ if (output_level < 0) {
+ return Status::InvalidArgument("Output level cannot be negative.");
+ }
+
+ if (input_files->size() == 0) {
+ return Status::InvalidArgument(
+ "A compaction must contain at least one file.");
+ }
+
+ Status s = SanitizeCompactionInputFilesForAllLevels(input_files, cf_meta,
+ output_level);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ // for all input files, check whether the file number matches
+ // any currently-existing files.
+ for (auto file_num : *input_files) {
+ bool found = false;
+ for (const auto& level_meta : cf_meta.levels) {
+ for (const auto& file_meta : level_meta.files) {
+ if (file_num == TableFileNameToNumber(file_meta.name)) {
+ if (file_meta.being_compacted) {
+ return Status::Aborted("Specified compaction input file " +
+ MakeTableFileName("", file_num) +
+ " is already being compacted.");
+ }
+ found = true;
+ break;
+ }
+ }
+ if (found) {
+ break;
+ }
+ }
+ if (!found) {
+ return Status::InvalidArgument(
+ "Specified compaction input file " + MakeTableFileName("", file_num) +
+ " does not exist in column family " + cf_meta.name + ".");
+ }
+ }
+
+ return Status::OK();
+}
+#endif // !ROCKSDB_LITE
+
+void CompactionPicker::RegisterCompaction(Compaction* c) {
+ if (c == nullptr) {
+ return;
+ }
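+ // For leveled compaction, a compaction writing to a non-L0 output level
+ // must not overlap the key range of any other registered compaction that
+ // writes to the same level.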
+ assert(ioptions_.compaction_style != kCompactionStyleLevel ||
+ c->output_level() == 0 ||
+ !FilesRangeOverlapWithCompaction(*c->inputs(), c->output_level()));
+ if (c->start_level() == 0 ||
+ ioptions_.compaction_style == kCompactionStyleUniversal) {
+ level0_compactions_in_progress_.insert(c);
+ }
+ compactions_in_progress_.insert(c);
+}
+
+void CompactionPicker::UnregisterCompaction(Compaction* c) {
+ if (c == nullptr) {
+ return;
+ }
+ if (c->start_level() == 0 ||
+ ioptions_.compaction_style == kCompactionStyleUniversal) {
+ level0_compactions_in_progress_.erase(c);
+ }
+ compactions_in_progress_.erase(c);
+}
+
+void CompactionPicker::PickFilesMarkedForCompaction(
+ const std::string& cf_name, VersionStorageInfo* vstorage, int* start_level,
+ int* output_level, CompactionInputFiles* start_level_inputs) {
+ if (vstorage->FilesMarkedForCompaction().empty()) {
+ return;
+ }
+
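+ // The continuation lambda tries to turn a single marked file into a clean,
+ // conflict-free set of inputs; it is tried on one randomly chosen marked
+ // file first and then, if that fails, on every marked file in order.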
+ auto continuation = [&, cf_name](std::pair<int, FileMetaData*> level_file) {
+ // If it's being compacted it has nothing to do here.
+ // If this assert() fails that means that some function marked some
+ // files as being_compacted, but didn't call ComputeCompactionScore()
+ assert(!level_file.second->being_compacted);
+ *start_level = level_file.first;
+ *output_level =
+ (*start_level == 0) ? vstorage->base_level() : *start_level + 1;
+
+ if (*start_level == 0 && !level0_compactions_in_progress()->empty()) {
+ return false;
+ }
+
+ start_level_inputs->files = {level_file.second};
+ start_level_inputs->level = *start_level;
+ return ExpandInputsToCleanCut(cf_name, vstorage, start_level_inputs);
+ };
+
+ // take a chance on a random file first
+ Random64 rnd(/* seed */ reinterpret_cast<uint64_t>(vstorage));
+ size_t random_file_index = static_cast<size_t>(rnd.Uniform(
+ static_cast<uint64_t>(vstorage->FilesMarkedForCompaction().size())));
+
+ if (continuation(vstorage->FilesMarkedForCompaction()[random_file_index])) {
+ // found the compaction!
+ return;
+ }
+
+ for (auto& level_file : vstorage->FilesMarkedForCompaction()) {
+ if (continuation(level_file)) {
+ // found the compaction!
+ return;
+ }
+ }
+ start_level_inputs->files.clear();
+}
+
+bool CompactionPicker::GetOverlappingL0Files(
+ VersionStorageInfo* vstorage, CompactionInputFiles* start_level_inputs,
+ int output_level, int* parent_index) {
+ // Two level 0 compactions won't run at the same time, so we don't need to
+ // worry about files on level 0 being compacted.
+ assert(level0_compactions_in_progress()->empty());
+ InternalKey smallest, largest;
+ GetRange(*start_level_inputs, &smallest, &largest);
+ // Note that the next call will discard the file we placed in
+ // c->inputs_[0] earlier and replace it with an overlapping set
+ // which will include the picked file.
+ start_level_inputs->files.clear();
+ vstorage->GetOverlappingInputs(0, &smallest, &largest,
+ &(start_level_inputs->files));
+
+ // If we include more L0 files in the same compaction run it can
+ // cause the 'smallest' and 'largest' key to get extended to a
+ // larger range. So, re-invoke GetRange to get the new key range
+ GetRange(*start_level_inputs, &smallest, &largest);
+ if (IsRangeInCompaction(vstorage, &smallest, &largest, output_level,
+ parent_index)) {
+ return false;
+ }
+ assert(!start_level_inputs->files.empty());
+
+ return true;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker.h b/src/rocksdb/db/compaction/compaction_picker.h
new file mode 100644
index 000000000..36d570e68
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker.h
@@ -0,0 +1,313 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <memory>
+#include <set>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/compaction/compaction.h"
+#include "db/version_set.h"
+#include "options/cf_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This file contains the abstract class CompactionPicker, its two
+// sub-classes LevelCompactionPicker and NullCompactionPicker, as
+// well as some helper functions used by them.
+
+class LogBuffer;
+class Compaction;
+class VersionStorageInfo;
+struct CompactionInputFiles;
+
+// An abstract class to pick compactions from an existing LSM-tree.
+//
+// Each compaction style inherits from the class and implements the
+// interface to form automatic compactions. If NeedCompaction() is true,
+// then call PickCompaction() to find what files need to be compacted
+// and where to put the output files.
+//
+// Non-virtual functions CompactRange() and CompactFiles() are used to
+// pick files to compact based on users' DB::CompactRange() and
+// DB::CompactFiles() requests, respectively. There is little
+// compaction style specific logic for them.
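+//
+// A minimal usage sketch (illustrative only; the variable names are
+// placeholders and the caller is assumed to hold the DB mutex, as required
+// by ReleaseCompactionFiles()):
+//
+//   LevelCompactionPicker picker(ioptions, &internal_comparator);
+//   if (picker.NeedsCompaction(vstorage)) {
+//     Compaction* c = picker.PickCompaction(cf_name, mutable_cf_options,
+//                                           vstorage, log_buffer);
+//     if (c != nullptr) {
+//       // ... run the compaction job, then ...
+//       picker.ReleaseCompactionFiles(c, Status::OK());
+//       delete c;  // caller owns the returned Compaction object
+//     }
+//   }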
+class CompactionPicker {
+ public:
+ CompactionPicker(const ImmutableCFOptions& ioptions,
+ const InternalKeyComparator* icmp);
+ virtual ~CompactionPicker();
+
+ // Pick level and inputs for a new compaction.
+ // Returns nullptr if there is no compaction to be done.
+ // Otherwise returns a pointer to a heap-allocated object that
+ // describes the compaction. Caller should delete the result.
+ virtual Compaction* PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, LogBuffer* log_buffer,
+ SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) = 0;
+
+ // Return a compaction object for compacting the range [begin,end] in
+ // the specified level. Returns nullptr if there is nothing in that
+ // level that overlaps the specified range. Caller should delete
+ // the result.
+ //
+ // The returned Compaction might not include the whole requested range.
+ // In that case, compaction_end will be set to the next key that needs
+ // compacting. In case the compaction will compact the whole range,
+ // compaction_end will be set to nullptr.
+ // The client is responsible for compaction_end storage -- when called,
+ // *compaction_end should point to a valid InternalKey!
+ virtual Compaction* CompactRange(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, int input_level, int output_level,
+ const CompactRangeOptions& compact_range_options,
+ const InternalKey* begin, const InternalKey* end,
+ InternalKey** compaction_end, bool* manual_conflict,
+ uint64_t max_file_num_to_ignore);
+
+ // The maximum allowed output level. Default value is NumberLevels() - 1.
+ virtual int MaxOutputLevel() const { return NumberLevels() - 1; }
+
+ virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const = 0;
+
+// Sanitize the input set of compaction input files.
+// When the input parameters do not describe a valid compaction, the
+// function will try to fix the input_files by adding necessary
+// files. If it's not possible to convert an invalid input_files
+// into a valid one by adding more files, the function will return a
+// non-ok status with a specific reason.
+#ifndef ROCKSDB_LITE
+ Status SanitizeCompactionInputFiles(std::unordered_set<uint64_t>* input_files,
+ const ColumnFamilyMetaData& cf_meta,
+ const int output_level) const;
+#endif // ROCKSDB_LITE
+
+ // Free up the files that participated in a compaction
+ //
+ // Requirement: DB mutex held
+ void ReleaseCompactionFiles(Compaction* c, Status status);
+
+ // Returns true if any one of the specified files is being compacted
+ bool AreFilesInCompaction(const std::vector<FileMetaData*>& files);
+
+ // Takes a list of CompactionInputFiles and returns a (manual) Compaction
+ // object.
+ //
+ // Caller must provide a set of input files that has been passed through
+ // `SanitizeCompactionInputFiles` earlier. The lock should not be released
+ // between that call and this one.
+ Compaction* CompactFiles(const CompactionOptions& compact_options,
+ const std::vector<CompactionInputFiles>& input_files,
+ int output_level, VersionStorageInfo* vstorage,
+ const MutableCFOptions& mutable_cf_options,
+ uint32_t output_path_id);
+
+ // Converts a set of compaction input file numbers into
+ // a list of CompactionInputFiles.
+ Status GetCompactionInputsFromFileNumbers(
+ std::vector<CompactionInputFiles>* input_files,
+ std::unordered_set<uint64_t>* input_set,
+ const VersionStorageInfo* vstorage,
+ const CompactionOptions& compact_options) const;
+
+ // Is there currently a compaction involving level 0 taking place
+ bool IsLevel0CompactionInProgress() const {
+ return !level0_compactions_in_progress_.empty();
+ }
+
+ // Return true if the passed key range overlap with a compaction output
+ // that is currently running.
+ bool RangeOverlapWithCompaction(const Slice& smallest_user_key,
+ const Slice& largest_user_key,
+ int level) const;
+
+ // Stores the minimal range that covers all entries in inputs in
+ // *smallest, *largest.
+ // REQUIRES: inputs is not empty
+ void GetRange(const CompactionInputFiles& inputs, InternalKey* smallest,
+ InternalKey* largest) const;
+
+ // Stores the minimal range that covers all entries in inputs1 and inputs2
+ // in *smallest, *largest.
+ // REQUIRES: inputs is not empty
+ void GetRange(const CompactionInputFiles& inputs1,
+ const CompactionInputFiles& inputs2, InternalKey* smallest,
+ InternalKey* largest) const;
+
+ // Stores the minimal range that covers all entries in inputs
+ // in *smallest, *largest.
+ // REQUIRES: inputs is not empty (at least one entry has one file)
+ void GetRange(const std::vector<CompactionInputFiles>& inputs,
+ InternalKey* smallest, InternalKey* largest) const;
+
+ int NumberLevels() const { return ioptions_.num_levels; }
+
+ // Add more files to the inputs on "level" to make sure that
+ // no newer version of a key is compacted to "level+1" while leaving an older
+ // version in "level". Otherwise, any Get() will search "level" first,
+ // and will likely return an old/stale value for the key, since it always
+ // searches in increasing order of level to find the value. This could
+ // also scramble the order of merge operands. This function should be
+ // called any time a new Compaction is created, and its inputs_[0] are
+ // populated.
+ //
+ // Will return false if it is impossible to apply this compaction.
+ bool ExpandInputsToCleanCut(const std::string& cf_name,
+ VersionStorageInfo* vstorage,
+ CompactionInputFiles* inputs,
+ InternalKey** next_smallest = nullptr);
+
+ // Returns true if any one of the parent files is being compacted
+ bool IsRangeInCompaction(VersionStorageInfo* vstorage,
+ const InternalKey* smallest,
+ const InternalKey* largest, int level, int* index);
+
+ // Returns true if the key range that `inputs` files cover overlap with the
+ // key range of a currently running compaction.
+ bool FilesRangeOverlapWithCompaction(
+ const std::vector<CompactionInputFiles>& inputs, int level) const;
+
+ bool SetupOtherInputs(const std::string& cf_name,
+ const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage,
+ CompactionInputFiles* inputs,
+ CompactionInputFiles* output_level_inputs,
+ int* parent_index, int base_index);
+
+ void GetGrandparents(VersionStorageInfo* vstorage,
+ const CompactionInputFiles& inputs,
+ const CompactionInputFiles& output_level_inputs,
+ std::vector<FileMetaData*>* grandparents);
+
+ void PickFilesMarkedForCompaction(const std::string& cf_name,
+ VersionStorageInfo* vstorage,
+ int* start_level, int* output_level,
+ CompactionInputFiles* start_level_inputs);
+
+ bool GetOverlappingL0Files(VersionStorageInfo* vstorage,
+ CompactionInputFiles* start_level_inputs,
+ int output_level, int* parent_index);
+
+ // Register this compaction in the set of running compactions
+ void RegisterCompaction(Compaction* c);
+
+ // Remove this compaction from the set of running compactions
+ void UnregisterCompaction(Compaction* c);
+
+ std::set<Compaction*>* level0_compactions_in_progress() {
+ return &level0_compactions_in_progress_;
+ }
+ std::unordered_set<Compaction*>* compactions_in_progress() {
+ return &compactions_in_progress_;
+ }
+
+ protected:
+ const ImmutableCFOptions& ioptions_;
+
+// A helper function to SanitizeCompactionInputFiles() that
+// sanitizes "input_files" by adding necessary files.
+#ifndef ROCKSDB_LITE
+ virtual Status SanitizeCompactionInputFilesForAllLevels(
+ std::unordered_set<uint64_t>* input_files,
+ const ColumnFamilyMetaData& cf_meta, const int output_level) const;
+#endif // ROCKSDB_LITE
+
+ // Keeps track of all compactions that are running on Level0.
+ // Protected by DB mutex
+ std::set<Compaction*> level0_compactions_in_progress_;
+
+ // Keeps track of all compactions that are running.
+ // Protected by DB mutex
+ std::unordered_set<Compaction*> compactions_in_progress_;
+
+ const InternalKeyComparator* const icmp_;
+};
+
+#ifndef ROCKSDB_LITE
+// A dummy compaction picker that never triggers any automatic
+// compaction.
+class NullCompactionPicker : public CompactionPicker {
+ public:
+ NullCompactionPicker(const ImmutableCFOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : CompactionPicker(ioptions, icmp) {}
+ virtual ~NullCompactionPicker() {}
+
+ // Always return "nullptr"
+ Compaction* PickCompaction(
+ const std::string& /*cf_name*/,
+ const MutableCFOptions& /*mutable_cf_options*/,
+ VersionStorageInfo* /*vstorage*/, LogBuffer* /* log_buffer */,
+ SequenceNumber /* earliest_memtable_seqno */) override {
+ return nullptr;
+ }
+
+ // Always return "nullptr"
+ Compaction* CompactRange(const std::string& /*cf_name*/,
+ const MutableCFOptions& /*mutable_cf_options*/,
+ VersionStorageInfo* /*vstorage*/,
+ int /*input_level*/, int /*output_level*/,
+ const CompactRangeOptions& /*compact_range_options*/,
+ const InternalKey* /*begin*/,
+ const InternalKey* /*end*/,
+ InternalKey** /*compaction_end*/,
+ bool* /*manual_conflict*/,
+ uint64_t /*max_file_num_to_ignore*/) override {
+ return nullptr;
+ }
+
+ // Always returns false.
+ virtual bool NeedsCompaction(
+ const VersionStorageInfo* /*vstorage*/) const override {
+ return false;
+ }
+};
+#endif // !ROCKSDB_LITE
+
+// Attempts to find an intra L0 compaction conforming to the given parameters.
+//
+// @param level_files Metadata for L0 files.
+// @param min_files_to_compact Minimum number of files required to
+// do the compaction.
+// @param max_compact_bytes_per_del_file Maximum average size in bytes per
+// file that is going to get deleted by
+// the compaction.
+// @param max_compaction_bytes Maximum total size in bytes (in terms
+// of compensated file size) for files
+// to be compacted.
+// @param [out] comp_inputs If a compaction was found, will be
+// initialized with corresponding input
+// files. Cannot be nullptr.
+//
+// @return true iff compaction was found.
+bool FindIntraL0Compaction(
+ const std::vector<FileMetaData*>& level_files, size_t min_files_to_compact,
+ uint64_t max_compact_bytes_per_del_file, uint64_t max_compaction_bytes,
+ CompactionInputFiles* comp_inputs,
+ SequenceNumber earliest_mem_seqno = kMaxSequenceNumber);
+
+CompressionType GetCompressionType(const ImmutableCFOptions& ioptions,
+ const VersionStorageInfo* vstorage,
+ const MutableCFOptions& mutable_cf_options,
+ int level, int base_level,
+ const bool enable_compression = true);
+
+CompressionOptions GetCompressionOptions(const ImmutableCFOptions& ioptions,
+ const VersionStorageInfo* vstorage,
+ int level,
+ const bool enable_compression = true);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker_fifo.cc b/src/rocksdb/db/compaction/compaction_picker_fifo.cc
new file mode 100644
index 000000000..b148aadc2
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_fifo.cc
@@ -0,0 +1,244 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker_fifo.h"
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+#include <string>
+#include <vector>
+#include "db/column_family.h"
+#include "logging/log_buffer.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+uint64_t GetTotalFilesSize(const std::vector<FileMetaData*>& files) {
+ uint64_t total_size = 0;
+ for (const auto& f : files) {
+ total_size += f->fd.file_size;
+ }
+ return total_size;
+}
+} // anonymous namespace
+
+bool FIFOCompactionPicker::NeedsCompaction(
+ const VersionStorageInfo* vstorage) const {
+ const int kLevel0 = 0;
+ return vstorage->CompactionScore(kLevel0) >= 1;
+}
+
+Compaction* FIFOCompactionPicker::PickTTLCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, LogBuffer* log_buffer) {
+ assert(mutable_cf_options.ttl > 0);
+
+ const int kLevel0 = 0;
+ const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0);
+ uint64_t total_size = GetTotalFilesSize(level_files);
+
+ int64_t _current_time;
+ auto status = ioptions_.env->GetCurrentTime(&_current_time);
+ if (!status.ok()) {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] FIFO compaction: Couldn't get current time: %s. "
+ "Not doing compactions based on TTL. ",
+ cf_name.c_str(), status.ToString().c_str());
+ return nullptr;
+ }
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+
+ if (!level0_compactions_in_progress_.empty()) {
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] FIFO compaction: Already executing compaction. No need "
+ "to run parallel compactions since compactions are very fast",
+ cf_name.c_str());
+ return nullptr;
+ }
+
+ std::vector<CompactionInputFiles> inputs;
+ inputs.emplace_back();
+ inputs[0].level = 0;
+
+ // avoid underflow
+ if (current_time > mutable_cf_options.ttl) {
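+ // Walk the L0 files starting from the oldest one and collect every file
+ // whose creation time falls outside the TTL window; stop at the first file
+ // that is still within the TTL or whose creation time is unknown (0).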
+ for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) {
+ FileMetaData* f = *ritr;
+ if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) {
+ uint64_t creation_time =
+ f->fd.table_reader->GetTableProperties()->creation_time;
+ if (creation_time == 0 ||
+ creation_time >= (current_time - mutable_cf_options.ttl)) {
+ break;
+ }
+ total_size -= f->compensated_file_size;
+ inputs[0].files.push_back(f);
+ }
+ }
+ }
+
+ // Return nullptr and proceed to size-based FIFO compaction if:
+ // 1. there are no files older than ttl, OR
+ // 2. there are a few files older than ttl, but deleting them will not bring
+ // the total size below the max_table_files_size threshold.
+ if (inputs[0].files.empty() ||
+ total_size >
+ mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+ return nullptr;
+ }
+
+ for (const auto& f : inputs[0].files) {
+ uint64_t creation_time = 0;
+ if (f && f->fd.table_reader && f->fd.table_reader->GetTableProperties()) {
+ creation_time = f->fd.table_reader->GetTableProperties()->creation_time;
+ }
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] FIFO compaction: picking file %" PRIu64
+ " with creation time %" PRIu64 " for deletion",
+ cf_name.c_str(), f->fd.GetNumber(), creation_time);
+ }
+
+ Compaction* c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, std::move(inputs), 0, 0, 0, 0,
+ kNoCompression, ioptions_.compression_opts, /* max_subcompactions */ 0,
+ {}, /* is manual */ false, vstorage->CompactionScore(0),
+ /* is deletion compaction */ true, CompactionReason::kFIFOTtl);
+ return c;
+}
+
+Compaction* FIFOCompactionPicker::PickSizeCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, LogBuffer* log_buffer) {
+ const int kLevel0 = 0;
+ const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0);
+ uint64_t total_size = GetTotalFilesSize(level_files);
+
+ if (total_size <=
+ mutable_cf_options.compaction_options_fifo.max_table_files_size ||
+ level_files.size() == 0) {
+ // total size not exceeded
+ if (mutable_cf_options.compaction_options_fifo.allow_compaction &&
+ level_files.size() > 0) {
+ CompactionInputFiles comp_inputs;
+ // try to prevent the same files from being compacted multiple times, which
+ // could produce large files that may never TTL-expire. Achieve this by
+ // disallowing compactions with files larger than the memtable (inflate its
+ // size by 10% to account for uncompressed L0 files that may be slightly
+ // larger than the memtable size limit).
+ size_t max_compact_bytes_per_del_file =
+ static_cast<size_t>(MultiplyCheckOverflow(
+ static_cast<uint64_t>(mutable_cf_options.write_buffer_size),
+ 1.1));
+ if (FindIntraL0Compaction(
+ level_files,
+ mutable_cf_options
+ .level0_file_num_compaction_trigger /* min_files_to_compact */
+ ,
+ max_compact_bytes_per_del_file,
+ mutable_cf_options.max_compaction_bytes, &comp_inputs)) {
+ Compaction* c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, {comp_inputs}, 0,
+ 16 * 1024 * 1024 /* output file size limit */,
+ 0 /* max compaction bytes, not applicable */,
+ 0 /* output path ID */, mutable_cf_options.compression,
+ ioptions_.compression_opts, 0 /* max_subcompactions */, {},
+ /* is manual */ false, vstorage->CompactionScore(0),
+ /* is deletion compaction */ false,
+ CompactionReason::kFIFOReduceNumFiles);
+ return c;
+ }
+ }
+
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] FIFO compaction: nothing to do. Total size %" PRIu64
+ ", max size %" PRIu64 "\n",
+ cf_name.c_str(), total_size,
+ mutable_cf_options.compaction_options_fifo.max_table_files_size);
+ return nullptr;
+ }
+
+ if (!level0_compactions_in_progress_.empty()) {
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] FIFO compaction: Already executing compaction. No need "
+ "to run parallel compactions since compactions are very fast",
+ cf_name.c_str());
+ return nullptr;
+ }
+
+ std::vector<CompactionInputFiles> inputs;
+ inputs.emplace_back();
+ inputs[0].level = 0;
+
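+ // Walk the L0 files starting from the oldest one and mark them for deletion
+ // until the remaining total size drops back under max_table_files_size.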
+ for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) {
+ auto f = *ritr;
+ total_size -= f->compensated_file_size;
+ inputs[0].files.push_back(f);
+ char tmp_fsize[16];
+ AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize));
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] FIFO compaction: picking file %" PRIu64
+ " with size %s for deletion",
+ cf_name.c_str(), f->fd.GetNumber(), tmp_fsize);
+ if (total_size <=
+ mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+ break;
+ }
+ }
+
+ Compaction* c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, std::move(inputs), 0, 0, 0, 0,
+ kNoCompression, ioptions_.compression_opts, /* max_subcompactions */ 0,
+ {}, /* is manual */ false, vstorage->CompactionScore(0),
+ /* is deletion compaction */ true, CompactionReason::kFIFOMaxSize);
+ return c;
+}
+
+Compaction* FIFOCompactionPicker::PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, LogBuffer* log_buffer,
+ SequenceNumber /*earliest_memtable_seqno*/) {
+ assert(vstorage->num_levels() == 1);
+
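+ // Try TTL-based deletion first; if TTL is disabled or it finds nothing to
+ // do, fall back to size-based FIFO compaction.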
+ Compaction* c = nullptr;
+ if (mutable_cf_options.ttl > 0) {
+ c = PickTTLCompaction(cf_name, mutable_cf_options, vstorage, log_buffer);
+ }
+ if (c == nullptr) {
+ c = PickSizeCompaction(cf_name, mutable_cf_options, vstorage, log_buffer);
+ }
+ RegisterCompaction(c);
+ return c;
+}
+
+Compaction* FIFOCompactionPicker::CompactRange(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, int input_level, int output_level,
+ const CompactRangeOptions& /*compact_range_options*/,
+ const InternalKey* /*begin*/, const InternalKey* /*end*/,
+ InternalKey** compaction_end, bool* /*manual_conflict*/,
+ uint64_t /*max_file_num_to_ignore*/) {
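+ // FIFO has a single level, so a manual CompactRange is served by a regular
+ // FIFO pick over the whole level; *compaction_end is cleared to indicate
+ // that the entire requested range is covered.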
+#ifdef NDEBUG
+ (void)input_level;
+ (void)output_level;
+#endif
+ assert(input_level == 0);
+ assert(output_level == 0);
+ *compaction_end = nullptr;
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.info_log);
+ Compaction* c =
+ PickCompaction(cf_name, mutable_cf_options, vstorage, &log_buffer);
+ log_buffer.FlushBufferToLog();
+ return c;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_picker_fifo.h b/src/rocksdb/db/compaction/compaction_picker_fifo.h
new file mode 100644
index 000000000..eb786e5ac
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_fifo.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "db/compaction/compaction_picker.h"
+
+namespace ROCKSDB_NAMESPACE {
+class FIFOCompactionPicker : public CompactionPicker {
+ public:
+ FIFOCompactionPicker(const ImmutableCFOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : CompactionPicker(ioptions, icmp) {}
+
+ virtual Compaction* PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* version, LogBuffer* log_buffer,
+ SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override;
+
+ virtual Compaction* CompactRange(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, int input_level, int output_level,
+ const CompactRangeOptions& compact_range_options,
+ const InternalKey* begin, const InternalKey* end,
+ InternalKey** compaction_end, bool* manual_conflict,
+ uint64_t max_file_num_to_ignore) override;
+
+ // The maximum allowed output level. Always returns 0.
+ virtual int MaxOutputLevel() const override { return 0; }
+
+ virtual bool NeedsCompaction(
+ const VersionStorageInfo* vstorage) const override;
+
+ private:
+ Compaction* PickTTLCompaction(const std::string& cf_name,
+ const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* version,
+ LogBuffer* log_buffer);
+
+ Compaction* PickSizeCompaction(const std::string& cf_name,
+ const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* version,
+ LogBuffer* log_buffer);
+};
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_picker_level.cc b/src/rocksdb/db/compaction/compaction_picker_level.cc
new file mode 100644
index 000000000..012edd080
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_level.cc
@@ -0,0 +1,558 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/compaction/compaction_picker_level.h"
+#include "logging/log_buffer.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool LevelCompactionPicker::NeedsCompaction(
+ const VersionStorageInfo* vstorage) const {
+ if (!vstorage->ExpiredTtlFiles().empty()) {
+ return true;
+ }
+ if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) {
+ return true;
+ }
+ if (!vstorage->BottommostFilesMarkedForCompaction().empty()) {
+ return true;
+ }
+ if (!vstorage->FilesMarkedForCompaction().empty()) {
+ return true;
+ }
+ for (int i = 0; i <= vstorage->MaxInputLevel(); i++) {
+ if (vstorage->CompactionScore(i) >= 1) {
+ return true;
+ }
+ }
+ return false;
+}
+
+namespace {
+// A class to build a leveled compaction step-by-step.
+class LevelCompactionBuilder {
+ public:
+ LevelCompactionBuilder(const std::string& cf_name,
+ VersionStorageInfo* vstorage,
+ SequenceNumber earliest_mem_seqno,
+ CompactionPicker* compaction_picker,
+ LogBuffer* log_buffer,
+ const MutableCFOptions& mutable_cf_options,
+ const ImmutableCFOptions& ioptions)
+ : cf_name_(cf_name),
+ vstorage_(vstorage),
+ earliest_mem_seqno_(earliest_mem_seqno),
+ compaction_picker_(compaction_picker),
+ log_buffer_(log_buffer),
+ mutable_cf_options_(mutable_cf_options),
+ ioptions_(ioptions) {}
+
+ // Pick and return a compaction.
+ Compaction* PickCompaction();
+
+ // Pick the initial files to compact to the next level (or to compact
+ // together in an intra-L0 compaction).
+ void SetupInitialFiles();
+
+ // If the initial files are from L0 level, pick other L0
+ // files if needed.
+ bool SetupOtherL0FilesIfNeeded();
+
+ // Based on the initial files, set up the other files that need to be
+ // compacted in this compaction.
+ bool SetupOtherInputsIfNeeded();
+
+ Compaction* GetCompaction();
+
+ // For the specified level, pick a file that we want to compact.
+ // Returns false if there is no file to compact.
+ // If it returns true, inputs->files.size() will be exactly one.
+ // If level is 0 and there is already a compaction on that level, this
+ // function will return false.
+ bool PickFileToCompact();
+
+ // For L0->L0, picks the longest span of files that aren't currently
+ // undergoing compaction for which work-per-deleted-file decreases. The span
+ // always starts from the newest L0 file.
+ //
+ // Intra-L0 compaction is independent of all other files, so it can be
+ // performed even when L0->base_level compactions are blocked.
+ //
+ // Returns true if `inputs` is populated with a span of files to be compacted;
+ // otherwise, returns false.
+ bool PickIntraL0Compaction();
+
+ void PickExpiredTtlFiles();
+
+ void PickFilesMarkedForPeriodicCompaction();
+
+ const std::string& cf_name_;
+ VersionStorageInfo* vstorage_;
+ SequenceNumber earliest_mem_seqno_;
+ CompactionPicker* compaction_picker_;
+ LogBuffer* log_buffer_;
+ int start_level_ = -1;
+ int output_level_ = -1;
+ int parent_index_ = -1;
+ int base_index_ = -1;
+ double start_level_score_ = 0;
+ bool is_manual_ = false;
+ CompactionInputFiles start_level_inputs_;
+ std::vector<CompactionInputFiles> compaction_inputs_;
+ CompactionInputFiles output_level_inputs_;
+ std::vector<FileMetaData*> grandparents_;
+ CompactionReason compaction_reason_ = CompactionReason::kUnknown;
+
+ const MutableCFOptions& mutable_cf_options_;
+ const ImmutableCFOptions& ioptions_;
+ // Pick a path ID to place a newly generated file, given its level
+ static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ int level);
+
+ static const int kMinFilesForIntraL0Compaction = 4;
+};
+
+void LevelCompactionBuilder::PickExpiredTtlFiles() {
+ if (vstorage_->ExpiredTtlFiles().empty()) {
+ return;
+ }
+
+ auto continuation = [&](std::pair<int, FileMetaData*> level_file) {
+ // If it's being compacted it has nothing to do here.
+ // If this assert() fails that means that some function marked some
+ // files as being_compacted, but didn't call ComputeCompactionScore()
+ assert(!level_file.second->being_compacted);
+ start_level_ = level_file.first;
+ output_level_ =
+ (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1;
+
+ if ((start_level_ == vstorage_->num_non_empty_levels() - 1) ||
+ (start_level_ == 0 &&
+ !compaction_picker_->level0_compactions_in_progress()->empty())) {
+ return false;
+ }
+
+ start_level_inputs_.files = {level_file.second};
+ start_level_inputs_.level = start_level_;
+ return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &start_level_inputs_);
+ };
+
+ for (auto& level_file : vstorage_->ExpiredTtlFiles()) {
+ if (continuation(level_file)) {
+ // found the compaction!
+ return;
+ }
+ }
+
+ start_level_inputs_.files.clear();
+}
+
+void LevelCompactionBuilder::PickFilesMarkedForPeriodicCompaction() {
+ if (vstorage_->FilesMarkedForPeriodicCompaction().empty()) {
+ return;
+ }
+
+ auto continuation = [&](std::pair<int, FileMetaData*> level_file) {
+ // If it's being compacted it has nothing to do here.
+ // If this assert() fails that means that some function marked some
+ // files as being_compacted, but didn't call ComputeCompactionScore()
+ assert(!level_file.second->being_compacted);
+ output_level_ = start_level_ = level_file.first;
+
+ if (start_level_ == 0 &&
+ !compaction_picker_->level0_compactions_in_progress()->empty()) {
+ return false;
+ }
+
+ start_level_inputs_.files = {level_file.second};
+ start_level_inputs_.level = start_level_;
+ return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &start_level_inputs_);
+ };
+
+ for (auto& level_file : vstorage_->FilesMarkedForPeriodicCompaction()) {
+ if (continuation(level_file)) {
+ // found the compaction!
+ return;
+ }
+ }
+
+ start_level_inputs_.files.clear();
+}
+
+void LevelCompactionBuilder::SetupInitialFiles() {
+ // Find the compactions by size on all levels.
+ bool skipped_l0_to_base = false;
+ for (int i = 0; i < compaction_picker_->NumberLevels() - 1; i++) {
+ start_level_score_ = vstorage_->CompactionScore(i);
+ start_level_ = vstorage_->CompactionScoreLevel(i);
+ assert(i == 0 || start_level_score_ <= vstorage_->CompactionScore(i - 1));
+ if (start_level_score_ >= 1) {
+ if (skipped_l0_to_base && start_level_ == vstorage_->base_level()) {
+ // If L0->base_level compaction is pending, don't schedule further
+ // compaction from base level. Otherwise L0->base_level compaction
+ // may starve.
+ continue;
+ }
+ output_level_ =
+ (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1;
+ if (PickFileToCompact()) {
+ // found the compaction!
+ if (start_level_ == 0) {
+ // L0 score = `num L0 files` / `level0_file_num_compaction_trigger`
+ compaction_reason_ = CompactionReason::kLevelL0FilesNum;
+ } else {
+ // L1+ score = `Level files size` / `MaxBytesForLevel`
+ compaction_reason_ = CompactionReason::kLevelMaxLevelSize;
+ }
+ break;
+ } else {
+ // didn't find the compaction, clear the inputs
+ start_level_inputs_.clear();
+ if (start_level_ == 0) {
+ skipped_l0_to_base = true;
+ // L0->base_level may be blocked due to ongoing L0->base_level
+ // compactions. It may also be blocked by an ongoing compaction from
+ // base_level downwards.
+ //
+ // In these cases, to reduce L0 file count and thus reduce likelihood
+ // of write stalls, we can attempt compacting a span of files within
+ // L0.
+ if (PickIntraL0Compaction()) {
+ output_level_ = 0;
+ compaction_reason_ = CompactionReason::kLevelL0FilesNum;
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ // if we didn't find a compaction, check if there are any files marked for
+ // compaction
+ if (start_level_inputs_.empty()) {
+ parent_index_ = base_index_ = -1;
+
+ compaction_picker_->PickFilesMarkedForCompaction(
+ cf_name_, vstorage_, &start_level_, &output_level_,
+ &start_level_inputs_);
+ if (!start_level_inputs_.empty()) {
+ is_manual_ = true;
+ compaction_reason_ = CompactionReason::kFilesMarkedForCompaction;
+ return;
+ }
+ }
+
+ // Bottommost Files Compaction on deleting tombstones
+ if (start_level_inputs_.empty()) {
+ size_t i;
+ for (i = 0; i < vstorage_->BottommostFilesMarkedForCompaction().size();
+ ++i) {
+ auto& level_and_file = vstorage_->BottommostFilesMarkedForCompaction()[i];
+ assert(!level_and_file.second->being_compacted);
+ start_level_inputs_.level = output_level_ = start_level_ =
+ level_and_file.first;
+ start_level_inputs_.files = {level_and_file.second};
+ if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &start_level_inputs_)) {
+ break;
+ }
+ }
+ if (i == vstorage_->BottommostFilesMarkedForCompaction().size()) {
+ start_level_inputs_.clear();
+ } else {
+ assert(!start_level_inputs_.empty());
+ compaction_reason_ = CompactionReason::kBottommostFiles;
+ return;
+ }
+ }
+
+ // TTL Compaction
+ if (start_level_inputs_.empty()) {
+ PickExpiredTtlFiles();
+ if (!start_level_inputs_.empty()) {
+ compaction_reason_ = CompactionReason::kTtl;
+ return;
+ }
+ }
+
+ // Periodic Compaction
+ if (start_level_inputs_.empty()) {
+ PickFilesMarkedForPeriodicCompaction();
+ if (!start_level_inputs_.empty()) {
+ compaction_reason_ = CompactionReason::kPeriodicCompaction;
+ return;
+ }
+ }
+}
+
+bool LevelCompactionBuilder::SetupOtherL0FilesIfNeeded() {
+ if (start_level_ == 0 && output_level_ != 0) {
+ return compaction_picker_->GetOverlappingL0Files(
+ vstorage_, &start_level_inputs_, output_level_, &parent_index_);
+ }
+ return true;
+}
+
+bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() {
+ // Set up input files from the output level. For output to L0, we only
+ // compact spans of files that do not interact with any pending compactions,
+ // so we don't need to consider other levels.
+ if (output_level_ != 0) {
+ output_level_inputs_.level = output_level_;
+ if (!compaction_picker_->SetupOtherInputs(
+ cf_name_, mutable_cf_options_, vstorage_, &start_level_inputs_,
+ &output_level_inputs_, &parent_index_, base_index_)) {
+ return false;
+ }
+
+ compaction_inputs_.push_back(start_level_inputs_);
+ if (!output_level_inputs_.empty()) {
+ compaction_inputs_.push_back(output_level_inputs_);
+ }
+
+ // In some edge cases we could pick a compaction that will be compacting
+ // a key range that overlap with another running compaction, and both
+ // of them have the same output level. This could happen if
+ // (1) we are running a non-exclusive manual compaction
+ // (2) AddFile ingest a new file into the LSM tree
+ // We need to disallow this from happening.
+ if (compaction_picker_->FilesRangeOverlapWithCompaction(compaction_inputs_,
+ output_level_)) {
+ // This compaction output could potentially conflict with the output
+ // of a currently running compaction, we cannot run it.
+ return false;
+ }
+ compaction_picker_->GetGrandparents(vstorage_, start_level_inputs_,
+ output_level_inputs_, &grandparents_);
+ } else {
+ compaction_inputs_.push_back(start_level_inputs_);
+ }
+ return true;
+}
+
+Compaction* LevelCompactionBuilder::PickCompaction() {
+ // Pick up the first file to start compaction. It may have been extended
+ // to a clean cut.
+ SetupInitialFiles();
+ if (start_level_inputs_.empty()) {
+ return nullptr;
+ }
+ assert(start_level_ >= 0 && output_level_ >= 0);
+
+ // If it is a L0 -> base level compaction, we need to set up other L0
+ // files if needed.
+ if (!SetupOtherL0FilesIfNeeded()) {
+ return nullptr;
+ }
+
+ // Pick files in the output level and expand more files in the start level
+ // if needed.
+ if (!SetupOtherInputsIfNeeded()) {
+ return nullptr;
+ }
+
+ // Form a compaction object containing the files we picked.
+ Compaction* c = GetCompaction();
+
+ TEST_SYNC_POINT_CALLBACK("LevelCompactionPicker::PickCompaction:Return", c);
+
+ return c;
+}
+
+Compaction* LevelCompactionBuilder::GetCompaction() {
+ auto c = new Compaction(
+ vstorage_, ioptions_, mutable_cf_options_, std::move(compaction_inputs_),
+ output_level_,
+ MaxFileSizeForLevel(mutable_cf_options_, output_level_,
+ ioptions_.compaction_style, vstorage_->base_level(),
+ ioptions_.level_compaction_dynamic_level_bytes),
+ mutable_cf_options_.max_compaction_bytes,
+ GetPathId(ioptions_, mutable_cf_options_, output_level_),
+ GetCompressionType(ioptions_, vstorage_, mutable_cf_options_,
+ output_level_, vstorage_->base_level()),
+ GetCompressionOptions(ioptions_, vstorage_, output_level_),
+ /* max_subcompactions */ 0, std::move(grandparents_), is_manual_,
+ start_level_score_, false /* deletion_compaction */, compaction_reason_);
+
+ // If it's level 0 compaction, make sure we don't execute any other level 0
+ // compactions in parallel
+ compaction_picker_->RegisterCompaction(c);
+
+ // Creating a compaction influences the compaction score because the score
+ // takes running compactions into account (by skipping files that are already
+ // being compacted). Since we just changed the compaction score, we
+ // recalculate it here.
+ vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+ return c;
+}
+
+/*
+ * Find the optimal path to place a file.
+ * Given a level, finds the path where that level will fit once the sizes of
+ * all smaller levels have been charged against the preceding paths' targets.
+ */
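+// Worked example (hypothetical numbers, and assuming
+// max_bytes_for_level_multiplier = 10 with the default per-level additional
+// multiplier of 1): with two cf_paths whose target sizes are {10GB, 100GB}
+// and max_bytes_for_level_base = 1GB, L0 and L1 (~1GB each) are charged
+// against path 0; L2 (~10GB) no longer fits in the ~8GB left on path 0, so
+// levels 2 and above fall back to the last path (path 1).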
+uint32_t LevelCompactionBuilder::GetPathId(
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options, int level) {
+ uint32_t p = 0;
+ assert(!ioptions.cf_paths.empty());
+
+ // size remaining in the most recent path
+ uint64_t current_path_size = ioptions.cf_paths[0].target_size;
+
+ uint64_t level_size;
+ int cur_level = 0;
+
+ // max_bytes_for_level_base denotes L1 size.
+ // We estimate L0 size to be the same as L1.
+ level_size = mutable_cf_options.max_bytes_for_level_base;
+
+ // Last path is the fallback
+ while (p < ioptions.cf_paths.size() - 1) {
+ if (level_size <= current_path_size) {
+ if (cur_level == level) {
+ // Does desired level fit in this path?
+ return p;
+ } else {
+ current_path_size -= level_size;
+ if (cur_level > 0) {
+ if (ioptions.level_compaction_dynamic_level_bytes) {
+ // Currently, level_compaction_dynamic_level_bytes is ignored when
+ // multiple db paths are specified. https://github.com/facebook/
+ // rocksdb/blob/master/db/column_family.cc.
+ // Still, adding this check to avoid accidentally using
+ // max_bytes_for_level_multiplier_additional
+ level_size = static_cast<uint64_t>(
+ level_size * mutable_cf_options.max_bytes_for_level_multiplier);
+ } else {
+ level_size = static_cast<uint64_t>(
+ level_size * mutable_cf_options.max_bytes_for_level_multiplier *
+ mutable_cf_options.MaxBytesMultiplerAdditional(cur_level));
+ }
+ }
+ cur_level++;
+ continue;
+ }
+ }
+ p++;
+ current_path_size = ioptions.cf_paths[p].target_size;
+ }
+ return p;
+}
+
+bool LevelCompactionBuilder::PickFileToCompact() {
+ // level 0 files are overlapping. So we cannot pick more
+ // than one concurrent compaction at this level. This
+ // could be made better by looking at key-ranges that are
+ // being compacted at level 0.
+ if (start_level_ == 0 &&
+ !compaction_picker_->level0_compactions_in_progress()->empty()) {
+ TEST_SYNC_POINT("LevelCompactionPicker::PickCompactionBySize:0");
+ return false;
+ }
+
+ start_level_inputs_.clear();
+
+ assert(start_level_ >= 0);
+
+ // Pick the largest file in this level that is not already
+ // being compacted
+ const std::vector<int>& file_size =
+ vstorage_->FilesByCompactionPri(start_level_);
+ const std::vector<FileMetaData*>& level_files =
+ vstorage_->LevelFiles(start_level_);
+
+ unsigned int cmp_idx;
+ for (cmp_idx = vstorage_->NextCompactionIndex(start_level_);
+ cmp_idx < file_size.size(); cmp_idx++) {
+ int index = file_size[cmp_idx];
+ auto* f = level_files[index];
+
+ // do not pick a file to compact if it is being compacted
+ // from n-1 level.
+ if (f->being_compacted) {
+ continue;
+ }
+
+ start_level_inputs_.files.push_back(f);
+ start_level_inputs_.level = start_level_;
+ if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &start_level_inputs_) ||
+ compaction_picker_->FilesRangeOverlapWithCompaction(
+ {start_level_inputs_}, output_level_)) {
+ // A locked (pending compaction) input-level file was pulled in due to
+ // user-key overlap.
+ start_level_inputs_.clear();
+ continue;
+ }
+
+ // Now that the input level is fully expanded, we check whether any output
+ // files are locked due to a pending compaction.
+ //
+ // Note we rely on ExpandInputsToCleanCut() to tell us whether any output-
+ // level files are locked, not just the extra ones pulled in for user-key
+ // overlap.
+ InternalKey smallest, largest;
+ compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest);
+ CompactionInputFiles output_level_inputs;
+ output_level_inputs.level = output_level_;
+ vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
+ &output_level_inputs.files);
+ if (!output_level_inputs.empty() &&
+ !compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &output_level_inputs)) {
+ start_level_inputs_.clear();
+ continue;
+ }
+ base_index_ = index;
+ break;
+ }
+
+ // store where to start the iteration in the next call to PickCompaction
+ vstorage_->SetNextCompactionIndex(start_level_, cmp_idx);
+
+ return start_level_inputs_.size() > 0;
+}
+
+bool LevelCompactionBuilder::PickIntraL0Compaction() {
+ start_level_inputs_.clear();
+ const std::vector<FileMetaData*>& level_files =
+ vstorage_->LevelFiles(0 /* level */);
+ if (level_files.size() <
+ static_cast<size_t>(
+ mutable_cf_options_.level0_file_num_compaction_trigger + 2) ||
+ level_files[0]->being_compacted) {
+ // If L0 isn't accumulating many files beyond the regular trigger, don't
+ // resort to L0->L0 compaction yet.
+ return false;
+ }
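+ // port::kMaxUint64 disables the per-deleted-file size limit here, leaving
+ // only the total max_compaction_bytes bound on the picked span.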
+ return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction,
+ port::kMaxUint64,
+ mutable_cf_options_.max_compaction_bytes,
+ &start_level_inputs_, earliest_mem_seqno_);
+}
+} // namespace
+
+Compaction* LevelCompactionPicker::PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, LogBuffer* log_buffer,
+ SequenceNumber earliest_mem_seqno) {
+ LevelCompactionBuilder builder(cf_name, vstorage, earliest_mem_seqno, this,
+ log_buffer, mutable_cf_options, ioptions_);
+ return builder.PickCompaction();
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker_level.h b/src/rocksdb/db/compaction/compaction_picker_level.h
new file mode 100644
index 000000000..b82070e14
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_level.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/compaction/compaction_picker.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Picking compactions for leveled compaction. See wiki page
+// https://github.com/facebook/rocksdb/wiki/Leveled-Compaction
+// for description of Leveled compaction.
+class LevelCompactionPicker : public CompactionPicker {
+ public:
+ LevelCompactionPicker(const ImmutableCFOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : CompactionPicker(ioptions, icmp) {}
+ virtual Compaction* PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, LogBuffer* log_buffer,
+ SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override;
+
+ virtual bool NeedsCompaction(
+ const VersionStorageInfo* vstorage) const override;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker_test.cc b/src/rocksdb/db/compaction/compaction_picker_test.cc
new file mode 100644
index 000000000..278bdb06a
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_test.cc
@@ -0,0 +1,1741 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+
+#include <limits>
+#include <string>
+#include <utility>
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_picker_fifo.h"
+#include "db/compaction/compaction_picker_level.h"
+#include "db/compaction/compaction_picker_universal.h"
+
+#include "logging/logging.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CountingLogger : public Logger {
+ public:
+ using Logger::Logv;
+ void Logv(const char* /*format*/, va_list /*ap*/) override { log_count++; }
+ size_t log_count;
+};
+
+class CompactionPickerTest : public testing::Test {
+ public:
+ const Comparator* ucmp_;
+ InternalKeyComparator icmp_;
+ Options options_;
+ ImmutableCFOptions ioptions_;
+ MutableCFOptions mutable_cf_options_;
+ LevelCompactionPicker level_compaction_picker;
+ std::string cf_name_;
+ CountingLogger logger_;
+ LogBuffer log_buffer_;
+ uint32_t file_num_;
+ CompactionOptionsFIFO fifo_options_;
+ std::unique_ptr<VersionStorageInfo> vstorage_;
+ std::vector<std::unique_ptr<FileMetaData>> files_;
+ // does not own FileMetaData
+ std::unordered_map<uint32_t, std::pair<FileMetaData*, int>> file_map_;
+ // input files to compaction process.
+ std::vector<CompactionInputFiles> input_files_;
+ int compaction_level_start_;
+
+ CompactionPickerTest()
+ : ucmp_(BytewiseComparator()),
+ icmp_(ucmp_),
+ ioptions_(options_),
+ mutable_cf_options_(options_),
+ level_compaction_picker(ioptions_, &icmp_),
+ cf_name_("dummy"),
+ log_buffer_(InfoLogLevel::INFO_LEVEL, &logger_),
+ file_num_(1),
+ vstorage_(nullptr) {
+ mutable_cf_options_.ttl = 0;
+ mutable_cf_options_.periodic_compaction_seconds = 0;
+ // ioptions_.compaction_pri = kMinOverlappingRatio has its own set of
+ // tests to cover.
+ ioptions_.compaction_pri = kByCompensatedSize;
+ fifo_options_.max_table_files_size = 1;
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+ ioptions_.cf_paths.emplace_back("dummy",
+ std::numeric_limits<uint64_t>::max());
+ }
+
+ ~CompactionPickerTest() override {}
+
+ void NewVersionStorage(int num_levels, CompactionStyle style) {
+ DeleteVersionStorage();
+ options_.num_levels = num_levels;
+ vstorage_.reset(new VersionStorageInfo(&icmp_, ucmp_, options_.num_levels,
+ style, nullptr, false));
+ vstorage_->CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ }
+
+ void DeleteVersionStorage() {
+ vstorage_.reset();
+ files_.clear();
+ file_map_.clear();
+ input_files_.clear();
+ }
+
+ void Add(int level, uint32_t file_number, const char* smallest,
+ const char* largest, uint64_t file_size = 1, uint32_t path_id = 0,
+ SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100,
+ size_t compensated_file_size = 0) {
+ assert(level < vstorage_->num_levels());
+ FileMetaData* f = new FileMetaData(
+ file_number, path_id, file_size,
+ InternalKey(smallest, smallest_seq, kTypeValue),
+ InternalKey(largest, largest_seq, kTypeValue), smallest_seq,
+ largest_seq, /* marked_for_compact */ false, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+ f->compensated_file_size =
+ (compensated_file_size != 0) ? compensated_file_size : file_size;
+ vstorage_->AddFile(level, f);
+ files_.emplace_back(f);
+ file_map_.insert({file_number, {f, level}});
+ }
+
+ void SetCompactionInputFilesLevels(int level_count, int start_level) {
+ input_files_.resize(level_count);
+ for (int i = 0; i < level_count; ++i) {
+ input_files_[i].level = start_level + i;
+ }
+ compaction_level_start_ = start_level;
+ }
+
+ void AddToCompactionFiles(uint32_t file_number) {
+ auto iter = file_map_.find(file_number);
+ assert(iter != file_map_.end());
+ int level = iter->second.second;
+ assert(level < vstorage_->num_levels());
+ input_files_[level - compaction_level_start_].files.emplace_back(
+ iter->second.first);
+ }
+
+ void UpdateVersionStorageInfo() {
+ vstorage_->CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ vstorage_->UpdateFilesByCompactionPri(ioptions_.compaction_pri);
+ vstorage_->UpdateNumNonEmptyLevels();
+ vstorage_->GenerateFileIndexer();
+ vstorage_->GenerateLevelFilesBrief();
+ vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+ vstorage_->GenerateLevel0NonOverlapping();
+ vstorage_->ComputeFilesMarkedForCompaction();
+ vstorage_->SetFinalized();
+ }
+};
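+
+// The tests below follow a common pattern: build a synthetic LSM shape with
+// NewVersionStorage() and Add(), finalize it with UpdateVersionStorageInfo(),
+// then ask a picker for a compaction and inspect the chosen inputs, e.g.
+//
+//   NewVersionStorage(6, kCompactionStyleLevel);
+//   Add(0, 1U, "150", "200");  // level, file number, smallest/largest key
+//   UpdateVersionStorageInfo();
+//   std::unique_ptr<Compaction> c(level_compaction_picker.PickCompaction(
+//       cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));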
+
+TEST_F(CompactionPickerTest, Empty) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ UpdateVersionStorageInfo();
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, Single) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ Add(0, 1U, "p", "q");
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, Level0Trigger) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, Level1Trigger) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(1, 66U, "150", "200", 1000000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, Level1Trigger2) {
+ mutable_cf_options_.target_file_size_base = 10000000000;
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(1, 66U, "150", "200", 1000000001U);
+ Add(1, 88U, "201", "300", 1000000000U);
+ Add(2, 6U, "150", "179", 1000000000U);
+ Add(2, 7U, "180", "220", 1000000000U);
+ Add(2, 8U, "221", "300", 1000000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
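+ // 1073741824 is 1 GiB; the expectation suggests the preallocation hint is
+ // capped there even though the ~3GB of inputs plus slack would be larger
+ // (an inference from this expectation, not a documented limit).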
+ ASSERT_EQ(uint64_t{1073741824}, compaction->OutputFilePreallocationSize());
+}
+
+TEST_F(CompactionPickerTest, LevelMaxScore) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.target_file_size_base = 10000000;
+ mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024;
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+ Add(0, 1U, "150", "200", 1000000U);
+ // Level 1 score 1.2
+ Add(1, 66U, "150", "200", 6000000U);
+ Add(1, 88U, "201", "300", 6000000U);
+ // Level 2 score 1.8. File 7 is the largest. Should be picked
+ Add(2, 6U, "150", "179", 60000000U);
+ Add(2, 7U, "180", "220", 60000001U);
+ Add(2, 8U, "221", "300", 60000000U);
+ // Level 3 score slightly larger than 1
+ Add(3, 26U, "150", "170", 260000000U);
+ Add(3, 27U, "171", "179", 260000000U);
+ Add(3, 28U, "191", "220", 260000000U);
+ Add(3, 29U, "221", "300", 260000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
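+ // Expected hint: target_file_size_base plus ~10% slack (10MB + 1MB), since
+ // the 60MB input is larger than the 10MB target output file size (a reading
+ // of the expression below, not a separate guarantee).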
+ ASSERT_EQ(mutable_cf_options_.target_file_size_base +
+ mutable_cf_options_.target_file_size_base / 10,
+ compaction->OutputFilePreallocationSize());
+}
+
+TEST_F(CompactionPickerTest, NeedsCompactionLevel) {
+ const int kLevels = 6;
+ const int kFileCount = 20;
+
+ for (int level = 0; level < kLevels - 1; ++level) {
+ NewVersionStorage(kLevels, kCompactionStyleLevel);
+ uint64_t file_size = vstorage_->MaxBytesForLevel(level) * 2 / kFileCount;
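+ // Each file is sized so that kFileCount of them total twice this level's
+ // byte target, so the level becomes eligible for compaction partway
+ // through the inner loop below (for L0 the file-count trigger applies
+ // instead).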
+ for (int file_count = 1; file_count <= kFileCount; ++file_count) {
+ // start a brand new version in each test.
+ NewVersionStorage(kLevels, kCompactionStyleLevel);
+ for (int i = 0; i < file_count; ++i) {
+ Add(level, i, ToString((i + 100) * 1000).c_str(),
+ ToString((i + 100) * 1000 + 999).c_str(),
+ file_size, 0, i * 100, i * 100 + 99);
+ }
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(vstorage_->CompactionScoreLevel(0), level);
+ ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()),
+ vstorage_->CompactionScore(0) >= 1);
+ // release the version storage
+ DeleteVersionStorage();
+ }
+ }
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels()));
+ ASSERT_EQ(num_levels - 1, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic2) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+ Add(num_levels - 1, 3U, "200", "250", 300U);
+
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(vstorage_->base_level(), num_levels - 2);
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels()));
+ ASSERT_EQ(num_levels - 2, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic3) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+ Add(num_levels - 1, 3U, "200", "250", 300U);
+ Add(num_levels - 1, 4U, "300", "350", 3000U);
+
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(vstorage_->base_level(), num_levels - 3);
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels()));
+ ASSERT_EQ(num_levels - 3, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic4) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+ Add(num_levels - 1, 3U, "200", "250", 300U);
+ Add(num_levels - 1, 4U, "300", "350", 3000U);
+ Add(num_levels - 3, 5U, "150", "180", 3U);
+ Add(num_levels - 3, 6U, "181", "300", 3U);
+ Add(num_levels - 3, 7U, "400", "450", 3U);
+
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(vstorage_->base_level(), num_levels - 3);
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(num_levels - 3, compaction->level(1));
+ ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 1)->fd.GetNumber());
+ ASSERT_EQ(2, static_cast<int>(compaction->num_input_levels()));
+ ASSERT_EQ(num_levels - 3, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, LevelTriggerDynamic4) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(num_levels - 1, 3U, "200", "250", 300U);
+ Add(num_levels - 1, 4U, "300", "350", 3000U);
+ Add(num_levels - 1, 4U, "400", "450", 3U);
+ Add(num_levels - 2, 5U, "150", "180", 300U);
+ Add(num_levels - 2, 6U, "181", "350", 500U);
+ Add(num_levels - 2, 7U, "400", "450", 200U);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(0, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(num_levels - 1, compaction->output_level());
+}
+
+// Universal and FIFO Compactions are not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(CompactionPickerTest, NeedsCompactionUniversal) {
+ NewVersionStorage(1, kCompactionStyleUniversal);
+ UniversalCompactionPicker universal_compaction_picker(
+ ioptions_, &icmp_);
+ UpdateVersionStorageInfo();
+ // must return false when there are no files.
+ ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+ false);
+
+ // verify the trigger given different numbers of L0 files.
+ for (int i = 1;
+ i <= mutable_cf_options_.level0_file_num_compaction_trigger * 2; ++i) {
+ NewVersionStorage(1, kCompactionStyleUniversal);
+ Add(0, i, ToString((i + 100) * 1000).c_str(),
+ ToString((i + 100) * 1000 + 999).c_str(), 1000000, 0, i * 100,
+ i * 100 + 99);
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()),
+ vstorage_->CompactionScore(0) >= 1);
+ }
+}
+
+TEST_F(CompactionPickerTest, CompactionUniversalIngestBehindReservedLevel) {
+ const uint64_t kFileSize = 100000;
+ NewVersionStorage(1, kCompactionStyleUniversal);
+ ioptions_.allow_ingest_behind = true;
+ ioptions_.num_levels = 3;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ UpdateVersionStorageInfo();
+ // must return false when there are no files.
+ ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+ false);
+
+ NewVersionStorage(3, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+ Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+ Add(1, 5U, "100", "151", kFileSize, 0, 200, 251);
+ Add(1, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(2, 6U, "120", "200", kFileSize, 0, 20, 100);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+
+ // output level should be the one above the bottom-most
+ ASSERT_EQ(1, compaction->output_level());
+}
+// Tests if the files can be trivially moved in multi level
+// universal compaction when allow_trivial_move option is set
+// In this test the input files overlap, so they cannot
+// be trivially moved.
+
+TEST_F(CompactionPickerTest, CannotTrivialMoveUniversal) {
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.compaction_options_universal.allow_trivial_move = true;
+ NewVersionStorage(1, kCompactionStyleUniversal);
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ UpdateVersionStorageInfo();
+ // must return false when there are no files.
+ ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+ false);
+
+ NewVersionStorage(3, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+ Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+ Add(1, 5U, "100", "151", kFileSize, 0, 200, 251);
+ Add(1, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(2, 6U, "120", "200", kFileSize, 0, 20, 100);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+
+ ASSERT_TRUE(!compaction->is_trivial_move());
+}
+// Tests if the files can be trivially moved in multi level
+// universal compaction when allow_trivial_move option is set
+// In this test the input files don't overlap, so they should
+// be trivially moved.
+TEST_F(CompactionPickerTest, AllowsTrivialMoveUniversal) {
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.compaction_options_universal.allow_trivial_move = true;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(3, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+ Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+ Add(1, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(2, 3U, "301", "350", kFileSize, 0, 101, 150);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+
+ ASSERT_TRUE(compaction->is_trivial_move());
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction1) {
+ // The case where universal periodic compaction can be picked
+ // with some newer files being compacted.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+ Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+ Add(3, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(4, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(4, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+ file_map_[2].first->being_compacted = true;
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(4, file_map_[3].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(4, compaction->output_level());
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction2) {
+ // The case where universal periodic compaction does not
+ // pick up the only level to compact if it doesn't cover
+ // any file marked for periodic compaction.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(3, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(4, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(4, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+ file_map_[5].first->being_compacted = true;
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[1].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+
+ ASSERT_FALSE(compaction);
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction3) {
+ // The case where universal periodic compaction does not
+ // pick up only the last sorted run (an L0 file here) if it isn't
+ // marked for periodic compaction.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(0, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+ file_map_[5].first->being_compacted = true;
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[1].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+
+ ASSERT_FALSE(compaction);
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction4) {
+ // The case where universal periodic compaction couldn't form
+ // a compaction that includes any file marked for periodic compaction.
+ // Right now we form the compaction anyway if there is more than one
+ // sorted run. The case is included here just to validate that it
+ // doesn't crash.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(2, 2U, "010", "080", kFileSize, 0, 200, 251);
+ Add(3, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(4, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(4, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+ file_map_[2].first->being_compacted = true;
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[2].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(!compaction ||
+ compaction->start_level() != compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction5) {
+ // Test single L0 file periodic compaction triggering.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 6U, "150", "200", kFileSize, 0, 500, 550);
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[6].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(4, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction6) {
+ // Test single sorted run non-L0 periodic compaction
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(4, 5U, "150", "200", kFileSize, 0, 500, 550);
+ Add(4, 6U, "350", "400", kFileSize, 0, 500, 550);
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(4, file_map_[6].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(4, compaction->start_level());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(4, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, NeedsCompactionFIFO) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const int kFileCount =
+ mutable_cf_options_.level0_file_num_compaction_trigger * 3;
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * kFileCount / 2;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+ UpdateVersionStorageInfo();
+ // must return false when there are no files.
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), false);
+
+ // verify whether compaction is needed based on the current
+ // size of L0 files.
+ uint64_t current_size = 0;
+ for (int i = 1; i <= kFileCount; ++i) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ Add(0, i, ToString((i + 100) * 1000).c_str(),
+ ToString((i + 100) * 1000 + 999).c_str(),
+ kFileSize, 0, i * 100, i * 100 + 99);
+ current_size += kFileSize;
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()),
+ vstorage_->CompactionScore(0) >= 1);
+ }
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.target_file_size_base = 100000000000;
+ mutable_cf_options_.target_file_size_multiplier = 10;
+ mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024;
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+
+ Add(2, 6U, "150", "179", 50000000U);
+ Add(2, 7U, "180", "220", 50000000U);
+ Add(2, 8U, "321", "400", 50000000U); // File not overlapping
+ Add(2, 9U, "721", "800", 50000000U);
+
+ Add(3, 26U, "150", "170", 260000000U);
+ Add(3, 27U, "171", "179", 260000000U);
+ Add(3, 28U, "191", "220", 260000000U);
+ Add(3, 29U, "221", "300", 260000000U);
+ Add(3, 30U, "750", "900", 260000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ // Pick file 8 because it overlaps with 0 files on level 3.
+ ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber());
+ // Compaction input size * 1.1
+ ASSERT_GE(uint64_t{55000000}, compaction->OutputFilePreallocationSize());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping2) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.target_file_size_base = 10000000;
+ mutable_cf_options_.target_file_size_multiplier = 10;
+ mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024;
+
+ Add(2, 6U, "150", "175",
+ 60000000U); // Overlaps with file 26, 27, total size 521M
+ Add(2, 7U, "176", "200", 60000000U); // Overlaps with file 27, 28, total size
+ // 520M, the smallest overlapping
+ Add(2, 8U, "201", "300",
+ 60000000U); // Overlaps with file 28, 29, total size 521M
+
+ Add(3, 26U, "100", "110", 261000000U);
+ Add(3, 26U, "150", "170", 261000000U);
+ Add(3, 27U, "171", "179", 260000000U);
+ Add(3, 28U, "191", "220", 260000000U);
+ Add(3, 29U, "221", "300", 261000000U);
+ Add(3, 30U, "321", "400", 261000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ // Picking file 7 because its overlapping ratio is the smallest.
+ ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping3) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.max_bytes_for_level_base = 10000000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+ // Files 7 and 8 each overlap a single file in the output level, but file 8
+ // itself is larger, so its overlapping ratio is smaller and it is picked.
+ Add(2, 6U, "150", "167", 60000000U); // Overlaps with file 26, 27
+ Add(2, 7U, "168", "169", 60000000U); // Overlaps with file 27
+ Add(2, 8U, "201", "300", 61000000U); // Overlaps with file 28, but the file
+ // itself is larger. Should be picked.
+
+ Add(3, 26U, "160", "165", 260000000U);
+ Add(3, 27U, "166", "170", 260000000U);
+ Add(3, 28U, "180", "400", 260000000U);
+ Add(3, 29U, "401", "500", 260000000U);
+ UpdateVersionStorageInfo();
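+ // With kMinOverlappingRatio the candidate whose ratio of overlapping
+ // output-level bytes to its own size is smallest wins: roughly 520M/60M
+ // for file 6, 260M/60M for file 7 and 260M/61M for file 8, so file 8 is
+ // expected here (a reading of the setup above).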
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ // Picking file 8 because its overlapping ratio is the smallest.
+ ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping4) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.max_bytes_for_level_base = 10000000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+ // File 6 overlaps with files 26 and 27. Its compensated size is boosted,
+ // which lowers its overlapping ratio, so it will be picked.
+ Add(2, 6U, "150", "167", 60000000U, 0, 100, 100, 180000000U);
+ Add(2, 7U, "168", "169", 60000000U); // Overlaps with file 27
+ Add(2, 8U, "201", "300", 61000000U); // Overlaps with file 28
+
+ Add(3, 26U, "160", "165", 60000000U);
+ // Boosted file size in output level is not considered.
+ Add(3, 27U, "166", "170", 60000000U, 0, 100, 100, 260000000U);
+ Add(3, 28U, "180", "400", 60000000U);
+ Add(3, 29U, "401", "500", 60000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ // Picking file 6 because its boosted compensated size gives it the
+ // smallest overlapping ratio.
+ ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+// This test exhibits the bug where we don't properly reset parent_index in
+// PickCompaction()
+TEST_F(CompactionPickerTest, ParentIndexResetBug) {
+ int num_levels = ioptions_.num_levels;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200"); // <- marked for compaction
+ Add(1, 3U, "400", "500", 600); // <- this one needs compacting
+ Add(2, 4U, "150", "200");
+ Add(2, 5U, "201", "210");
+ Add(2, 6U, "300", "310");
+ Add(2, 7U, "400", "500"); // <- being compacted
+
+ vstorage_->LevelFiles(2)[3]->being_compacted = true;
+ vstorage_->LevelFiles(0)[0]->marked_for_compaction = true;
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+}
+
+// This test checks ExpandWhileOverlapping() by having overlapping user keys
+// ranges (with different sequence numbers) in the input files.
+TEST_F(CompactionPickerTest, OverlappingUserKeys) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kByCompensatedSize;
+
+ Add(1, 1U, "100", "150", 1U);
+ // Overlapping user keys
+ Add(1, 2U, "200", "400", 1U);
+ Add(1, 3U, "400", "500", 1000000000U, 0, 0);
+ Add(2, 4U, "600", "700", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys2) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // Overlapping user keys on same level and output level
+ Add(1, 1U, "200", "400", 1000000000U);
+ Add(1, 2U, "400", "500", 1U, 0, 0);
+ Add(2, 3U, "000", "100", 1U);
+ Add(2, 4U, "100", "600", 1U, 0, 0);
+ Add(2, 5U, "600", "700", 1U, 0, 0);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(3U, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(4U, compaction->input(1, 1)->fd.GetNumber());
+ ASSERT_EQ(5U, compaction->input(1, 2)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys3) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // Chain of overlapping user key ranges (forces ExpandWhileOverlapping() to
+ // expand multiple times)
+ Add(1, 1U, "100", "150", 1U);
+ Add(1, 2U, "150", "200", 1U, 0, 0);
+ Add(1, 3U, "200", "250", 1000000000U, 0, 0);
+ Add(1, 4U, "250", "300", 1U, 0, 0);
+ Add(1, 5U, "300", "350", 1U, 0, 0);
+ // Output level overlaps with the beginning and the end of the chain
+ Add(2, 6U, "050", "100", 1U);
+ Add(2, 7U, "350", "400", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(5U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(4U, compaction->input(0, 3)->fd.GetNumber());
+ ASSERT_EQ(5U, compaction->input(0, 4)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys4) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_bytes_for_level_base = 1000000;
+
+ Add(1, 1U, "100", "150", 1U);
+ Add(1, 2U, "150", "199", 1U, 0, 0);
+ Add(1, 3U, "200", "250", 1100000U, 0, 0);
+ Add(1, 4U, "251", "300", 1U, 0, 0);
+ Add(1, 5U, "300", "350", 1U, 0, 0);
+
+ Add(2, 6U, "100", "115", 1U);
+ Add(2, 7U, "125", "325", 1U);
+ Add(2, 8U, "350", "400", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys5) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // Overlapping user keys on same level and output level
+ Add(1, 1U, "200", "400", 1000000000U);
+ Add(1, 2U, "400", "500", 1U, 0, 0);
+ Add(2, 3U, "000", "100", 1U);
+ Add(2, 4U, "100", "600", 1U, 0, 0);
+ Add(2, 5U, "600", "700", 1U, 0, 0);
+
+ vstorage_->LevelFiles(2)[2]->being_compacted = true;
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys6) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // Overlapping user keys on same level and output level
+ Add(1, 1U, "200", "400", 1U, 0, 0);
+ Add(1, 2U, "401", "500", 1U, 0, 0);
+ Add(2, 3U, "000", "100", 1U);
+ Add(2, 4U, "100", "300", 1U, 0, 0);
+ Add(2, 5U, "305", "450", 1U, 0, 0);
+ Add(2, 6U, "460", "600", 1U, 0, 0);
+ Add(2, 7U, "600", "700", 1U, 0, 0);
+
+ vstorage_->LevelFiles(1)[0]->marked_for_compaction = true;
+ vstorage_->LevelFiles(1)[1]->marked_for_compaction = true;
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(3U, compaction->num_input_files(1));
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys7) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+ // Overlapping user keys on same level and output level
+ Add(1, 1U, "200", "400", 1U, 0, 0);
+ Add(1, 2U, "401", "500", 1000000000U, 0, 0);
+ Add(2, 3U, "100", "250", 1U);
+ Add(2, 4U, "300", "600", 1U, 0, 0);
+ Add(2, 5U, "600", "800", 1U, 0, 0);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_GE(1U, compaction->num_input_files(0));
+ ASSERT_GE(2U, compaction->num_input_files(1));
+ // File 5 has to be included in the compaction
+ ASSERT_EQ(5U, compaction->inputs(1)->back()->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys8) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+ // grow the number of inputs in "level" without
+ // changing the number of "level+1" files we pick up
+ // Expand input level as much as possible
+ // no overlapping case
+ Add(1, 1U, "101", "150", 1U);
+ Add(1, 2U, "151", "200", 1U);
+ Add(1, 3U, "201", "300", 1000000000U);
+ Add(1, 4U, "301", "400", 1U);
+ Add(1, 5U, "401", "500", 1U);
+ Add(2, 6U, "150", "200", 1U);
+ Add(2, 7U, "200", "450", 1U, 0, 0);
+ Add(2, 8U, "500", "600", 1U);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(3U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(4U, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys9) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+ // grow the number of inputs in "level" without
+ // changing the number of "level+1" files we pick up
+ // Expand input level as much as possible
+ // overlapping case
+ Add(1, 1U, "121", "150", 1U);
+ Add(1, 2U, "151", "200", 1U);
+ Add(1, 3U, "201", "300", 1000000000U);
+ Add(1, 4U, "301", "400", 1U);
+ Add(1, 5U, "401", "500", 1U);
+ Add(2, 6U, "100", "120", 1U);
+ Add(2, 7U, "150", "200", 1U);
+ Add(2, 8U, "200", "450", 1U, 0, 0);
+ Add(2, 9U, "501", "600", 1U);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(5U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(4U, compaction->input(0, 3)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(8U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys10) {
+ // Locked file encountered when pulling in extra input-level files with same
+ // user keys. Verify we pick the next-best file from the same input level.
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+
+ // file_number 2U is largest and thus first choice. But it overlaps with
+ // file_number 1U which is being compacted. So instead we pick the next-
+ // biggest file, 3U, which is eligible for compaction.
+ Add(1 /* level */, 1U /* file_number */, "100" /* smallest */,
+ "150" /* largest */, 1U /* file_size */);
+ file_map_[1U].first->being_compacted = true;
+ Add(1 /* level */, 2U /* file_number */, "150" /* smallest */,
+ "200" /* largest */, 1000000000U /* file_size */, 0 /* smallest_seq */,
+ 0 /* largest_seq */);
+ Add(1 /* level */, 3U /* file_number */, "201" /* smallest */,
+ "250" /* largest */, 900000000U /* file_size */);
+ Add(2 /* level */, 4U /* file_number */, "100" /* smallest */,
+ "150" /* largest */, 1U /* file_size */);
+ Add(2 /* level */, 5U /* file_number */, "151" /* smallest */,
+ "200" /* largest */, 1U /* file_size */);
+ Add(2 /* level */, 6U /* file_number */, "201" /* smallest */,
+ "250" /* largest */, 1U /* file_size */);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys11) {
+ // Locked file encountered when pulling in extra output-level files with same
+ // user keys. Expected to skip that compaction and pick the next-best choice.
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+
+ // score(L1) = 3.7
+ // score(L2) = 1.85
+ // There is no eligible file in L1 to compact since both candidates pull in
+ // file_number 5U, which overlaps with a file pending compaction (6U). The
+ // first eligible compaction is from L2->L3.
+ Add(1 /* level */, 2U /* file_number */, "151" /* smallest */,
+ "200" /* largest */, 1000000000U /* file_size */);
+ Add(1 /* level */, 3U /* file_number */, "201" /* smallest */,
+ "250" /* largest */, 1U /* file_size */);
+ Add(2 /* level */, 4U /* file_number */, "100" /* smallest */,
+ "149" /* largest */, 5000000000U /* file_size */);
+ Add(2 /* level */, 5U /* file_number */, "150" /* smallest */,
+ "201" /* largest */, 1U /* file_size */);
+ Add(2 /* level */, 6U /* file_number */, "201" /* smallest */,
+ "249" /* largest */, 1U /* file_size */, 0 /* smallest_seq */,
+ 0 /* largest_seq */);
+ file_map_[6U].first->being_compacted = true;
+ Add(3 /* level */, 7U /* file_number */, "100" /* smallest */,
+ "149" /* largest */, 1U /* file_size */);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(4U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri1) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 900000000U;
+
+ // 6 L0 files, score 3.
+ Add(0, 1U, "000", "400", 1U);
+ Add(0, 2U, "001", "400", 1U, 0, 0);
+ Add(0, 3U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 31U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 32U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 33U, "001", "400", 1000000000U, 0, 0);
+
+ // L1 total size 2GB, score 2.2. If one file is being compacted, score 1.1.
+ Add(1, 4U, "050", "300", 1000000000U, 0, 0);
+ file_map_[4u].first->being_compacted = true;
+ Add(1, 5U, "301", "350", 1000000000U, 0, 0);
+
+ // Output level overlaps with the beginning and the end of the chain
+ Add(2, 6U, "050", "100", 1U);
+ Add(2, 7U, "300", "400", 1U);
+
+ // No compaction should be scheduled if L0 has higher priority than L1
+ // but the L0->L1 compaction is blocked by a file in L1 being compacted.
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0));
+ ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1));
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri2) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 900000000U;
+
+ // 6 L0 files, score 3.
+ Add(0, 1U, "000", "400", 1U);
+ Add(0, 2U, "001", "400", 1U, 0, 0);
+ Add(0, 3U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 31U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 32U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 33U, "001", "400", 1000000000U, 0, 0);
+
+ // L1 total size 2GB, score 2.2. If one file is being compacted, score 1.1.
+ Add(1, 4U, "050", "300", 1000000000U, 0, 0);
+ Add(1, 5U, "301", "350", 1000000000U, 0, 0);
+
+ // Output level overlaps with the beginning and the end of the chain
+ Add(2, 6U, "050", "100", 1U);
+ Add(2, 7U, "300", "400", 1U);
+
+ // If no file in L1 being compacted, L0->L1 compaction will be scheduled.
+ UpdateVersionStorageInfo(); // being_compacted flag is cleared here.
+ ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0));
+ ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1));
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+}
+
+TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri3) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 900000000U;
+
+ // 6 L0 files, score 3.
+ Add(0, 1U, "000", "400", 1U);
+ Add(0, 2U, "001", "400", 1U, 0, 0);
+ Add(0, 3U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 31U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 32U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 33U, "001", "400", 1000000000U, 0, 0);
+
+ // L1 score more than 6.
+ Add(1, 4U, "050", "300", 1000000000U, 0, 0);
+ file_map_[4u].first->being_compacted = true;
+ Add(1, 5U, "301", "350", 1000000000U, 0, 0);
+ Add(1, 51U, "351", "400", 6000000000U, 0, 0);
+
+ // Output level overlaps with the beginning and the end of the chain
+ Add(2, 6U, "050", "100", 1U);
+ Add(2, 7U, "300", "400", 1U);
+
+ // If the score of L1 is larger than that of L0, the L1 compaction goes
+ // through even though there is a pending L0 compaction.
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(1, vstorage_->CompactionScoreLevel(0));
+ ASSERT_EQ(0, vstorage_->CompactionScoreLevel(1));
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded1) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200", 200);
+ Add(0, 2U, "150", "200", 200);
+ Add(0, 3U, "150", "200", 200);
+ // Level 1 is over target by 200
+ Add(1, 4U, "400", "500", 600);
+ Add(1, 5U, "600", "700", 600);
+ // Level 2 is less than its target 10000 even after adding the size of level 1
+ // Size ratio of L2/L1 is 9600 / 1200 = 8
+ Add(2, 6U, "150", "200", 2500);
+ Add(2, 7U, "201", "210", 2000);
+ Add(2, 8U, "300", "310", 2600);
+ Add(2, 9U, "400", "500", 2500);
+ // Level 3 exceeds its target 100,000 by 1000
+ Add(3, 10U, "400", "500", 101000);
+ // Level 4 exceeds target 1,000,000 by 900 after adding size from level 3
+ // Size ratio L4/L3 is 9.9
+ // After merge from L3, L4 size is 1000900
+ Add(4, 11U, "400", "500", 999900);
+ Add(5, 11U, "400", "500", 8007200);
+
+ UpdateVersionStorageInfo();
+
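+ // Expected total, following the per-level comments above: L1's 200-byte
+ // excess rewritten into L2 at size ratio ~8 (200 * 9), the L3->L4 merge
+ // (10900), and L4's resulting 900-byte excess rewritten into L5 at ratio
+ // ~8 (900 * 9), i.e. 1800 + 10900 + 8100 = 20800 bytes.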
+ ASSERT_EQ(200u * 9u + 10900u + 900u * 9,
+ vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded2) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200", 200);
+ Add(0, 2U, "150", "200", 200);
+ Add(0, 4U, "150", "200", 200);
+ Add(0, 5U, "150", "200", 200);
+ Add(0, 6U, "150", "200", 200);
+ // Level 1 size will be 1400 after merging with L0
+ Add(1, 7U, "400", "500", 200);
+ Add(1, 8U, "600", "700", 200);
+ // Level 2 is less than its target 10000 even after adding the size of level 1
+ Add(2, 9U, "150", "200", 9100);
+ // Level 3 is over the target, but since level 4 is empty, we assume it
+ // will be a trivial move.
+ Add(3, 10U, "400", "500", 101000);
+
+ UpdateVersionStorageInfo();
+
+ // estimated L1->L2 merge: 400 * (9100.0 / 1400.0 + 1.0)
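+ // The 1400u term is the estimated L0->L1 work: five 200-byte L0 files plus
+ // the two 200-byte L1 files they merge into (a reading of the setup above).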
+ ASSERT_EQ(1400u + 3000u, vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded3) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200", 2000);
+ Add(0, 2U, "150", "200", 2000);
+ Add(0, 4U, "150", "200", 2000);
+ Add(0, 5U, "150", "200", 2000);
+ Add(0, 6U, "150", "200", 1000);
+ // Level 1 size will be 10000 after merging with L0
+ Add(1, 7U, "400", "500", 500);
+ Add(1, 8U, "600", "700", 500);
+
+ Add(2, 9U, "150", "200", 10000);
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(10000u + 18000u, vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeededDynamicLevel) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+
+ // Set Last level size 50000
+ // num_levels - 1 target 5000
+ // num_levels - 2 is base level with target 1000 (rounded up to
+ // max_bytes_for_level_base).
+ Add(num_levels - 1, 10U, "400", "500", 50000);
+
+ Add(0, 1U, "150", "200", 200);
+ Add(0, 2U, "150", "200", 200);
+ Add(0, 4U, "150", "200", 200);
+ Add(0, 5U, "150", "200", 200);
+ Add(0, 6U, "150", "200", 200);
+ // num_levels - 3 is over target by 100 + 1000
+ Add(num_levels - 3, 7U, "400", "500", 550);
+ Add(num_levels - 3, 8U, "600", "700", 550);
+ // num_levels - 2 is over target by 1100 + 200
+ Add(num_levels - 2, 9U, "150", "200", 5200);
+
+ UpdateVersionStorageInfo();
+
+ // Merging to the second last level: (5200 / 2100 + 1) * 1100
+ // Merging to the last level: (50000 / 6300 + 1) * 1300
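+ // The leading 2100u is read here as the data that must move down first:
+ // 1000 bytes of L0 plus the 1100 bytes already in num_levels - 3.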
+ ASSERT_EQ(2100u + 3823u + 11617u,
+ vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, IsBottommostLevelTest) {
+ // case 1: Higher levels are empty
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ bool result =
+ Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_TRUE(result);
+
+ // case 2: Higher levels have no overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ Add(3, 7U, "k", "p");
+ Add(3, 8U, "t", "w");
+ Add(4, 9U, "a", "b");
+ Add(5, 10U, "c", "cc");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_TRUE(result);
+
+ // case 3.1: Higher levels (level 3) have overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ Add(3, 7U, "e", "g");
+ Add(3, 8U, "h", "k");
+ Add(4, 9U, "a", "b");
+ Add(5, 10U, "c", "cc");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ // case 3.2: Higher levels (level 5) have overlap
+ DeleteVersionStorage();
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ Add(3, 7U, "j", "k");
+ Add(3, 8U, "l", "m");
+ Add(4, 9U, "a", "b");
+ Add(5, 10U, "c", "cc");
+ Add(5, 11U, "h", "k");
+ Add(5, 12U, "y", "yy");
+ Add(5, 13U, "z", "zz");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ // case 3.3: Higher levels (level 5) have overlap, but it's only overlapping
+ // one key ("d")
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ Add(3, 7U, "j", "k");
+ Add(3, 8U, "l", "m");
+ Add(4, 9U, "a", "b");
+ Add(5, 10U, "c", "cc");
+ Add(5, 11U, "ccc", "d");
+ Add(5, 12U, "y", "yy");
+ Add(5, 13U, "z", "zz");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ // Level 0 files overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "s", "t");
+ Add(0, 2U, "a", "m");
+ Add(0, 3U, "b", "z");
+ Add(0, 4U, "e", "f");
+ Add(5, 10U, "y", "z");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(1, 0);
+ AddToCompactionFiles(1U);
+ AddToCompactionFiles(2U);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(4U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ // Level 0 files don't overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "s", "t");
+ Add(0, 2U, "a", "m");
+ Add(0, 3U, "b", "k");
+ Add(0, 4U, "e", "f");
+ Add(5, 10U, "y", "z");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(1, 0);
+ AddToCompactionFiles(1U);
+ AddToCompactionFiles(2U);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(4U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_TRUE(result);
+
+ // Level 1 files overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "s", "t");
+ Add(0, 2U, "a", "m");
+ Add(0, 3U, "b", "k");
+ Add(0, 4U, "e", "f");
+ Add(1, 5U, "a", "m");
+ Add(1, 6U, "n", "o");
+ Add(1, 7U, "w", "y");
+ Add(5, 10U, "y", "z");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 0);
+ AddToCompactionFiles(1U);
+ AddToCompactionFiles(2U);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(4U);
+ AddToCompactionFiles(5U);
+ AddToCompactionFiles(6U);
+ AddToCompactionFiles(7U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ DeleteVersionStorage();
+}
+
+TEST_F(CompactionPickerTest, MaxCompactionBytesHit) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000000u;
+ mutable_cf_options_.max_compaction_bytes = 800000u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // A compaction should be triggered and pick files 2 and 5.
+ // It cannot expand, because adding files 1 and 3 would make the compaction
+ // size exceed mutable_cf_options_.max_compaction_bytes.
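+ // (File sizes: 300001 + 300000 + 300000 + 1 = 900002 bytes > 800000.)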
+ Add(1, 1U, "100", "150", 300000U);
+ Add(1, 2U, "151", "200", 300001U, 0, 0);
+ Add(1, 3U, "201", "250", 300000U, 0, 0);
+ Add(1, 4U, "251", "300", 300000U, 0, 0);
+ Add(2, 5U, "100", "256", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, MaxCompactionBytesNotHit) {
+ mutable_cf_options_.max_bytes_for_level_base = 800000u;
+ mutable_cf_options_.max_compaction_bytes = 1000000u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // A compaction should be triggered and pick files 2 and 5,
+ // and it expands to include files 1 and 3 as well.
+ Add(1, 1U, "100", "150", 300000U);
+ Add(1, 2U, "151", "200", 300001U, 0, 0);
+ Add(1, 3U, "201", "250", 300000U, 0, 0);
+ Add(1, 4U, "251", "300", 300000U, 0, 0);
+ Add(2, 5U, "000", "251", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(3U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, IsTrivialMoveOn) {
+ mutable_cf_options_.max_bytes_for_level_base = 10000u;
+ mutable_cf_options_.max_compaction_bytes = 10001u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // A compaction should be triggered and pick file 2
+ Add(1, 1U, "100", "150", 3000U);
+ Add(1, 2U, "151", "200", 3001U);
+ Add(1, 3U, "201", "250", 3000U);
+ Add(1, 4U, "251", "300", 3000U);
+
+ Add(3, 5U, "120", "130", 7000U);
+ Add(3, 6U, "170", "180", 7000U);
+ Add(3, 7U, "220", "230", 7000U);
+ Add(3, 8U, "270", "280", 7000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_TRUE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, IsTrivialMoveOff) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000000u;
+ mutable_cf_options_.max_compaction_bytes = 10000u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // A compaction should be triggered and pick all files from level 1
+ Add(1, 1U, "100", "150", 300000U, 0, 0);
+ Add(1, 2U, "150", "200", 300000U, 0, 0);
+ Add(1, 3U, "200", "250", 300000U, 0, 0);
+ Add(1, 4U, "250", "300", 300000U, 0, 0);
+
+ Add(3, 5U, "120", "130", 6000U);
+ Add(3, 6U, "140", "150", 6000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_FALSE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, CacheNextCompactionIndex) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+
+ Add(1 /* level */, 1U /* file_number */, "100" /* smallest */,
+ "149" /* largest */, 1000000000U /* file_size */);
+ file_map_[1U].first->being_compacted = true;
+ Add(1 /* level */, 2U /* file_number */, "150" /* smallest */,
+ "199" /* largest */, 900000000U /* file_size */);
+ Add(1 /* level */, 3U /* file_number */, "200" /* smallest */,
+ "249" /* largest */, 800000000U /* file_size */);
+ Add(1 /* level */, 4U /* file_number */, "250" /* smallest */,
+ "299" /* largest */, 700000000U /* file_size */);
+ Add(2 /* level */, 5U /* file_number */, "150" /* smallest */,
+ "199" /* largest */, 1U /* file_size */);
+ file_map_[5U].first->being_compacted = true;
+
+ UpdateVersionStorageInfo();
+
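+ // File 1 is being compacted, and file 2 overlaps L2 file 5, which is also
+ // being compacted, so the first pick is expected to skip both, choose
+ // file 3, and cache the next compaction index for level 1.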
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(0U, compaction->num_input_files(1));
+ ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2, vstorage_->NextCompactionIndex(1 /* level */));
+
+ compaction.reset(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(0U, compaction->num_input_files(1));
+ ASSERT_EQ(4U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3, vstorage_->NextCompactionIndex(1 /* level */));
+
+ compaction.reset(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+ ASSERT_EQ(4, vstorage_->NextCompactionIndex(1 /* level */));
+}
+
+TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesNotHit) {
+ // Intra L0 compaction triggers only if there are at least
+ // level0_file_num_compaction_trigger + 2 L0 files.
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_compaction_bytes = 1000000u;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ // All 5 L0 files will be picked for intra-L0 compaction. The one L1 file
+ // spans the entire L0 key range and is marked as being compacted to avoid
+ // an L0->L1 compaction.
+ Add(0, 1U, "100", "150", 200000U, 0, 100, 101);
+ Add(0, 2U, "151", "200", 200000U, 0, 102, 103);
+ Add(0, 3U, "201", "250", 200000U, 0, 104, 105);
+ Add(0, 4U, "251", "300", 200000U, 0, 106, 107);
+ Add(0, 5U, "301", "350", 200000U, 0, 108, 109);
+ Add(1, 6U, "100", "350", 200000U, 0, 110, 111);
+ vstorage_->LevelFiles(1)[0]->being_compacted = true;
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(5U, compaction->num_input_files(0));
+ ASSERT_EQ(CompactionReason::kLevelL0FilesNum,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesHit) {
+ // Intra L0 compaction triggers only if there are at least
+ // level0_file_num_compaction_trigger + 2 L0 files.
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_compaction_bytes = 999999u;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ // 4 out of 5 L0 files will be picked for intra-L0 compaction due to the
+ // max_compaction_bytes limit (the minimum number of files for triggering
+ // intra-L0 compaction is 4). The one L1 file spans the entire L0 key range
+ // and is marked as being compacted to avoid an L0->L1 compaction.
+ Add(0, 1U, "100", "150", 200000U, 0, 100, 101);
+ Add(0, 2U, "151", "200", 200000U, 0, 102, 103);
+ Add(0, 3U, "201", "250", 200000U, 0, 104, 105);
+ Add(0, 4U, "251", "300", 200000U, 0, 106, 107);
+ Add(0, 5U, "301", "350", 200000U, 0, 108, 109);
+ Add(1, 6U, "100", "350", 200000U, 0, 109, 110);
+ vstorage_->LevelFiles(1)[0]->being_compacted = true;
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(4U, compaction->num_input_files(0));
+ ASSERT_EQ(CompactionReason::kLevelL0FilesNum,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, IntraL0ForEarliestSeqno) {
+ // Intra L0 compaction triggers only if there are at least
+ // level0_file_num_compaction_trigger + 2 L0 files.
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_compaction_bytes = 999999u;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ // 4 out of 6 L0 files will be picked for intra-L0 compaction due to the
+ // being_compacted limit, and the newest L0 file will be skipped due to the
+ // earliest seqno. The one L1 file spans the entire L0 key range and is
+ // marked as being compacted to avoid an L0->L1 compaction.
+ Add(1, 1U, "100", "350", 200000U, 0, 110, 111);
+ Add(0, 2U, "301", "350", 1U, 0, 108, 109);
+ Add(0, 3U, "251", "300", 1U, 0, 106, 107);
+ Add(0, 4U, "201", "250", 1U, 0, 104, 105);
+ Add(0, 5U, "151", "200", 1U, 0, 102, 103);
+ Add(0, 6U, "100", "150", 1U, 0, 100, 101);
+ Add(0, 7U, "100", "100", 1U, 0, 99, 100);
+ vstorage_->LevelFiles(0)[5]->being_compacted = true;
+ vstorage_->LevelFiles(1)[0]->being_compacted = true;
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_, 107));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(4U, compaction->num_input_files(0));
+ ASSERT_EQ(CompactionReason::kLevelL0FilesNum,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compaction/compaction_picker_universal.cc b/src/rocksdb/db/compaction/compaction_picker_universal.cc
new file mode 100644
index 000000000..d8b63956e
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_universal.cc
@@ -0,0 +1,1105 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker_universal.h"
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+#include <limits>
+#include <queue>
+#include <string>
+#include <utility>
+#include "db/column_family.h"
+#include "file/filename.h"
+#include "logging/log_buffer.h"
+#include "monitoring/statistics.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+// A helper class that forms universal compactions. The class is used by
+// UniversalCompactionPicker::PickCompaction().
+// The usage is to create the class, and get the compaction object by calling
+// PickCompaction().
+class UniversalCompactionBuilder {
+ public:
+ UniversalCompactionBuilder(const ImmutableCFOptions& ioptions,
+ const InternalKeyComparator* icmp,
+ const std::string& cf_name,
+ const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage,
+ UniversalCompactionPicker* picker,
+ LogBuffer* log_buffer)
+ : ioptions_(ioptions),
+ icmp_(icmp),
+ cf_name_(cf_name),
+ mutable_cf_options_(mutable_cf_options),
+ vstorage_(vstorage),
+ picker_(picker),
+ log_buffer_(log_buffer) {}
+
+ // Form and return the compaction object. The caller owns the returned object.
+ Compaction* PickCompaction();
+
+ private:
+ struct SortedRun {
+ SortedRun(int _level, FileMetaData* _file, uint64_t _size,
+ uint64_t _compensated_file_size, bool _being_compacted)
+ : level(_level),
+ file(_file),
+ size(_size),
+ compensated_file_size(_compensated_file_size),
+ being_compacted(_being_compacted) {
+ assert(compensated_file_size > 0);
+ assert(level != 0 || file != nullptr);
+ }
+
+ void Dump(char* out_buf, size_t out_buf_size,
+ bool print_path = false) const;
+
+ // sorted_run_count is added into the string to print
+ void DumpSizeInfo(char* out_buf, size_t out_buf_size,
+ size_t sorted_run_count) const;
+
+ int level;
+ // `file` will be null for levels > 0. For level 0, the sorted run consists
+ // of exactly this file.
+ FileMetaData* file;
+ // For levels > 0, `size` and `compensated_file_size` are the sums of the
+ // sizes of all files in the level. `being_compacted` should be the same
+ // for all files in a non-zero level, so use the value stored here.
+ uint64_t size;
+ uint64_t compensated_file_size;
+ bool being_compacted;
+ };
+
+ // Pick Universal compaction to limit read amplification
+ Compaction* PickCompactionToReduceSortedRuns(
+ unsigned int ratio, unsigned int max_number_of_files_to_compact);
+
+ // Pick Universal compaction to limit space amplification.
+ Compaction* PickCompactionToReduceSizeAmp();
+
+ Compaction* PickDeleteTriggeredCompaction();
+
+ // Form a compaction from the sorted run indicated by start_index to the
+ // oldest sorted run.
+ // The caller is responsible for making sure that those files are not in
+ // compaction.
+ Compaction* PickCompactionToOldest(size_t start_index,
+ CompactionReason compaction_reason);
+
+ // Try to pick a periodic compaction. The caller should only call it
+ // if there is at least one file marked for periodic compaction.
+ // nullptr will be returned if no such compaction can be formed
+ // because some files are being compacted.
+ Compaction* PickPeriodicCompaction();
+
+ // Used in universal compaction when the allow_trivial_move option is set.
+ // Checks whether there are any overlapping files in the input. Returns
+ // true if the input files are non-overlapping.
+ bool IsInputFilesNonOverlapping(Compaction* c);
+
+ const ImmutableCFOptions& ioptions_;
+ const InternalKeyComparator* icmp_;
+ double score_;
+ std::vector<SortedRun> sorted_runs_;
+ const std::string& cf_name_;
+ const MutableCFOptions& mutable_cf_options_;
+ VersionStorageInfo* vstorage_;
+ UniversalCompactionPicker* picker_;
+ LogBuffer* log_buffer_;
+
+ static std::vector<SortedRun> CalculateSortedRuns(
+ const VersionStorageInfo& vstorage, const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options);
+
+ // Pick a path ID to place a newly generated file, with its estimated file
+ // size.
+ static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ uint64_t file_size);
+};
+
+// Used in universal compaction when trivial move is enabled.
+// This structure is used for the construction of a min-heap
+// that contains the file metadata, the level of the file,
+// and the index of the file in that level.
+
+struct InputFileInfo {
+ InputFileInfo() : f(nullptr), level(0), index(0) {}
+
+ FileMetaData* f;
+ size_t level;
+ size_t index;
+};
+
+// Used in universal compaction when trivial move is enabled.
+// This comparator is used for the construction of a min-heap
+// ordered by the smallest key of each file.
+struct SmallestKeyHeapComparator {
+ explicit SmallestKeyHeapComparator(const Comparator* ucmp) { ucmp_ = ucmp; }
+
+ bool operator()(InputFileInfo i1, InputFileInfo i2) const {
+ return (ucmp_->Compare(i1.f->smallest.user_key(),
+ i2.f->smallest.user_key()) > 0);
+ }
+
+ private:
+ const Comparator* ucmp_;
+};
+
+typedef std::priority_queue<InputFileInfo, std::vector<InputFileInfo>,
+ SmallestKeyHeapComparator>
+ SmallestKeyHeap;
+
+// This function creates the heap that is used to determine whether the files
+// overlap during universal compaction when allow_trivial_move is set.
+SmallestKeyHeap create_level_heap(Compaction* c, const Comparator* ucmp) {
+ SmallestKeyHeap smallest_key_priority_q =
+ SmallestKeyHeap(SmallestKeyHeapComparator(ucmp));
+
+ InputFileInfo input_file;
+
+ for (size_t l = 0; l < c->num_input_levels(); l++) {
+ if (c->num_input_files(l) != 0) {
+ if (l == 0 && c->start_level() == 0) {
+ for (size_t i = 0; i < c->num_input_files(0); i++) {
+ input_file.f = c->input(0, i);
+ input_file.level = 0;
+ input_file.index = i;
+ smallest_key_priority_q.push(std::move(input_file));
+ }
+ } else {
+ input_file.f = c->input(l, 0);
+ input_file.level = l;
+ input_file.index = 0;
+ smallest_key_priority_q.push(std::move(input_file));
+ }
+ }
+ }
+ return smallest_key_priority_q;
+}
+
+#ifndef NDEBUG
+// smallest_seqno and largest_seqno are set iff. `files` is not empty.
+void GetSmallestLargestSeqno(const std::vector<FileMetaData*>& files,
+ SequenceNumber* smallest_seqno,
+ SequenceNumber* largest_seqno) {
+ bool is_first = true;
+ for (FileMetaData* f : files) {
+ assert(f->fd.smallest_seqno <= f->fd.largest_seqno);
+ if (is_first) {
+ is_first = false;
+ *smallest_seqno = f->fd.smallest_seqno;
+ *largest_seqno = f->fd.largest_seqno;
+ } else {
+ if (f->fd.smallest_seqno < *smallest_seqno) {
+ *smallest_seqno = f->fd.smallest_seqno;
+ }
+ if (f->fd.largest_seqno > *largest_seqno) {
+ *largest_seqno = f->fd.largest_seqno;
+ }
+ }
+ }
+}
+#endif
+} // namespace
+
+// Algorithm that checks to see if there are any overlapping
+// files in the input
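+// For illustration: if the heap yields files with key ranges [a, c] and then
+// [d, f], the previous file's largest key ("c") is below the next file's
+// smallest key ("d"), so there is no overlap; with ranges [a, e] and [d, f],
+// "e" >= "d" and the function returns false.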
+bool UniversalCompactionBuilder::IsInputFilesNonOverlapping(Compaction* c) {
+ auto comparator = icmp_->user_comparator();
+ int first_iter = 1;
+
+ InputFileInfo prev, curr, next;
+
+ SmallestKeyHeap smallest_key_priority_q =
+ create_level_heap(c, icmp_->user_comparator());
+
+ while (!smallest_key_priority_q.empty()) {
+ curr = smallest_key_priority_q.top();
+ smallest_key_priority_q.pop();
+
+ if (first_iter) {
+ prev = curr;
+ first_iter = 0;
+ } else {
+ if (comparator->Compare(prev.f->largest.user_key(),
+ curr.f->smallest.user_key()) >= 0) {
+ // found overlapping files, return false
+ return false;
+ }
+ assert(comparator->Compare(curr.f->largest.user_key(),
+ prev.f->largest.user_key()) > 0);
+ prev = curr;
+ }
+
+ next.f = nullptr;
+
+ if (c->level(curr.level) != 0 &&
+ curr.index < c->num_input_files(curr.level) - 1) {
+ next.f = c->input(curr.level, curr.index + 1);
+ next.level = curr.level;
+ next.index = curr.index + 1;
+ }
+
+ if (next.f) {
+ smallest_key_priority_q.push(std::move(next));
+ }
+ }
+ return true;
+}
+
+bool UniversalCompactionPicker::NeedsCompaction(
+ const VersionStorageInfo* vstorage) const {
+ const int kLevel0 = 0;
+ if (vstorage->CompactionScore(kLevel0) >= 1) {
+ return true;
+ }
+ if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) {
+ return true;
+ }
+ if (!vstorage->FilesMarkedForCompaction().empty()) {
+ return true;
+ }
+ return false;
+}
+
+Compaction* UniversalCompactionPicker::PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, LogBuffer* log_buffer,
+ SequenceNumber /* earliest_memtable_seqno */) {
+ UniversalCompactionBuilder builder(ioptions_, icmp_, cf_name,
+ mutable_cf_options, vstorage, this,
+ log_buffer);
+ return builder.PickCompaction();
+}
+
+void UniversalCompactionBuilder::SortedRun::Dump(char* out_buf,
+ size_t out_buf_size,
+ bool print_path) const {
+ if (level == 0) {
+ assert(file != nullptr);
+ if (file->fd.GetPathId() == 0 || !print_path) {
+ snprintf(out_buf, out_buf_size, "file %" PRIu64, file->fd.GetNumber());
+ } else {
+ snprintf(out_buf, out_buf_size, "file %" PRIu64
+ "(path "
+ "%" PRIu32 ")",
+ file->fd.GetNumber(), file->fd.GetPathId());
+ }
+ } else {
+ snprintf(out_buf, out_buf_size, "level %d", level);
+ }
+}
+
+void UniversalCompactionBuilder::SortedRun::DumpSizeInfo(
+ char* out_buf, size_t out_buf_size, size_t sorted_run_count) const {
+ if (level == 0) {
+ assert(file != nullptr);
+ snprintf(out_buf, out_buf_size,
+ "file %" PRIu64 "[%" ROCKSDB_PRIszt
+ "] "
+ "with size %" PRIu64 " (compensated size %" PRIu64 ")",
+ file->fd.GetNumber(), sorted_run_count, file->fd.GetFileSize(),
+ file->compensated_file_size);
+ } else {
+ snprintf(out_buf, out_buf_size,
+ "level %d[%" ROCKSDB_PRIszt
+ "] "
+ "with size %" PRIu64 " (compensated size %" PRIu64 ")",
+ level, sorted_run_count, size, compensated_file_size);
+ }
+}
+
+std::vector<UniversalCompactionBuilder::SortedRun>
+UniversalCompactionBuilder::CalculateSortedRuns(
+ const VersionStorageInfo& vstorage, const ImmutableCFOptions& /*ioptions*/,
+ const MutableCFOptions& mutable_cf_options) {
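+ // Each L0 file forms its own sorted run, while each non-empty level >= 1
+ // forms a single sorted run covering all of its files.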
+ std::vector<UniversalCompactionBuilder::SortedRun> ret;
+ for (FileMetaData* f : vstorage.LevelFiles(0)) {
+ ret.emplace_back(0, f, f->fd.GetFileSize(), f->compensated_file_size,
+ f->being_compacted);
+ }
+ for (int level = 1; level < vstorage.num_levels(); level++) {
+ uint64_t total_compensated_size = 0U;
+ uint64_t total_size = 0U;
+ bool being_compacted = false;
+ bool is_first = true;
+ for (FileMetaData* f : vstorage.LevelFiles(level)) {
+ total_compensated_size += f->compensated_file_size;
+ total_size += f->fd.GetFileSize();
+ if (mutable_cf_options.compaction_options_universal.allow_trivial_move ==
+ true) {
+ if (f->being_compacted) {
+ being_compacted = f->being_compacted;
+ }
+ } else {
+ // Compaction always includes all files for a non-zero level, so for a
+ // non-zero level, all the files should share the same being_compacted
+ // value.
+ // This assumption is only valid when
+ // mutable_cf_options.compaction_options_universal.allow_trivial_move
+ // is false
+ assert(is_first || f->being_compacted == being_compacted);
+ }
+ if (is_first) {
+ being_compacted = f->being_compacted;
+ is_first = false;
+ }
+ }
+ if (total_compensated_size > 0) {
+ ret.emplace_back(level, nullptr, total_size, total_compensated_size,
+ being_compacted);
+ }
+ }
+ return ret;
+}
+
+// Universal style of compaction. Pick files that are contiguous in
+// time-range to compact.
+Compaction* UniversalCompactionBuilder::PickCompaction() {
+ const int kLevel0 = 0;
+ score_ = vstorage_->CompactionScore(kLevel0);
+ sorted_runs_ =
+ CalculateSortedRuns(*vstorage_, ioptions_, mutable_cf_options_);
+
+ if (sorted_runs_.size() == 0 ||
+ (vstorage_->FilesMarkedForPeriodicCompaction().empty() &&
+ vstorage_->FilesMarkedForCompaction().empty() &&
+ sorted_runs_.size() < (unsigned int)mutable_cf_options_
+ .level0_file_num_compaction_trigger)) {
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: nothing to do\n",
+ cf_name_.c_str());
+ TEST_SYNC_POINT_CALLBACK(
+ "UniversalCompactionBuilder::PickCompaction:Return", nullptr);
+ return nullptr;
+ }
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ ROCKS_LOG_BUFFER_MAX_SZ(
+ log_buffer_, 3072,
+ "[%s] Universal: sorted runs files(%" ROCKSDB_PRIszt "): %s\n",
+ cf_name_.c_str(), sorted_runs_.size(), vstorage_->LevelSummary(&tmp));
+
+ Compaction* c = nullptr;
+ // Periodic compaction has higher priority than other types of compaction
+ // because it's a hard requirement.
+ if (!vstorage_->FilesMarkedForPeriodicCompaction().empty()) {
+ // Always need to do a full compaction for periodic compaction.
+ c = PickPeriodicCompaction();
+ }
+
+ // Check for size amplification.
+ if (c == nullptr &&
+ sorted_runs_.size() >=
+ static_cast<size_t>(
+ mutable_cf_options_.level0_file_num_compaction_trigger)) {
+ if ((c = PickCompactionToReduceSizeAmp()) != nullptr) {
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: compacting for size amp\n",
+ cf_name_.c_str());
+ } else {
+ // Size amplification is within limits. Try reducing read
+ // amplification while maintaining file size ratios.
+ unsigned int ratio =
+ mutable_cf_options_.compaction_options_universal.size_ratio;
+
+ if ((c = PickCompactionToReduceSortedRuns(ratio, UINT_MAX)) != nullptr) {
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: compacting for size ratio\n",
+ cf_name_.c_str());
+ } else {
+ // Size amplification and file size ratios are within configured limits.
+ // If max read amplification exceeds the configured limits, then force a
+ // compaction without looking at file size ratios and try to reduce
+ // the number of files to fewer than level0_file_num_compaction_trigger.
+ // This is guaranteed by NeedsCompaction()
+ assert(sorted_runs_.size() >=
+ static_cast<size_t>(
+ mutable_cf_options_.level0_file_num_compaction_trigger));
+ // Get the total number of sorted runs that are not being compacted
+ int num_sr_not_compacted = 0;
+ for (size_t i = 0; i < sorted_runs_.size(); i++) {
+ if (sorted_runs_[i].being_compacted == false) {
+ num_sr_not_compacted++;
+ }
+ }
+
+ // The number of sorted runs that are not being compacted is greater
+ // than the maximum allowed number of sorted runs
+ if (num_sr_not_compacted >
+ mutable_cf_options_.level0_file_num_compaction_trigger) {
+ unsigned int num_files =
+ num_sr_not_compacted -
+ mutable_cf_options_.level0_file_num_compaction_trigger + 1;
+ if ((c = PickCompactionToReduceSortedRuns(UINT_MAX, num_files)) !=
+ nullptr) {
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: compacting for file num -- %u\n",
+ cf_name_.c_str(), num_files);
+ }
+ }
+ }
+ }
+ }
+
+ if (c == nullptr) {
+ if ((c = PickDeleteTriggeredCompaction()) != nullptr) {
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: delete triggered compaction\n",
+ cf_name_.c_str());
+ }
+ }
+
+ if (c == nullptr) {
+ TEST_SYNC_POINT_CALLBACK(
+ "UniversalCompactionBuilder::PickCompaction:Return", nullptr);
+ return nullptr;
+ }
+
+ if (mutable_cf_options_.compaction_options_universal.allow_trivial_move ==
+ true &&
+ c->compaction_reason() != CompactionReason::kPeriodicCompaction) {
+ c->set_is_trivial_move(IsInputFilesNonOverlapping(c));
+ }
+
+// validate that all the chosen files of L0 are non overlapping in time
+#ifndef NDEBUG
+ SequenceNumber prev_smallest_seqno = 0U;
+ bool is_first = true;
+
+ size_t level_index = 0U;
+ if (c->start_level() == 0) {
+ for (auto f : *c->inputs(0)) {
+ assert(f->fd.smallest_seqno <= f->fd.largest_seqno);
+ if (is_first) {
+ is_first = false;
+ }
+ prev_smallest_seqno = f->fd.smallest_seqno;
+ }
+ level_index = 1U;
+ }
+ for (; level_index < c->num_input_levels(); level_index++) {
+ if (c->num_input_files(level_index) != 0) {
+ SequenceNumber smallest_seqno = 0U;
+ SequenceNumber largest_seqno = 0U;
+ GetSmallestLargestSeqno(*(c->inputs(level_index)), &smallest_seqno,
+ &largest_seqno);
+ if (is_first) {
+ is_first = false;
+ } else if (prev_smallest_seqno > 0) {
+ // A level is considered as the bottommost level if there are
+ // no files in higher levels or if files in higher levels do
+ // not overlap with the files being compacted. Sequence numbers
+ // of files in bottommost level can be set to 0 to help
+ // compression. As a result, the following assert may not hold
+ // if the prev_smallest_seqno is 0.
+ assert(prev_smallest_seqno > largest_seqno);
+ }
+ prev_smallest_seqno = smallest_seqno;
+ }
+ }
+#endif
+ // update statistics
+ RecordInHistogram(ioptions_.statistics, NUM_FILES_IN_SINGLE_COMPACTION,
+ c->inputs(0)->size());
+
+ picker_->RegisterCompaction(c);
+ vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+
+ TEST_SYNC_POINT_CALLBACK("UniversalCompactionBuilder::PickCompaction:Return",
+ c);
+ return c;
+}
+
+uint32_t UniversalCompactionBuilder::GetPathId(
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options, uint64_t file_size) {
+ // Two conditions need to be satisfied:
+ // (1) the target path needs to be able to hold the file's size
+ // (2) Total size left in this and previous paths need to be not
+ // smaller than expected future file size before this new file is
+ // compacted, which is estimated based on size_ratio.
+ // For example, if now we are compacting files of size (1, 1, 2, 4, 8),
+ // we will make sure the target file, probably with size of 16, will be
+ // placed in a path so that eventually when new files are generated and
+ // compacted to (1, 1, 2, 4, 8, 16), all those files can be stored in or
+ // before the path we chose.
+ //
+ // TODO(sdong): now the case of multiple column families is not
+ // considered in this algorithm. So the target size can be violated in
+ // that case. We need to improve it.
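+ //
+ // Illustrative trace with made-up numbers: file_size = 16, size_ratio = 0,
+ // so future_size = 16, and cf_paths target sizes are {10, 100, 1000}.
+ // Path 0 cannot hold the file (10 <= 16), so its 10 is accumulated; path 1
+ // can hold it and 10 + (100 - 16) = 94 > 16, so path 1 is returned.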
+ uint64_t accumulated_size = 0;
+ uint64_t future_size =
+ file_size *
+ (100 - mutable_cf_options.compaction_options_universal.size_ratio) / 100;
+ uint32_t p = 0;
+ assert(!ioptions.cf_paths.empty());
+ for (; p < ioptions.cf_paths.size() - 1; p++) {
+ uint64_t target_size = ioptions.cf_paths[p].target_size;
+ if (target_size > file_size &&
+ accumulated_size + (target_size - file_size) > future_size) {
+ return p;
+ }
+ accumulated_size += target_size;
+ }
+ return p;
+}
+
+//
+// Consider compaction files based on their size differences with
+// the next file in time order.
+//
+Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
+ unsigned int ratio, unsigned int max_number_of_files_to_compact) {
+ unsigned int min_merge_width =
+ mutable_cf_options_.compaction_options_universal.min_merge_width;
+ unsigned int max_merge_width =
+ mutable_cf_options_.compaction_options_universal.max_merge_width;
+
+ const SortedRun* sr = nullptr;
+ bool done = false;
+ size_t start_index = 0;
+ unsigned int candidate_count = 0;
+
+ unsigned int max_files_to_compact =
+ std::min(max_merge_width, max_number_of_files_to_compact);
+ min_merge_width = std::max(min_merge_width, 2U);
+
+ // Caller checks the size before executing this function. This invariant is
+ // important because otherwise we may have a possible integer underflow when
+ // dealing with unsigned types.
+ assert(sorted_runs_.size() > 0);
+
+ // Considers a candidate file only if it is smaller than the
+ // total size accumulated so far.
+ for (size_t loop = 0; loop < sorted_runs_.size(); loop++) {
+ candidate_count = 0;
+
+ // Skip files that are already being compacted
+ for (sr = nullptr; loop < sorted_runs_.size(); loop++) {
+ sr = &sorted_runs_[loop];
+
+ if (!sr->being_compacted) {
+ candidate_count = 1;
+ break;
+ }
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf));
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: %s"
+ "[%d] being compacted, skipping",
+ cf_name_.c_str(), file_num_buf, loop);
+
+ sr = nullptr;
+ }
+
+ // This file is not being compacted. Consider it as the
+ // first candidate to be compacted.
+ uint64_t candidate_size = sr != nullptr ? sr->compensated_file_size : 0;
+ if (sr != nullptr) {
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: Possible candidate %s[%d].",
+ cf_name_.c_str(), file_num_buf, loop);
+ }
+
+ // Check if the succeeding files need compaction.
+ for (size_t i = loop + 1;
+ candidate_count < max_files_to_compact && i < sorted_runs_.size();
+ i++) {
+ const SortedRun* succeeding_sr = &sorted_runs_[i];
+ if (succeeding_sr->being_compacted) {
+ break;
+ }
+ // Pick files if the total/last candidate file size (increased by the
+ // specified ratio) is still larger than the next candidate file.
+ // candidate_size is the total size of files picked so far with the
+ // default kCompactionStopStyleTotalSize; with
+ // kCompactionStopStyleSimilarSize, it's simply the size of the last
+ // picked file.
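+ // For example, with ratio = 1 and candidate_size = 100, sz = 101: a
+ // succeeding run of size 150 stops the expansion, while one of size 80
+ // would be included.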
+ double sz = candidate_size * (100.0 + ratio) / 100.0;
+ if (sz < static_cast<double>(succeeding_sr->size)) {
+ break;
+ }
+ if (mutable_cf_options_.compaction_options_universal.stop_style ==
+ kCompactionStopStyleSimilarSize) {
+ // Similar-size stopping rule: also check the last picked file isn't
+ // far larger than the next candidate file.
+ sz = (succeeding_sr->size * (100.0 + ratio)) / 100.0;
+ if (sz < static_cast<double>(candidate_size)) {
+ // If the small file we've encountered begins a run of similar-size
+ // files, we'll pick them up on a future iteration of the outer
+ // loop. If it's some lonely straggler, it'll eventually get picked
+ // by the last-resort read amp strategy which disregards size ratios.
+ break;
+ }
+ candidate_size = succeeding_sr->compensated_file_size;
+ } else { // default kCompactionStopStyleTotalSize
+ candidate_size += succeeding_sr->compensated_file_size;
+ }
+ candidate_count++;
+ }
+
+ // Found a series of consecutive files that need compaction.
+ if (candidate_count >= (unsigned int)min_merge_width) {
+ start_index = loop;
+ done = true;
+ break;
+ } else {
+ for (size_t i = loop;
+ i < loop + candidate_count && i < sorted_runs_.size(); i++) {
+ const SortedRun* skipping_sr = &sorted_runs_[i];
+ char file_num_buf[256];
+ skipping_sr->DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop);
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Skipping %s",
+ cf_name_.c_str(), file_num_buf);
+ }
+ }
+ }
+ if (!done || candidate_count <= 1) {
+ return nullptr;
+ }
+ size_t first_index_after = start_index + candidate_count;
+ // Compression is enabled only if the output of this compaction still falls
+ // within the oldest compression_size_percent of the data; if the sorted
+ // runs older than the ones being compacted already account for at least
+ // that fraction of the total size, the output is left uncompressed.
+ bool enable_compression = true;
+ int ratio_to_compress =
+ mutable_cf_options_.compaction_options_universal.compression_size_percent;
+ if (ratio_to_compress >= 0) {
+ uint64_t total_size = 0;
+ for (auto& sorted_run : sorted_runs_) {
+ total_size += sorted_run.compensated_file_size;
+ }
+
+ uint64_t older_file_size = 0;
+ for (size_t i = sorted_runs_.size() - 1; i >= first_index_after; i--) {
+ older_file_size += sorted_runs_[i].size;
+ if (older_file_size * 100L >= total_size * (long)ratio_to_compress) {
+ enable_compression = false;
+ break;
+ }
+ }
+ }
+
+ uint64_t estimated_total_size = 0;
+ for (unsigned int i = 0; i < first_index_after; i++) {
+ estimated_total_size += sorted_runs_[i].size;
+ }
+ uint32_t path_id =
+ GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
+ int start_level = sorted_runs_[start_index].level;
+ int output_level;
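+ // If the picked runs extend to the oldest sorted run, output to the
+ // bottommost level; if the next older run is an L0 file, stay in L0;
+ // otherwise output to the level just above the next older run.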
+ if (first_index_after == sorted_runs_.size()) {
+ output_level = vstorage_->num_levels() - 1;
+ } else if (sorted_runs_[first_index_after].level == 0) {
+ output_level = 0;
+ } else {
+ output_level = sorted_runs_[first_index_after].level - 1;
+ }
+
+ // last level is reserved for the files ingested behind
+ if (ioptions_.allow_ingest_behind &&
+ (output_level == vstorage_->num_levels() - 1)) {
+ assert(output_level > 1);
+ output_level--;
+ }
+
+ std::vector<CompactionInputFiles> inputs(vstorage_->num_levels());
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ inputs[i].level = start_level + static_cast<int>(i);
+ }
+ for (size_t i = start_index; i < first_index_after; i++) {
+ auto& picking_sr = sorted_runs_[i];
+ if (picking_sr.level == 0) {
+ FileMetaData* picking_file = picking_sr.file;
+ inputs[0].files.push_back(picking_file);
+ } else {
+ auto& files = inputs[picking_sr.level - start_level].files;
+ for (auto* f : vstorage_->LevelFiles(picking_sr.level)) {
+ files.push_back(f);
+ }
+ }
+ char file_num_buf[256];
+ picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), i);
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Picking %s",
+ cf_name_.c_str(), file_num_buf);
+ }
+
+ CompactionReason compaction_reason;
+ if (max_number_of_files_to_compact == UINT_MAX) {
+ compaction_reason = CompactionReason::kUniversalSizeRatio;
+ } else {
+ compaction_reason = CompactionReason::kUniversalSortedRunNum;
+ }
+ return new Compaction(
+ vstorage_, ioptions_, mutable_cf_options_, std::move(inputs),
+ output_level,
+ MaxFileSizeForLevel(mutable_cf_options_, output_level,
+ kCompactionStyleUniversal),
+ LLONG_MAX, path_id,
+ GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, start_level,
+ 1, enable_compression),
+ GetCompressionOptions(ioptions_, vstorage_, start_level,
+ enable_compression),
+ /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false,
+ score_, false /* deletion_compaction */, compaction_reason);
+}
+
+// Look at overall size amplification. If size amplification
+// exceeds the configured value, then do a compaction
+// of the candidate files all the way up to the earliest
+// base file (overrides configured values of file-size ratios,
+// min_merge_width and max_merge_width).
+//
+Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
+ // percentage flexibility while reducing size amplification
+ uint64_t ratio = mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent;
+
+ unsigned int candidate_count = 0;
+ uint64_t candidate_size = 0;
+ size_t start_index = 0;
+ const SortedRun* sr = nullptr;
+
+ assert(!sorted_runs_.empty());
+ if (sorted_runs_.back().being_compacted) {
+ return nullptr;
+ }
+
+ // Skip files that are already being compacted
+ for (size_t loop = 0; loop < sorted_runs_.size() - 1; loop++) {
+ sr = &sorted_runs_[loop];
+ if (!sr->being_compacted) {
+ start_index = loop; // Consider this as the first candidate.
+ break;
+ }
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: skipping %s[%d] compacted %s",
+ cf_name_.c_str(), file_num_buf, loop,
+ " cannot be a candidate to reduce size amp.\n");
+ sr = nullptr;
+ }
+
+ if (sr == nullptr) {
+ return nullptr; // no candidate files
+ }
+ {
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] Universal: First candidate %s[%" ROCKSDB_PRIszt "] %s",
+ cf_name_.c_str(), file_num_buf, start_index, " to reduce size amp.\n");
+ }
+
+ // keep adding up all the remaining files
+ for (size_t loop = start_index; loop < sorted_runs_.size() - 1; loop++) {
+ sr = &sorted_runs_[loop];
+ if (sr->being_compacted) {
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+ ROCKS_LOG_BUFFER(
+ log_buffer_, "[%s] Universal: Possible candidate %s[%d] %s",
+ cf_name_.c_str(), file_num_buf, start_index,
+ " is already being compacted. No size amp reduction possible.\n");
+ return nullptr;
+ }
+ candidate_size += sr->compensated_file_size;
+ candidate_count++;
+ }
+ if (candidate_count == 0) {
+ return nullptr;
+ }
+
+ // size of earliest file
+ uint64_t earliest_file_size = sorted_runs_.back().size;
+
+ // size amplification = percentage of additional size
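+ // For example, with max_size_amplification_percent = 200, newer files
+ // totalling 150 and an earliest file of size 100: 150 * 100 < 200 * 100,
+ // so no size-amp compaction would be needed.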
+ if (candidate_size * 100 < ratio * earliest_file_size) {
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] Universal: size amp not needed. newer-files-total-size %" PRIu64
+ " earliest-file-size %" PRIu64,
+ cf_name_.c_str(), candidate_size, earliest_file_size);
+ return nullptr;
+ } else {
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] Universal: size amp needed. newer-files-total-size %" PRIu64
+ " earliest-file-size %" PRIu64,
+ cf_name_.c_str(), candidate_size, earliest_file_size);
+ }
+ return PickCompactionToOldest(start_index,
+ CompactionReason::kUniversalSizeAmplification);
+}
+
+// Pick files marked for compaction. Typically, files are marked by
+// CompactOnDeletionCollector due to the presence of tombstones.
+Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
+ CompactionInputFiles start_level_inputs;
+ int output_level;
+ std::vector<CompactionInputFiles> inputs;
+
+ if (vstorage_->num_levels() == 1) {
+ // This is single level universal. Since we're basically trying to reclaim
+ // space by processing files marked for compaction due to high tombstone
+ // density, let's do the same thing as compaction to reduce size amp which
+ // has the same goals.
+ bool compact = false;
+
+ start_level_inputs.level = 0;
+ start_level_inputs.files.clear();
+ output_level = 0;
+ for (FileMetaData* f : vstorage_->LevelFiles(0)) {
+ if (f->marked_for_compaction) {
+ compact = true;
+ }
+ if (compact) {
+ start_level_inputs.files.push_back(f);
+ }
+ }
+ if (start_level_inputs.size() <= 1) {
+ // If only the last file in L0 is marked for compaction, ignore it
+ return nullptr;
+ }
+ inputs.push_back(start_level_inputs);
+ } else {
+ int start_level;
+
+ // For multi-level universal, the strategy is to make this look more like
+ // leveled. We pick one of the files marked for compaction and compact with
+ // overlapping files in the adjacent level.
+ picker_->PickFilesMarkedForCompaction(cf_name_, vstorage_, &start_level,
+ &output_level, &start_level_inputs);
+ if (start_level_inputs.empty()) {
+ return nullptr;
+ }
+
+ // Pick the first non-empty level after the start_level
+ for (output_level = start_level + 1; output_level < vstorage_->num_levels();
+ output_level++) {
+ if (vstorage_->NumLevelFiles(output_level) != 0) {
+ break;
+ }
+ }
+
+ // If all higher levels are empty, pick the highest level as output level
+ if (output_level == vstorage_->num_levels()) {
+ if (start_level == 0) {
+ output_level = vstorage_->num_levels() - 1;
+ } else {
+ // If start level is non-zero and all higher levels are empty, this
+ // compaction will translate into a trivial move. Since the idea is
+ // to reclaim space and trivial move doesn't help with that, we
+ // skip compaction in this case and return nullptr
+ return nullptr;
+ }
+ }
+ if (ioptions_.allow_ingest_behind &&
+ output_level == vstorage_->num_levels() - 1) {
+ assert(output_level > 1);
+ output_level--;
+ }
+
+ if (output_level != 0) {
+ if (start_level == 0) {
+ if (!picker_->GetOverlappingL0Files(vstorage_, &start_level_inputs,
+ output_level, nullptr)) {
+ return nullptr;
+ }
+ }
+
+ CompactionInputFiles output_level_inputs;
+ int parent_index = -1;
+
+ output_level_inputs.level = output_level;
+ if (!picker_->SetupOtherInputs(cf_name_, mutable_cf_options_, vstorage_,
+ &start_level_inputs, &output_level_inputs,
+ &parent_index, -1)) {
+ return nullptr;
+ }
+ inputs.push_back(start_level_inputs);
+ if (!output_level_inputs.empty()) {
+ inputs.push_back(output_level_inputs);
+ }
+ if (picker_->FilesRangeOverlapWithCompaction(inputs, output_level)) {
+ return nullptr;
+ }
+ } else {
+ inputs.push_back(start_level_inputs);
+ }
+ }
+
+ uint64_t estimated_total_size = 0;
+ // Use size of the output level as estimated file size
+ for (FileMetaData* f : vstorage_->LevelFiles(output_level)) {
+ estimated_total_size += f->fd.GetFileSize();
+ }
+ uint32_t path_id =
+ GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
+ return new Compaction(
+ vstorage_, ioptions_, mutable_cf_options_, std::move(inputs),
+ output_level,
+ MaxFileSizeForLevel(mutable_cf_options_, output_level,
+ kCompactionStyleUniversal),
+ /* max_grandparent_overlap_bytes */ LLONG_MAX, path_id,
+ GetCompressionType(ioptions_, vstorage_, mutable_cf_options_,
+ output_level, 1),
+ GetCompressionOptions(ioptions_, vstorage_, output_level),
+ /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ true,
+ score_, false /* deletion_compaction */,
+ CompactionReason::kFilesMarkedForCompaction);
+}
+
+Compaction* UniversalCompactionBuilder::PickCompactionToOldest(
+ size_t start_index, CompactionReason compaction_reason) {
+ assert(start_index < sorted_runs_.size());
+
+ // Estimate total file size
+ uint64_t estimated_total_size = 0;
+ for (size_t loop = start_index; loop < sorted_runs_.size(); loop++) {
+ estimated_total_size += sorted_runs_[loop].size;
+ }
+ uint32_t path_id =
+ GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
+ int start_level = sorted_runs_[start_index].level;
+
+ std::vector<CompactionInputFiles> inputs(vstorage_->num_levels());
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ inputs[i].level = start_level + static_cast<int>(i);
+ }
+ for (size_t loop = start_index; loop < sorted_runs_.size(); loop++) {
+ auto& picking_sr = sorted_runs_[loop];
+ if (picking_sr.level == 0) {
+ FileMetaData* f = picking_sr.file;
+ inputs[0].files.push_back(f);
+ } else {
+ auto& files = inputs[picking_sr.level - start_level].files;
+ for (auto* f : vstorage_->LevelFiles(picking_sr.level)) {
+ files.push_back(f);
+ }
+ }
+ std::string comp_reason_print_string;
+ if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+ comp_reason_print_string = "periodic compaction";
+ } else if (compaction_reason ==
+ CompactionReason::kUniversalSizeAmplification) {
+ comp_reason_print_string = "size amp";
+ } else {
+ assert(false);
+ }
+
+ char file_num_buf[256];
+ picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop);
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: %s picking %s",
+ cf_name_.c_str(), comp_reason_print_string.c_str(),
+ file_num_buf);
+ }
+
+ // Output files to the bottommost level, unless it is reserved.
+ int output_level = vstorage_->num_levels() - 1;
+ // last level is reserved for the files ingested behind
+ if (ioptions_.allow_ingest_behind) {
+ assert(output_level > 1);
+ output_level--;
+ }
+
+ // We never check size for
+ // compaction_options_universal.compression_size_percent,
+ // because we always compact all the files, so always compress.
+ return new Compaction(
+ vstorage_, ioptions_, mutable_cf_options_, std::move(inputs),
+ output_level,
+ MaxFileSizeForLevel(mutable_cf_options_, output_level,
+ kCompactionStyleUniversal),
+ LLONG_MAX, path_id,
+ GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, start_level,
+ 1, true /* enable_compression */),
+ GetCompressionOptions(ioptions_, vstorage_, start_level,
+ true /* enable_compression */),
+ /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false,
+ score_, false /* deletion_compaction */, compaction_reason);
+}
+
+Compaction* UniversalCompactionBuilder::PickPeriodicCompaction() {
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Periodic Compaction",
+ cf_name_.c_str());
+
+ // In universal compaction, sorted runs containing older data are almost
+ // always generated earlier too. To simplify the problem, we just try to
+ // trigger a full compaction. We start from the oldest sorted run and
+ // include all sorted runs, until we hit a sorted run that is already being
+ // compacted. Since usually the largest (which is usually the oldest)
+ // sorted run is included anyway, doing a full compaction won't increase
+ // write amplification much.
+
+ // Get some information from marked files to check whether a file is
+ // included in the compaction.
+
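+ // Extend the pick backwards from the oldest sorted run towards newer ones,
+ // stopping at the first run that is already being compacted. If even the
+ // oldest run is being compacted, no periodic compaction can be formed.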
+ size_t start_index = sorted_runs_.size();
+ while (start_index > 0 && !sorted_runs_[start_index - 1].being_compacted) {
+ start_index--;
+ }
+ if (start_index == sorted_runs_.size()) {
+ return nullptr;
+ }
+
+ // There is a rare corner case where we can't pick up all of the files
+ // because some are being compacted, and we end up picking files none of
+ // which actually needs periodic compaction. To keep the logic simple, we
+ // just execute the compaction anyway, unless we would simply be
+ // recompacting the last sorted run (either the last level or the last L0
+ // file); in that case, we verify below that it really covers a file marked
+ // for periodic compaction.
+ if (start_index == sorted_runs_.size() - 1) {
+ bool included_file_marked = false;
+ int start_level = sorted_runs_[start_index].level;
+ FileMetaData* start_file = sorted_runs_[start_index].file;
+ for (const std::pair<int, FileMetaData*>& level_file_pair :
+ vstorage_->FilesMarkedForPeriodicCompaction()) {
+ if (start_level != 0) {
+ // Last sorted run is a level
+ if (start_level == level_file_pair.first) {
+ included_file_marked = true;
+ break;
+ }
+ } else {
+ // Last sorted run is a L0 file.
+ if (start_file == level_file_pair.second) {
+ included_file_marked = true;
+ break;
+ }
+ }
+ }
+ if (!included_file_marked) {
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: Cannot form a compaction covering file "
+ "marked for periodic compaction",
+ cf_name_.c_str());
+ return nullptr;
+ }
+ }
+
+ Compaction* c = PickCompactionToOldest(start_index,
+ CompactionReason::kPeriodicCompaction);
+
+ TEST_SYNC_POINT_CALLBACK(
+ "UniversalCompactionPicker::PickPeriodicCompaction:Return", c);
+
+ return c;
+}
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_picker_universal.h b/src/rocksdb/db/compaction/compaction_picker_universal.h
new file mode 100644
index 000000000..c3f55f5d3
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_universal.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "db/compaction/compaction_picker.h"
+
+namespace ROCKSDB_NAMESPACE {
+class UniversalCompactionPicker : public CompactionPicker {
+ public:
+ UniversalCompactionPicker(const ImmutableCFOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : CompactionPicker(ioptions, icmp) {}
+ virtual Compaction* PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, LogBuffer* log_buffer,
+ SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override;
+ virtual int MaxOutputLevel() const override { return NumberLevels() - 1; }
+
+ virtual bool NeedsCompaction(
+ const VersionStorageInfo* vstorage) const override;
+};
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/comparator_db_test.cc b/src/rocksdb/db/comparator_db_test.cc
new file mode 100644
index 000000000..49f287a97
--- /dev/null
+++ b/src/rocksdb/db/comparator_db_test.cc
@@ -0,0 +1,660 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#include <array>
+#include <map>
+#include <string>
+
+#include "memtable/stl_wrappers.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/hash.h"
+#include "util/kv_map.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+using std::unique_ptr;
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+
+static const Comparator* kTestComparator = nullptr;
+
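+// An iterator over an in-memory KVMap, used as the reference ("expected")
+// iterator that the DB iterator is compared against.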
+class KVIter : public Iterator {
+ public:
+ explicit KVIter(const stl_wrappers::KVMap* map)
+ : map_(map), iter_(map_->end()) {}
+ bool Valid() const override { return iter_ != map_->end(); }
+ void SeekToFirst() override { iter_ = map_->begin(); }
+ void SeekToLast() override {
+ if (map_->empty()) {
+ iter_ = map_->end();
+ } else {
+ iter_ = map_->find(map_->rbegin()->first);
+ }
+ }
+ void Seek(const Slice& k) override {
+ iter_ = map_->lower_bound(k.ToString());
+ }
+ void SeekForPrev(const Slice& k) override {
+ iter_ = map_->upper_bound(k.ToString());
+ Prev();
+ }
+ void Next() override { ++iter_; }
+ void Prev() override {
+ if (iter_ == map_->begin()) {
+ iter_ = map_->end();
+ return;
+ }
+ --iter_;
+ }
+
+ Slice key() const override { return iter_->first; }
+ Slice value() const override { return iter_->second; }
+ Status status() const override { return Status::OK(); }
+
+ private:
+ const stl_wrappers::KVMap* const map_;
+ stl_wrappers::KVMap::const_iterator iter_;
+};
+
+void AssertItersEqual(Iterator* iter1, Iterator* iter2) {
+ ASSERT_EQ(iter1->Valid(), iter2->Valid());
+ if (iter1->Valid()) {
+ ASSERT_EQ(iter1->key().ToString(), iter2->key().ToString());
+ ASSERT_EQ(iter1->value().ToString(), iter2->value().ToString());
+ }
+}
+
+// Runs random puts/deletes against the DB (expected to be empty initially)
+// and a reference std::map, then performs random iterator operations and
+// verifies both return the same keys and values.
+// source_strings are the candidate keys.
+void DoRandomIteraratorTest(DB* db, std::vector<std::string> source_strings,
+ Random* rnd, int num_writes, int num_iter_ops,
+ int num_trigger_flush) {
+ stl_wrappers::KVMap map((stl_wrappers::LessOfComparator(kTestComparator)));
+
+ for (int i = 0; i < num_writes; i++) {
+ if (num_trigger_flush > 0 && i != 0 && i % num_trigger_flush == 0) {
+ db->Flush(FlushOptions());
+ }
+
+ int type = rnd->Uniform(2);
+ int index = rnd->Uniform(static_cast<int>(source_strings.size()));
+ auto& key = source_strings[index];
+ switch (type) {
+ case 0:
+ // put
+ map[key] = key;
+ ASSERT_OK(db->Put(WriteOptions(), key, key));
+ break;
+ case 1:
+ // delete
+ if (map.find(key) != map.end()) {
+ map.erase(key);
+ }
+ ASSERT_OK(db->Delete(WriteOptions(), key));
+ break;
+ default:
+ assert(false);
+ }
+ }
+
+ std::unique_ptr<Iterator> iter(db->NewIterator(ReadOptions()));
+ std::unique_ptr<Iterator> result_iter(new KVIter(&map));
+
+ bool is_valid = false;
+ for (int i = 0; i < num_iter_ops; i++) {
+ // Random walk and make sure iter and result_iter return the
+ // same key and value
+ int type = rnd->Uniform(6);
+ ASSERT_OK(iter->status());
+ switch (type) {
+ case 0:
+ // Seek to First
+ iter->SeekToFirst();
+ result_iter->SeekToFirst();
+ break;
+ case 1:
+ // Seek to last
+ iter->SeekToLast();
+ result_iter->SeekToLast();
+ break;
+ case 2: {
+ // Seek to random key
+ auto key_idx = rnd->Uniform(static_cast<int>(source_strings.size()));
+ auto key = source_strings[key_idx];
+ iter->Seek(key);
+ result_iter->Seek(key);
+ break;
+ }
+ case 3:
+ // Next
+ if (is_valid) {
+ iter->Next();
+ result_iter->Next();
+ } else {
+ continue;
+ }
+ break;
+ case 4:
+ // Prev
+ if (is_valid) {
+ iter->Prev();
+ result_iter->Prev();
+ } else {
+ continue;
+ }
+ break;
+ default: {
+ assert(type == 5);
+ auto key_idx = rnd->Uniform(static_cast<int>(source_strings.size()));
+ auto key = source_strings[key_idx];
+ std::string result;
+ auto status = db->Get(ReadOptions(), key, &result);
+ if (map.find(key) == map.end()) {
+ ASSERT_TRUE(status.IsNotFound());
+ } else {
+ ASSERT_EQ(map[key], result);
+ }
+ break;
+ }
+ }
+ AssertItersEqual(iter.get(), result_iter.get());
+ is_valid = iter->Valid();
+ }
+}
+
+class DoubleComparator : public Comparator {
+ public:
+ DoubleComparator() {}
+
+ const char* Name() const override { return "DoubleComparator"; }
+
+ int Compare(const Slice& a, const Slice& b) const override {
+#ifndef CYGWIN
+ double da = std::stod(a.ToString());
+ double db = std::stod(b.ToString());
+#else
+ double da = std::strtod(a.ToString().c_str(), 0 /* endptr */);
+ double db = std::strtod(b.ToString().c_str(), 0 /* endptr */);
+#endif
+ if (da == db) {
+ return a.compare(b);
+ } else if (da > db) {
+ return 1;
+ } else {
+ return -1;
+ }
+ }
+ void FindShortestSeparator(std::string* /*start*/,
+ const Slice& /*limit*/) const override {}
+
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+class HashComparator : public Comparator {
+ public:
+ HashComparator() {}
+
+ const char* Name() const override { return "HashComparator"; }
+
+ int Compare(const Slice& a, const Slice& b) const override {
+ uint32_t ha = Hash(a.data(), a.size(), 66);
+ uint32_t hb = Hash(b.data(), b.size(), 66);
+ if (ha == hb) {
+ return a.compare(b);
+ } else if (ha > hb) {
+ return 1;
+ } else {
+ return -1;
+ }
+ }
+ void FindShortestSeparator(std::string* /*start*/,
+ const Slice& /*limit*/) const override {}
+
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
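+// Keys are encoded as [len1][len2][part1][part2]: two one-byte lengths
+// followed by the two string parts. Ordering compares part1 first, then
+// part2, so keys sharing part1 sort by part2.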
+class TwoStrComparator : public Comparator {
+ public:
+ TwoStrComparator() {}
+
+ const char* Name() const override { return "TwoStrComparator"; }
+
+ int Compare(const Slice& a, const Slice& b) const override {
+ assert(a.size() >= 2);
+ assert(b.size() >= 2);
+ size_t size_a1 = static_cast<size_t>(a[0]);
+ size_t size_b1 = static_cast<size_t>(b[0]);
+ size_t size_a2 = static_cast<size_t>(a[1]);
+ size_t size_b2 = static_cast<size_t>(b[1]);
+ assert(size_a1 + size_a2 + 2 == a.size());
+ assert(size_b1 + size_b2 + 2 == b.size());
+
+ Slice a1 = Slice(a.data() + 2, size_a1);
+ Slice b1 = Slice(b.data() + 2, size_b1);
+ Slice a2 = Slice(a.data() + 2 + size_a1, size_a2);
+ Slice b2 = Slice(b.data() + 2 + size_b1, size_b2);
+
+ if (a1 != b1) {
+ return a1.compare(b1);
+ }
+ return a2.compare(b2);
+ }
+ void FindShortestSeparator(std::string* /*start*/,
+ const Slice& /*limit*/) const override {}
+
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+} // namespace
+
+class ComparatorDBTest
+ : public testing::Test,
+ virtual public ::testing::WithParamInterface<uint32_t> {
+ private:
+ std::string dbname_;
+ Env* env_;
+ DB* db_;
+ Options last_options_;
+ std::unique_ptr<const Comparator> comparator_guard;
+
+ public:
+ ComparatorDBTest() : env_(Env::Default()), db_(nullptr) {
+ kTestComparator = BytewiseComparator();
+ dbname_ = test::PerThreadDBPath("comparator_db_test");
+ BlockBasedTableOptions toptions;
+ toptions.format_version = GetParam();
+ last_options_.table_factory.reset(
+ ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(toptions));
+ EXPECT_OK(DestroyDB(dbname_, last_options_));
+ }
+
+ ~ComparatorDBTest() override {
+ delete db_;
+ EXPECT_OK(DestroyDB(dbname_, last_options_));
+ kTestComparator = BytewiseComparator();
+ }
+
+ DB* GetDB() { return db_; }
+
+ void SetOwnedComparator(const Comparator* cmp, bool owner = true) {
+ if (owner) {
+ comparator_guard.reset(cmp);
+ } else {
+ comparator_guard.reset();
+ }
+ kTestComparator = cmp;
+ last_options_.comparator = cmp;
+ }
+
+ // Return the current option configuration.
+ Options* GetOptions() { return &last_options_; }
+
+ void DestroyAndReopen() {
+ // Destroy using last options
+ Destroy();
+ ASSERT_OK(TryReopen());
+ }
+
+ void Destroy() {
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(DestroyDB(dbname_, last_options_));
+ }
+
+ Status TryReopen() {
+ delete db_;
+ db_ = nullptr;
+ last_options_.create_if_missing = true;
+
+ return DB::Open(last_options_, dbname_, &db_);
+ }
+};
+
+INSTANTIATE_TEST_CASE_P(FormatDef, ComparatorDBTest,
+ testing::Values(test::kDefaultFormatVersion));
+INSTANTIATE_TEST_CASE_P(FormatLatest, ComparatorDBTest,
+ testing::Values(test::kLatestFormatVersion));
+
+TEST_P(ComparatorDBTest, Bytewise) {
+ for (int rand_seed = 301; rand_seed < 306; rand_seed++) {
+ DestroyAndReopen();
+ Random rnd(rand_seed);
+ DoRandomIteraratorTest(GetDB(),
+ {"a", "b", "c", "d", "e", "f", "g", "h", "i"}, &rnd,
+ 8, 100, 3);
+ }
+}
+
+TEST_P(ComparatorDBTest, SimpleSuffixReverseComparator) {
+ SetOwnedComparator(new test::SimpleSuffixReverseComparator());
+
+ for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+ Options* opt = GetOptions();
+ opt->comparator = kTestComparator;
+ DestroyAndReopen();
+ Random rnd(rnd_seed);
+
+ std::vector<std::string> source_strings;
+ std::vector<std::string> source_prefixes;
+ // Randomly generate 5 prefixes
+ for (int i = 0; i < 5; i++) {
+ source_prefixes.push_back(test::RandomHumanReadableString(&rnd, 8));
+ }
+ for (int j = 0; j < 20; j++) {
+ int prefix_index = rnd.Uniform(static_cast<int>(source_prefixes.size()));
+ std::string key = source_prefixes[prefix_index] +
+ test::RandomHumanReadableString(&rnd, rnd.Uniform(8));
+ source_strings.push_back(key);
+ }
+
+ DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 30, 600, 66);
+ }
+}
+
+TEST_P(ComparatorDBTest, Uint64Comparator) {
+ SetOwnedComparator(test::Uint64Comparator(), false /* owner */);
+
+ for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+ Options* opt = GetOptions();
+ opt->comparator = kTestComparator;
+ DestroyAndReopen();
+ Random rnd(rnd_seed);
+ Random64 rnd64(rnd_seed);
+
+ std::vector<std::string> source_strings;
+ // Randomly generate source keys
+ for (int i = 0; i < 100; i++) {
+ uint64_t r = rnd64.Next();
+ std::string str;
+ str.resize(8);
+ memcpy(&str[0], static_cast<void*>(&r), 8);
+ source_strings.push_back(str);
+ }
+
+ DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
+ }
+}
+
+TEST_P(ComparatorDBTest, DoubleComparator) {
+ SetOwnedComparator(new DoubleComparator());
+
+ for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+ Options* opt = GetOptions();
+ opt->comparator = kTestComparator;
+ DestroyAndReopen();
+ Random rnd(rnd_seed);
+
+ std::vector<std::string> source_strings;
+ // Randomly generate source keys
+ for (int i = 0; i < 100; i++) {
+ uint32_t r = rnd.Next();
+ uint32_t divide_order = rnd.Uniform(8);
+ double to_divide = 1.0;
+ for (uint32_t j = 0; j < divide_order; j++) {
+ to_divide *= 10.0;
+ }
+ source_strings.push_back(ToString(r / to_divide));
+ }
+
+ DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
+ }
+}
+
+TEST_P(ComparatorDBTest, HashComparator) {
+ SetOwnedComparator(new HashComparator());
+
+ for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+ Options* opt = GetOptions();
+ opt->comparator = kTestComparator;
+ DestroyAndReopen();
+ Random rnd(rnd_seed);
+
+ std::vector<std::string> source_strings;
+ // Randomly generate source keys
+ for (int i = 0; i < 100; i++) {
+ source_strings.push_back(test::RandomKey(&rnd, 8));
+ }
+
+ DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
+ }
+}
+
+TEST_P(ComparatorDBTest, TwoStrComparator) {
+ SetOwnedComparator(new TwoStrComparator());
+
+ for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+ Options* opt = GetOptions();
+ opt->comparator = kTestComparator;
+ DestroyAndReopen();
+ Random rnd(rnd_seed);
+
+ std::vector<std::string> source_strings;
+ // Randomly generate source keys
+ for (int i = 0; i < 100; i++) {
+ std::string str;
+ uint32_t size1 = rnd.Uniform(8);
+ uint32_t size2 = rnd.Uniform(8);
+ str.append(1, static_cast<char>(size1));
+ str.append(1, static_cast<char>(size2));
+ str.append(test::RandomKey(&rnd, size1));
+ str.append(test::RandomKey(&rnd, size2));
+ source_strings.push_back(str);
+ }
+
+ DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
+ }
+}
+
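+// As the cases below exercise, IsSameLengthImmediateSuccessor(s, t) is
+// expected to hold exactly when t has the same length as s and is the
+// immediate bytewise successor of s, i.e. s with its last byte incremented,
+// carrying through trailing 0xff bytes.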
+TEST_P(ComparatorDBTest, IsSameLengthImmediateSuccessor) {
+ {
+ // different length
+ Slice s("abcxy");
+ Slice t("abcxyz");
+ ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
+ }
+ {
+ Slice s("abcxyz");
+ Slice t("abcxy");
+ ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
+ }
+ {
+ // not last byte different
+ Slice s("abc1xyz");
+ Slice t("abc2xyz");
+ ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
+ }
+ {
+ // same string
+ Slice s("abcxyz");
+ Slice t("abcxyz");
+ ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
+ }
+ {
+ Slice s("abcxy");
+ Slice t("abcxz");
+ ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
+ }
+ {
+ Slice s("abcxz");
+ Slice t("abcxy");
+ ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
+ }
+ {
+ const char s_array[] = "\x50\x8a\xac";
+ const char t_array[] = "\x50\x8a\xad";
+ Slice s(s_array);
+ Slice t(t_array);
+ ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
+ }
+ {
+ const char s_array[] = "\x50\x8a\xff";
+ const char t_array[] = "\x50\x8b\x00";
+ Slice s(s_array, 3);
+ Slice t(t_array, 3);
+ ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
+ }
+ {
+ const char s_array[] = "\x50\x8a\xff\xff";
+ const char t_array[] = "\x50\x8b\x00\x00";
+ Slice s(s_array, 4);
+ Slice t(t_array, 4);
+ ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
+ }
+ {
+ const char s_array[] = "\x50\x8a\xff\xff";
+ const char t_array[] = "\x50\x8b\x00\x01";
+ Slice s(s_array, 4);
+ Slice t(t_array, 4);
+ ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
+ }
+}
+
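+// FindShortestSeparator(&start, limit) shortens *start to a key k with
+// *start <= k < limit under the comparator's ordering (the bytewise
+// inequalities flip for ReverseBytewiseComparator); it may also leave
+// *start unchanged.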
+TEST_P(ComparatorDBTest, FindShortestSeparator) {
+ std::string s1 = "abc1xyz";
+ std::string s2 = "abc3xy";
+
+ BytewiseComparator()->FindShortestSeparator(&s1, s2);
+ ASSERT_EQ("abc2", s1);
+
+ s1 = "abc5xyztt";
+
+ ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2);
+ ASSERT_EQ("abc5", s1);
+
+ s1 = "abc3";
+ s2 = "abc2xy";
+ ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2);
+ ASSERT_EQ("abc3", s1);
+
+ s1 = "abc3xyz";
+ s2 = "abc2xy";
+ ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2);
+ ASSERT_EQ("abc3", s1);
+
+ s1 = "abc3xyz";
+ s2 = "abc2";
+ ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2);
+ ASSERT_EQ("abc3", s1);
+
+ std::string old_s1 = s1 = "abc2xy";
+ s2 = "abc2";
+ ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2);
+ ASSERT_TRUE(old_s1 >= s1);
+ ASSERT_TRUE(s1 > s2);
+}
+
+TEST_P(ComparatorDBTest, SeparatorSuccessorRandomizeTest) {
+ // Char list for boundary cases.
+ std::array<unsigned char, 6> char_list{{0, 1, 2, 253, 254, 255}};
+ Random rnd(301);
+
+ for (int attempts = 0; attempts < 1000; attempts++) {
+ uint32_t size1 = rnd.Skewed(4);
+ uint32_t size2;
+
+ if (rnd.OneIn(2)) {
+ // Pick size2 at random
+ size2 = rnd.Skewed(4);
+ } else {
+ // size2 is within [-2, +2] of size1
+ int diff = static_cast<int>(rnd.Uniform(5)) - 2;
+ int tmp_size2 = static_cast<int>(size1) + diff;
+ if (tmp_size2 < 0) {
+ tmp_size2 = 0;
+ }
+ size2 = static_cast<uint32_t>(tmp_size2);
+ }
+
+ std::string s1;
+ std::string s2;
+ for (uint32_t i = 0; i < size1; i++) {
+ if (rnd.OneIn(2)) {
+ // Use random byte
+ s1 += static_cast<char>(rnd.Uniform(256));
+ } else {
+ // Use one byte in char_list
+ char c = static_cast<char>(char_list[rnd.Uniform(sizeof(char_list))]);
+ s1 += c;
+ }
+ }
+
+ // First set s2 to be the same as s1, and then modify s2.
+ s2 = s1;
+ s2.resize(size2);
+ // We start from the back of the string
+ if (size2 > 0) {
+ uint32_t pos = size2 - 1;
+ do {
+ if (pos >= size1 || rnd.OneIn(4)) {
+ // With 1/4 chance, use a random byte
+ s2[pos] = static_cast<char>(rnd.Uniform(256));
+ } else if (rnd.OneIn(4)) {
+ // With 1/4 chance, stop here.
+ break;
+ } else {
+ // Create a char within [-2, +2] of the matching char of s1.
+ int diff = static_cast<int>(rnd.Uniform(5)) - 2;
+ // char may be signed or unsigned based on platform.
+ int s1_char = static_cast<int>(static_cast<unsigned char>(s1[pos]));
+ int s2_char = s1_char + diff;
+ if (s2_char < 0) {
+ s2_char = 0;
+ }
+ if (s2_char > 255) {
+ s2_char = 255;
+ }
+ s2[pos] = static_cast<char>(s2_char);
+ }
+ } while (pos-- != 0);
+ }
+
+ // Test separators
+ for (int rev = 0; rev < 2; rev++) {
+ if (rev == 1) {
+ // switch s1 and s2
+ std::string t = s1;
+ s1 = s2;
+ s2 = t;
+ }
+ std::string separator = s1;
+ BytewiseComparator()->FindShortestSeparator(&separator, s2);
+ std::string rev_separator = s1;
+ ReverseBytewiseComparator()->FindShortestSeparator(&rev_separator, s2);
+
+ if (s1 == s2) {
+ ASSERT_EQ(s1, separator);
+ ASSERT_EQ(s2, rev_separator);
+ } else if (s1 < s2) {
+ ASSERT_TRUE(s1 <= separator);
+ ASSERT_TRUE(s2 > separator);
+ ASSERT_LE(separator.size(), std::max(s1.size(), s2.size()));
+ ASSERT_EQ(s1, rev_separator);
+ } else {
+ ASSERT_TRUE(s1 >= rev_separator);
+ ASSERT_TRUE(s2 < rev_separator);
+ ASSERT_LE(rev_separator.size(), std::max(s1.size(), s2.size()));
+ ASSERT_EQ(s1, separator);
+ }
+ }
+
+ // Test successors
+ std::string succ = s1;
+ BytewiseComparator()->FindShortSuccessor(&succ);
+ ASSERT_TRUE(succ >= s1);
+
+ succ = s1;
+ ReverseBytewiseComparator()->FindShortSuccessor(&succ);
+ ASSERT_TRUE(succ <= s1);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/convenience.cc b/src/rocksdb/db/convenience.cc
new file mode 100644
index 000000000..206f1f875
--- /dev/null
+++ b/src/rocksdb/db/convenience.cc
@@ -0,0 +1,77 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/convenience.h"
+
+#include "db/db_impl/db_impl.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void CancelAllBackgroundWork(DB* db, bool wait) {
+ (static_cast_with_check<DBImpl, DB>(db->GetRootDB()))
+ ->CancelAllBackgroundWork(wait);
+}
+
+Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end,
+ bool include_end) {
+ RangePtr range(begin, end);
+ return DeleteFilesInRanges(db, column_family, &range, 1, include_end);
+}
+
+Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family,
+ const RangePtr* ranges, size_t n,
+ bool include_end) {
+ return (static_cast_with_check<DBImpl, DB>(db->GetRootDB()))
+ ->DeleteFilesInRanges(column_family, ranges, n, include_end);
+}
+
+Status VerifySstFileChecksum(const Options& options,
+ const EnvOptions& env_options,
+ const std::string& file_path) {
+ return VerifySstFileChecksum(options, env_options, ReadOptions(), file_path);
+}
+Status VerifySstFileChecksum(const Options& options,
+ const EnvOptions& env_options,
+ const ReadOptions& read_options,
+ const std::string& file_path) {
+ std::unique_ptr<FSRandomAccessFile> file;
+ uint64_t file_size;
+ InternalKeyComparator internal_comparator(options.comparator);
+ ImmutableCFOptions ioptions(options);
+
+ Status s = ioptions.fs->NewRandomAccessFile(file_path,
+ FileOptions(env_options),
+ &file, nullptr);
+ if (s.ok()) {
+ s = ioptions.fs->GetFileSize(file_path, IOOptions(), &file_size, nullptr);
+ } else {
+ return s;
+ }
+ std::unique_ptr<TableReader> table_reader;
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ new RandomAccessFileReader(std::move(file), file_path));
+ const bool kImmortal = true;
+ s = ioptions.table_factory->NewTableReader(
+ TableReaderOptions(ioptions, options.prefix_extractor.get(), env_options,
+ internal_comparator, false /* skip_filters */,
+ !kImmortal, -1 /* level */),
+ std::move(file_reader), file_size, &table_reader,
+ false /* prefetch_index_and_filter_in_cache */);
+ if (!s.ok()) {
+ return s;
+ }
+ s = table_reader->VerifyChecksum(read_options,
+ TableReaderCaller::kUserVerifyChecksum);
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/corruption_test.cc b/src/rocksdb/db/corruption_test.cc
new file mode 100644
index 000000000..203c34fa4
--- /dev/null
+++ b/src/rocksdb/db/corruption_test.cc
@@ -0,0 +1,613 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/db.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <cinttypes>
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/log_format.h"
+#include "db/version_set.h"
+#include "env/composite_env_wrapper.h"
+#include "file/filename.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/env.h"
+#include "rocksdb/table.h"
+#include "rocksdb/write_batch.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/meta_blocks.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static const int kValueSize = 1000;
+
+class CorruptionTest : public testing::Test {
+ public:
+ test::ErrorEnv env_;
+ std::string dbname_;
+ std::shared_ptr<Cache> tiny_cache_;
+ Options options_;
+ DB* db_;
+
+ CorruptionTest() {
+ // If the LRU cache shard bit is smaller than 2 (or -1, which automatically
+ // sets it to 0), test SequenceNumberRecovery will fail, likely because of a
+ // bug in recovery code. Keep it at 4 for now so the test passes.
+ tiny_cache_ = NewLRUCache(100, 4);
+ options_.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords;
+ options_.env = &env_;
+ dbname_ = test::PerThreadDBPath("corruption_test");
+ DestroyDB(dbname_, options_);
+
+ db_ = nullptr;
+ options_.create_if_missing = true;
+ BlockBasedTableOptions table_options;
+ table_options.block_size_deviation = 0; // make unit test pass for now
+ options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen();
+ options_.create_if_missing = false;
+ }
+
+ ~CorruptionTest() override {
+ delete db_;
+ DestroyDB(dbname_, Options());
+ }
+
+ void CloseDb() {
+ delete db_;
+ db_ = nullptr;
+ }
+
+ Status TryReopen(Options* options = nullptr) {
+ delete db_;
+ db_ = nullptr;
+ Options opt = (options ? *options : options_);
+ if (opt.env == Options().env) {
+ // If env is not overridden, replace it with ErrorEnv.
+ // Otherwise, the test already uses a non-default Env.
+ opt.env = &env_;
+ }
+ opt.arena_block_size = 4096;
+ BlockBasedTableOptions table_options;
+ table_options.block_cache = tiny_cache_;
+ table_options.block_size_deviation = 0;
+ opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ return DB::Open(opt, dbname_, &db_);
+ }
+
+ void Reopen(Options* options = nullptr) {
+ ASSERT_OK(TryReopen(options));
+ }
+
+ void RepairDB() {
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(::ROCKSDB_NAMESPACE::RepairDB(dbname_, options_));
+ }
+
+ void Build(int n, int flush_every = 0) {
+ std::string key_space, value_space;
+ WriteBatch batch;
+ for (int i = 0; i < n; i++) {
+ if (flush_every != 0 && i != 0 && i % flush_every == 0) {
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_FlushMemTable();
+ }
+ //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
+ Slice key = Key(i, &key_space);
+ batch.Clear();
+ batch.Put(key, Value(i, &value_space));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+ }
+ }
+
+ void Check(int min_expected, int max_expected) {
+ uint64_t next_expected = 0;
+ uint64_t missed = 0;
+ int bad_keys = 0;
+ int bad_values = 0;
+ int correct = 0;
+ std::string value_space;
+ // Do not verify checksums. If we verify checksums then the
+ // db itself will raise errors because data is corrupted.
+ // Instead, we want the reads to be successful and this test
+ // will detect whether the appropriate corruptions have
+ // occurred.
+ Iterator* iter = db_->NewIterator(ReadOptions(false, true));
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ uint64_t key;
+ Slice in(iter->key());
+ if (!ConsumeDecimalNumber(&in, &key) ||
+ !in.empty() ||
+ key < next_expected) {
+ bad_keys++;
+ continue;
+ }
+ missed += (key - next_expected);
+ next_expected = key + 1;
+ if (iter->value() != Value(static_cast<int>(key), &value_space)) {
+ bad_values++;
+ } else {
+ correct++;
+ }
+ }
+ delete iter;
+
+ fprintf(stderr,
+ "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%llu\n",
+ min_expected, max_expected, correct, bad_keys, bad_values,
+ static_cast<unsigned long long>(missed));
+ ASSERT_LE(min_expected, correct);
+ ASSERT_GE(max_expected, correct);
+ }
+
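+ // Flips the top bit of bytes_to_corrupt bytes of fname starting at offset.
+ // A negative offset is interpreted relative to the end of the file, e.g.
+ // CorruptFile(f, -2000, 500) corrupts 500 bytes starting 2000 bytes before
+ // the end of f.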
+ void CorruptFile(const std::string& fname, int offset, int bytes_to_corrupt) {
+ struct stat sbuf;
+ if (stat(fname.c_str(), &sbuf) != 0) {
+ const char* msg = strerror(errno);
+ FAIL() << fname << ": " << msg;
+ }
+
+ if (offset < 0) {
+ // Relative to end of file; make it absolute
+ if (-offset > sbuf.st_size) {
+ offset = 0;
+ } else {
+ offset = static_cast<int>(sbuf.st_size + offset);
+ }
+ }
+ if (offset > sbuf.st_size) {
+ offset = static_cast<int>(sbuf.st_size);
+ }
+ if (offset + bytes_to_corrupt > sbuf.st_size) {
+ bytes_to_corrupt = static_cast<int>(sbuf.st_size - offset);
+ }
+
+ // Do it
+ std::string contents;
+ Status s = ReadFileToString(Env::Default(), fname, &contents);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ for (int i = 0; i < bytes_to_corrupt; i++) {
+ contents[i + offset] ^= 0x80;
+ }
+ s = WriteStringToFile(Env::Default(), contents, fname);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ Options options;
+ EnvOptions env_options;
+ options.file_system.reset(new LegacyFileSystemWrapper(options.env));
+ ASSERT_NOK(VerifySstFileChecksum(options, env_options, fname));
+ }
+
+ void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
+ // Pick file to corrupt
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_.GetChildren(dbname_, &filenames));
+ uint64_t number;
+ FileType type;
+ std::string fname;
+ int picked_number = -1;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ if (ParseFileName(filenames[i], &number, &type) &&
+ type == filetype &&
+ static_cast<int>(number) > picked_number) { // Pick latest file
+ fname = dbname_ + "/" + filenames[i];
+ picked_number = static_cast<int>(number);
+ }
+ }
+ ASSERT_TRUE(!fname.empty()) << filetype;
+
+ CorruptFile(fname, offset, bytes_to_corrupt);
+ }
+
+ // Corrupts exactly one file at level `level`. Asserts if no file is found
+ // at that level.
+ void CorruptTableFileAtLevel(int level, int offset, int bytes_to_corrupt) {
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ for (const auto& m : metadata) {
+ if (m.level == level) {
+ CorruptFile(dbname_ + "/" + m.name, offset, bytes_to_corrupt);
+ return;
+ }
+ }
+ FAIL() << "no file found at level";
+ }
+
+
+ int Property(const std::string& name) {
+ std::string property;
+ int result;
+ if (db_->GetProperty(name, &property) &&
+ sscanf(property.c_str(), "%d", &result) == 1) {
+ return result;
+ } else {
+ return -1;
+ }
+ }
+
+ // Return the ith key
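+ // e.g. Key(7, &storage) yields the 16-digit zero-padded string
+ // "0000000000000007".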
+ Slice Key(int i, std::string* storage) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%016d", i);
+ storage->assign(buf, strlen(buf));
+ return Slice(*storage);
+ }
+
+ // Return the value to associate with the specified key
+ Slice Value(int k, std::string* storage) {
+ if (k == 0) {
+ // Ugh. Random seed of 0 used to produce no entropy. This code
+ // preserves the implementation that was in place when all of the
+ // magic values in this file were picked.
+ *storage = std::string(kValueSize, ' ');
+ return Slice(*storage);
+ } else {
+ Random r(k);
+ return test::RandomString(&r, kValueSize, storage);
+ }
+ }
+};
+
+TEST_F(CorruptionTest, Recovery) {
+ Build(100);
+ Check(100, 100);
+#ifdef OS_WIN
+ // On Windows the OS disk cache does not behave properly:
+ // we do not call FlushBuffers on every Flush. If we do not close
+ // the log file prior to the corruption, we end up with only the second
+ // block corrupted, not the first. Under the debugger things work just
+ // fine, but they never pass when running normally.
+ // For that reason people may want to run with unbuffered I/O. That option
+ // is not available for the WAL though.
+ CloseDb();
+#endif
+ Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record
+ Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block
+ ASSERT_TRUE(!TryReopen().ok());
+ options_.paranoid_checks = false;
+ Reopen(&options_);
+
+ // The 64 records in the first two log blocks are completely lost.
+ Check(36, 36);
+}
+
+TEST_F(CorruptionTest, RecoverWriteError) {
+ env_.writable_file_error_ = true;
+ Status s = TryReopen();
+ ASSERT_TRUE(!s.ok());
+}
+
+TEST_F(CorruptionTest, NewFileErrorDuringWrite) {
+ // Do enough writing to force minor compaction
+ env_.writable_file_error_ = true;
+ const int num =
+ static_cast<int>(3 + (Options().write_buffer_size / kValueSize));
+ std::string value_storage;
+ Status s;
+ bool failed = false;
+ for (int i = 0; i < num; i++) {
+ WriteBatch batch;
+ batch.Put("a", Value(100, &value_storage));
+ s = db_->Write(WriteOptions(), &batch);
+ if (!s.ok()) {
+ failed = true;
+ }
+ ASSERT_TRUE(!failed || !s.ok());
+ }
+ ASSERT_TRUE(!s.ok());
+ ASSERT_GE(env_.num_writable_file_errors_, 1);
+ env_.writable_file_error_ = false;
+ Reopen();
+}
+
+TEST_F(CorruptionTest, TableFile) {
+ Build(100);
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_FlushMemTable();
+ dbi->TEST_CompactRange(0, nullptr, nullptr);
+ dbi->TEST_CompactRange(1, nullptr, nullptr);
+
+ Corrupt(kTableFile, 100, 1);
+ Check(99, 99);
+ ASSERT_NOK(dbi->VerifyChecksum());
+}
+
+TEST_F(CorruptionTest, VerifyChecksumReadahead) {
+ Options options;
+ SpecialEnv senv(Env::Default());
+ options.env = &senv;
+ // Disable block cache as we are going to check checksum for
+ // the same file twice and measure number of reads.
+ BlockBasedTableOptions table_options_no_bc;
+ table_options_no_bc.no_block_cache = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options_no_bc));
+
+ Reopen(&options);
+
+ Build(10000);
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_FlushMemTable();
+ dbi->TEST_CompactRange(0, nullptr, nullptr);
+ dbi->TEST_CompactRange(1, nullptr, nullptr);
+
+ senv.count_random_reads_ = true;
+ senv.random_read_counter_.Reset();
+ ASSERT_OK(dbi->VerifyChecksum());
+
+ // Make sure the counter is enabled.
+ ASSERT_GT(senv.random_read_counter_.Read(), 0);
+
+ // The SST file is about 10MB. Default readahead size is 256KB.
+ // Allowing a conservative 20 reads for metadata blocks, the number
+ // of random reads should be within 10MB / 256KB + 20 = 60.
+ ASSERT_LT(senv.random_read_counter_.Read(), 60);
+
+ senv.random_read_bytes_counter_ = 0;
+ ReadOptions ro;
+ ro.readahead_size = size_t{32 * 1024};
+ ASSERT_OK(dbi->VerifyChecksum(ro));
+ // The SST file is about 10MB. We set readahead size to 32KB.
+ // Allow 0 to 20 reads for metadata blocks, and allow actual reads
+ // to range from 24KB to 48KB. The lower bound would be:
+ // 10MB / 48KB + 0 = 213
+ // The higher bound is
+ // 10MB / 24KB + 20 = 447.
+ ASSERT_GE(senv.random_read_counter_.Read(), 213);
+ ASSERT_LE(senv.random_read_counter_.Read(), 447);
+
+ // Test readahead shouldn't break mmap mode (where it should be
+ // disabled).
+ options.allow_mmap_reads = true;
+ Reopen(&options);
+ dbi = static_cast<DBImpl*>(db_);
+ ASSERT_OK(dbi->VerifyChecksum(ro));
+
+ CloseDb();
+}
+
+TEST_F(CorruptionTest, TableFileIndexData) {
+ Options options;
+ // very big, we'll trigger flushes manually
+ options.write_buffer_size = 100 * 1024 * 1024;
+ Reopen(&options);
+ // build 2 tables, flush at 5000
+ Build(10000, 5000);
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_FlushMemTable();
+
+ // corrupt an index block of an entire file
+ Corrupt(kTableFile, -2000, 500);
+ options.paranoid_checks = false;
+ Reopen(&options);
+ dbi = reinterpret_cast<DBImpl*>(db_);
+ // One full file may be readable, since only one was corrupted;
+ // the other file should be fully unreadable, since its index was corrupted.
+ Check(0, 5000);
+ ASSERT_NOK(dbi->VerifyChecksum());
+
+ // In paranoid mode, the db cannot be opened due to the corrupted file.
+ ASSERT_TRUE(TryReopen().IsCorruption());
+}
+
+TEST_F(CorruptionTest, MissingDescriptor) {
+ Build(1000);
+ RepairDB();
+ Reopen();
+ Check(1000, 1000);
+}
+
+TEST_F(CorruptionTest, SequenceNumberRecovery) {
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5"));
+ RepairDB();
+ Reopen();
+ std::string v;
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("v5", v);
+ // Write something. If sequence number was not recovered properly,
+ // it will be hidden by an earlier write.
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6"));
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("v6", v);
+ Reopen();
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("v6", v);
+}
+
+TEST_F(CorruptionTest, CorruptedDescriptor) {
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_FlushMemTable();
+ dbi->TEST_CompactRange(0, nullptr, nullptr);
+
+ Corrupt(kDescriptorFile, 0, 1000);
+ Status s = TryReopen();
+ ASSERT_TRUE(!s.ok());
+
+ RepairDB();
+ Reopen();
+ std::string v;
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("hello", v);
+}
+
+TEST_F(CorruptionTest, CompactionInputError) {
+ Options options;
+ Reopen(&options);
+ Build(10);
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_FlushMemTable();
+ dbi->TEST_CompactRange(0, nullptr, nullptr);
+ dbi->TEST_CompactRange(1, nullptr, nullptr);
+ ASSERT_EQ(1, Property("rocksdb.num-files-at-level2"));
+
+ Corrupt(kTableFile, 100, 1);
+ Check(9, 9);
+ ASSERT_NOK(dbi->VerifyChecksum());
+
+ // Force compactions by writing lots of values
+ Build(10000);
+ Check(10000, 10000);
+ ASSERT_NOK(dbi->VerifyChecksum());
+}
+
+TEST_F(CorruptionTest, CompactionInputErrorParanoid) {
+ Options options;
+ options.paranoid_checks = true;
+ options.write_buffer_size = 131072;
+ options.max_write_buffer_number = 2;
+ Reopen(&options);
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+
+ // Fill levels >= 1
+ for (int level = 1; level < dbi->NumberLevels(); level++) {
+ dbi->Put(WriteOptions(), "", "begin");
+ dbi->Put(WriteOptions(), "~", "end");
+ dbi->TEST_FlushMemTable();
+ for (int comp_level = 0; comp_level < dbi->NumberLevels() - level;
+ ++comp_level) {
+ dbi->TEST_CompactRange(comp_level, nullptr, nullptr);
+ }
+ }
+
+ Reopen(&options);
+
+ dbi = reinterpret_cast<DBImpl*>(db_);
+ Build(10);
+ dbi->TEST_FlushMemTable();
+ dbi->TEST_WaitForCompact();
+ ASSERT_EQ(1, Property("rocksdb.num-files-at-level0"));
+
+ CorruptTableFileAtLevel(0, 100, 1);
+ Check(9, 9);
+ ASSERT_NOK(dbi->VerifyChecksum());
+
+ // Write must eventually fail because of corrupted table
+ Status s;
+ std::string tmp1, tmp2;
+ bool failed = false;
+ for (int i = 0; i < 10000; i++) {
+ s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
+ if (!s.ok()) {
+ failed = true;
+ }
+ // if one write failed, every subsequent write must fail, too
+ ASSERT_TRUE(!failed || !s.ok()) << "write did not fail in a corrupted db";
+ }
+ ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
+}
+
+TEST_F(CorruptionTest, UnrelatedKeys) {
+ Build(10);
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_FlushMemTable();
+ Corrupt(kTableFile, 100, 1);
+ ASSERT_NOK(dbi->VerifyChecksum());
+
+ std::string tmp1, tmp2;
+ ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
+ std::string v;
+ ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
+ ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
+ dbi->TEST_FlushMemTable();
+ ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
+ ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
+}
+
+TEST_F(CorruptionTest, RangeDeletionCorrupted) {
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(static_cast<size_t>(1), metadata.size());
+ std::string filename = dbname_ + metadata[0].name;
+
+ std::unique_ptr<RandomAccessFile> file;
+ ASSERT_OK(options_.env->NewRandomAccessFile(filename, &file, EnvOptions()));
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(file),
+ filename));
+
+ uint64_t file_size;
+ ASSERT_OK(options_.env->GetFileSize(filename, &file_size));
+
+ BlockHandle range_del_handle;
+ ASSERT_OK(FindMetaBlock(
+ file_reader.get(), file_size, kBlockBasedTableMagicNumber,
+ ImmutableCFOptions(options_), kRangeDelBlock, &range_del_handle));
+
+ ASSERT_OK(TryReopen());
+ CorruptFile(filename, static_cast<int>(range_del_handle.offset()), 1);
+ ASSERT_TRUE(TryReopen().IsCorruption());
+}
+
+TEST_F(CorruptionTest, FileSystemStateCorrupted) {
+ for (int iter = 0; iter < 2; ++iter) {
+ Options options;
+ options.paranoid_checks = true;
+ options.create_if_missing = true;
+ Reopen(&options);
+ Build(10);
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ std::vector<LiveFileMetaData> metadata;
+ dbi->GetLiveFilesMetaData(&metadata);
+ ASSERT_GT(metadata.size(), size_t(0));
+ std::string filename = dbname_ + metadata[0].name;
+
+ delete db_;
+ db_ = nullptr;
+
+ if (iter == 0) { // corrupt file size
+ std::unique_ptr<WritableFile> file;
+ env_.NewWritableFile(filename, &file, EnvOptions());
+ file->Append(Slice("corrupted sst"));
+ file.reset();
+ Status x = TryReopen(&options);
+ ASSERT_TRUE(x.IsCorruption());
+ } else { // delete the file
+ env_.DeleteFile(filename);
+ Status x = TryReopen(&options);
+ ASSERT_TRUE(x.IsPathNotFound());
+ }
+
+ DestroyDB(dbname_, options_);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as RepairDB() is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/cuckoo_table_db_test.cc b/src/rocksdb/db/cuckoo_table_db_test.cc
new file mode 100644
index 000000000..9467840ff
--- /dev/null
+++ b/src/rocksdb/db/cuckoo_table_db_test.cc
@@ -0,0 +1,351 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/db_impl/db_impl.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "table/cuckoo/cuckoo_table_factory.h"
+#include "table/cuckoo/cuckoo_table_reader.h"
+#include "table/meta_blocks.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CuckooTableDBTest : public testing::Test {
+ private:
+ std::string dbname_;
+ Env* env_;
+ DB* db_;
+
+ public:
+ CuckooTableDBTest() : env_(Env::Default()) {
+ dbname_ = test::PerThreadDBPath("cuckoo_table_db_test");
+ EXPECT_OK(DestroyDB(dbname_, Options()));
+ db_ = nullptr;
+ Reopen();
+ }
+
+ ~CuckooTableDBTest() override {
+ delete db_;
+ EXPECT_OK(DestroyDB(dbname_, Options()));
+ }
+
+ Options CurrentOptions() {
+ Options options;
+ options.table_factory.reset(NewCuckooTableFactory());
+ options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true));
+ options.allow_mmap_reads = true;
+ options.create_if_missing = true;
+ options.allow_concurrent_memtable_write = false;
+ return options;
+ }
+
+ DBImpl* dbfull() {
+ return reinterpret_cast<DBImpl*>(db_);
+ }
+
+ // The following util methods are copied from plain_table_db_test.
+ void Reopen(Options* options = nullptr) {
+ delete db_;
+ db_ = nullptr;
+ Options opts;
+ if (options != nullptr) {
+ opts = *options;
+ } else {
+ opts = CurrentOptions();
+ opts.create_if_missing = true;
+ }
+ ASSERT_OK(DB::Open(opts, dbname_, &db_));
+ }
+
+ Status Put(const Slice& k, const Slice& v) {
+ return db_->Put(WriteOptions(), k, v);
+ }
+
+ Status Delete(const std::string& k) {
+ return db_->Delete(WriteOptions(), k);
+ }
+
+ std::string Get(const std::string& k) {
+ ReadOptions options;
+ std::string result;
+ Status s = db_->Get(options, k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+
+ int NumTableFilesAtLevel(int level) {
+ std::string property;
+ EXPECT_TRUE(db_->GetProperty(
+ "rocksdb.num-files-at-level" + NumberToString(level), &property));
+ return atoi(property.c_str());
+ }
+
+ // Return spread of files per level
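+ // e.g. "1" means a single L0 file and nothing below; "0,2" means no L0
+ // files and two L1 files (trailing empty levels are trimmed).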
+ std::string FilesPerLevel() {
+ std::string result;
+ size_t last_non_zero_offset = 0;
+ for (int level = 0; level < db_->NumberLevels(); level++) {
+ int f = NumTableFilesAtLevel(level);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+ result += buf;
+ if (f > 0) {
+ last_non_zero_offset = result.size();
+ }
+ }
+ result.resize(last_non_zero_offset);
+ return result;
+ }
+};
+
+TEST_F(CuckooTableDBTest, Flush) {
+ // Try with empty DB first.
+ ASSERT_TRUE(dbfull() != nullptr);
+ ASSERT_EQ("NOT_FOUND", Get("key2"));
+
+ // Add some values to db.
+ Options options = CurrentOptions();
+ Reopen(&options);
+
+ ASSERT_OK(Put("key1", "v1"));
+ ASSERT_OK(Put("key2", "v2"));
+ ASSERT_OK(Put("key3", "v3"));
+ dbfull()->TEST_FlushMemTable();
+
+ TablePropertiesCollection ptc;
+ reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
+ ASSERT_EQ(1U, ptc.size());
+ ASSERT_EQ(3U, ptc.begin()->second->num_entries);
+ ASSERT_EQ("1", FilesPerLevel());
+
+ ASSERT_EQ("v1", Get("key1"));
+ ASSERT_EQ("v2", Get("key2"));
+ ASSERT_EQ("v3", Get("key3"));
+ ASSERT_EQ("NOT_FOUND", Get("key4"));
+
+ // Now add more keys and flush.
+ ASSERT_OK(Put("key4", "v4"));
+ ASSERT_OK(Put("key5", "v5"));
+ ASSERT_OK(Put("key6", "v6"));
+ dbfull()->TEST_FlushMemTable();
+
+ reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
+ ASSERT_EQ(2U, ptc.size());
+ auto row = ptc.begin();
+ ASSERT_EQ(3U, row->second->num_entries);
+ ASSERT_EQ(3U, (++row)->second->num_entries);
+ ASSERT_EQ("2", FilesPerLevel());
+ ASSERT_EQ("v1", Get("key1"));
+ ASSERT_EQ("v2", Get("key2"));
+ ASSERT_EQ("v3", Get("key3"));
+ ASSERT_EQ("v4", Get("key4"));
+ ASSERT_EQ("v5", Get("key5"));
+ ASSERT_EQ("v6", Get("key6"));
+
+ ASSERT_OK(Delete("key6"));
+ ASSERT_OK(Delete("key5"));
+ ASSERT_OK(Delete("key4"));
+ dbfull()->TEST_FlushMemTable();
+ reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
+ ASSERT_EQ(3U, ptc.size());
+ row = ptc.begin();
+ ASSERT_EQ(3U, row->second->num_entries);
+ ASSERT_EQ(3U, (++row)->second->num_entries);
+ ASSERT_EQ(3U, (++row)->second->num_entries);
+ ASSERT_EQ("3", FilesPerLevel());
+ ASSERT_EQ("v1", Get("key1"));
+ ASSERT_EQ("v2", Get("key2"));
+ ASSERT_EQ("v3", Get("key3"));
+ ASSERT_EQ("NOT_FOUND", Get("key4"));
+ ASSERT_EQ("NOT_FOUND", Get("key5"));
+ ASSERT_EQ("NOT_FOUND", Get("key6"));
+}
+
+TEST_F(CuckooTableDBTest, FlushWithDuplicateKeys) {
+ Options options = CurrentOptions();
+ Reopen(&options);
+ ASSERT_OK(Put("key1", "v1"));
+ ASSERT_OK(Put("key2", "v2"));
+ ASSERT_OK(Put("key1", "v3")); // Duplicate
+ dbfull()->TEST_FlushMemTable();
+
+ TablePropertiesCollection ptc;
+ reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
+ ASSERT_EQ(1U, ptc.size());
+ ASSERT_EQ(2U, ptc.begin()->second->num_entries);
+ ASSERT_EQ("1", FilesPerLevel());
+ ASSERT_EQ("v3", Get("key1"));
+ ASSERT_EQ("v2", Get("key2"));
+}
+
+namespace {
+static std::string Key(int i) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "key_______%06d", i);
+ return std::string(buf);
+}
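+// Uint64Key(i) returns the raw 8-byte in-memory representation of i, the
+// fixed-width key format expected by the test::Uint64Comparator used below.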
+static std::string Uint64Key(uint64_t i) {
+ std::string str;
+ str.resize(8);
+ memcpy(&str[0], static_cast<void*>(&i), 8);
+ return str;
+}
+} // namespace.
+
+TEST_F(CuckooTableDBTest, Uint64Comparator) {
+ Options options = CurrentOptions();
+ options.comparator = test::Uint64Comparator();
+ Reopen(&options);
+
+ ASSERT_OK(Put(Uint64Key(1), "v1"));
+ ASSERT_OK(Put(Uint64Key(2), "v2"));
+ ASSERT_OK(Put(Uint64Key(3), "v3"));
+ dbfull()->TEST_FlushMemTable();
+
+ ASSERT_EQ("v1", Get(Uint64Key(1)));
+ ASSERT_EQ("v2", Get(Uint64Key(2)));
+ ASSERT_EQ("v3", Get(Uint64Key(3)));
+ ASSERT_EQ("NOT_FOUND", Get(Uint64Key(4)));
+
+ // Add more keys.
+ ASSERT_OK(Delete(Uint64Key(2))); // Delete.
+ dbfull()->TEST_FlushMemTable();
+ ASSERT_OK(Put(Uint64Key(3), "v0")); // Update.
+ ASSERT_OK(Put(Uint64Key(4), "v4"));
+ dbfull()->TEST_FlushMemTable();
+ ASSERT_EQ("v1", Get(Uint64Key(1)));
+ ASSERT_EQ("NOT_FOUND", Get(Uint64Key(2)));
+ ASSERT_EQ("v0", Get(Uint64Key(3)));
+ ASSERT_EQ("v4", Get(Uint64Key(4)));
+}
+
+TEST_F(CuckooTableDBTest, CompactionIntoMultipleFiles) {
+ // Create a big L0 file and check it compacts into multiple files in L1.
+ Options options = CurrentOptions();
+ options.write_buffer_size = 270 << 10;
+ // Two SST files should be created, each containing 14 keys.
+ // Number of buckets will be 16. Total size ~156 KB.
+ options.target_file_size_base = 160 << 10;
+ Reopen(&options);
+
+ // Write 28 values, each 10016 B ~ 10KB
+ for (int idx = 0; idx < 28; ++idx) {
+ ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + char(idx))));
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ("1", FilesPerLevel());
+
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow trivial move */);
+ ASSERT_EQ("0,2", FilesPerLevel());
+ for (int idx = 0; idx < 28; ++idx) {
+ ASSERT_EQ(std::string(10000, 'a' + char(idx)), Get(Key(idx)));
+ }
+}
+
+TEST_F(CuckooTableDBTest, SameKeyInsertedInTwoDifferentFilesAndCompacted) {
+ // Insert same key twice so that they go to different SST files. Then wait for
+ // compaction and check if the latest value is stored and old value removed.
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.level0_file_num_compaction_trigger = 2;
+ Reopen(&options);
+
+ // Write 11 values, each 10016 B
+ for (int idx = 0; idx < 11; ++idx) {
+ ASSERT_OK(Put(Key(idx), std::string(10000, 'a')));
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ("1", FilesPerLevel());
+
+ // Generate one more file in level-0, and should trigger level-0 compaction
+ for (int idx = 0; idx < 11; ++idx) {
+ ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + char(idx))));
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+
+ ASSERT_EQ("0,1", FilesPerLevel());
+ for (int idx = 0; idx < 11; ++idx) {
+ ASSERT_EQ(std::string(10000, 'a' + char(idx)), Get(Key(idx)));
+ }
+}
+
+TEST_F(CuckooTableDBTest, AdaptiveTable) {
+ Options options = CurrentOptions();
+
+ // Ensure options compatible with PlainTable
+ options.prefix_extractor.reset(NewCappedPrefixTransform(8));
+
+ // Write some keys using cuckoo table.
+ options.table_factory.reset(NewCuckooTableFactory());
+ Reopen(&options);
+
+ ASSERT_OK(Put("key1", "v1"));
+ ASSERT_OK(Put("key2", "v2"));
+ ASSERT_OK(Put("key3", "v3"));
+ dbfull()->TEST_FlushMemTable();
+
+ // Write some keys using plain table.
+ std::shared_ptr<TableFactory> block_based_factory(
+ NewBlockBasedTableFactory());
+ std::shared_ptr<TableFactory> plain_table_factory(
+ NewPlainTableFactory());
+ std::shared_ptr<TableFactory> cuckoo_table_factory(
+ NewCuckooTableFactory());
+ options.create_if_missing = false;
+ options.table_factory.reset(NewAdaptiveTableFactory(
+ plain_table_factory, block_based_factory, plain_table_factory,
+ cuckoo_table_factory));
+ Reopen(&options);
+ ASSERT_OK(Put("key4", "v4"));
+ ASSERT_OK(Put("key1", "v5"));
+ dbfull()->TEST_FlushMemTable();
+
+ // Write some keys using block based table.
+ options.table_factory.reset(NewAdaptiveTableFactory(
+ block_based_factory, block_based_factory, plain_table_factory,
+ cuckoo_table_factory));
+ Reopen(&options);
+ ASSERT_OK(Put("key5", "v6"));
+ ASSERT_OK(Put("key2", "v7"));
+ dbfull()->TEST_FlushMemTable();
+
+ ASSERT_EQ("v5", Get("key1"));
+ ASSERT_EQ("v7", Get("key2"));
+ ASSERT_EQ("v3", Get("key3"));
+ ASSERT_EQ("v4", Get("key4"));
+ ASSERT_EQ("v6", Get("key5"));
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ if (ROCKSDB_NAMESPACE::port::kLittleEndian) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+ } else {
+ fprintf(stderr, "SKIPPED as Cuckoo table doesn't support Big Endian\n");
+ return 0;
+ }
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_basic_test.cc b/src/rocksdb/db/db_basic_test.cc
new file mode 100644
index 000000000..7573a01b4
--- /dev/null
+++ b/src/rocksdb/db/db_basic_test.cc
@@ -0,0 +1,2545 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/utilities/debug.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/block_builder.h"
+#include "test_util/fault_injection_test_env.h"
+#if !defined(ROCKSDB_LITE)
+#include "test_util/sync_point.h"
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBBasicTest : public DBTestBase {
+ public:
+ DBBasicTest() : DBTestBase("/db_basic_test") {}
+};
+
+TEST_F(DBBasicTest, OpenWhenOpen) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ ROCKSDB_NAMESPACE::DB* db2 = nullptr;
+ ROCKSDB_NAMESPACE::Status s = DB::Open(options, dbname_, &db2);
+
+ ASSERT_EQ(Status::Code::kIOError, s.code());
+ ASSERT_EQ(Status::SubCode::kNone, s.subcode());
+ ASSERT_TRUE(strstr(s.getState(), "lock ") != nullptr);
+
+ delete db2;
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBasicTest, ReadOnlyDB) {
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Put("bar", "v2"));
+ ASSERT_OK(Put("foo", "v3"));
+ Close();
+
+ auto options = CurrentOptions();
+ assert(options.env == env_);
+ ASSERT_OK(ReadOnlyReopen(options));
+ ASSERT_EQ("v3", Get("foo"));
+ ASSERT_EQ("v2", Get("bar"));
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ int count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ ++count;
+ }
+ ASSERT_EQ(count, 2);
+ delete iter;
+ Close();
+
+ // Reopen and flush memtable.
+ Reopen(options);
+ Flush();
+ Close();
+ // Now check keys in read only mode.
+ ASSERT_OK(ReadOnlyReopen(options));
+ ASSERT_EQ("v3", Get("foo"));
+ ASSERT_EQ("v2", Get("bar"));
+ ASSERT_TRUE(db_->SyncWAL().IsNotSupported());
+}
+
+TEST_F(DBBasicTest, ReadOnlyDBWithWriteDBIdToManifestSet) {
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Put("bar", "v2"));
+ ASSERT_OK(Put("foo", "v3"));
+ Close();
+
+ auto options = CurrentOptions();
+ options.write_dbid_to_manifest = true;
+ assert(options.env == env_);
+ ASSERT_OK(ReadOnlyReopen(options));
+ std::string db_id1;
+ db_->GetDbIdentity(db_id1);
+ ASSERT_EQ("v3", Get("foo"));
+ ASSERT_EQ("v2", Get("bar"));
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ int count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ ++count;
+ }
+ ASSERT_EQ(count, 2);
+ delete iter;
+ Close();
+
+ // Reopen and flush memtable.
+ Reopen(options);
+ Flush();
+ Close();
+ // Now check keys in read only mode.
+ ASSERT_OK(ReadOnlyReopen(options));
+ ASSERT_EQ("v3", Get("foo"));
+ ASSERT_EQ("v2", Get("bar"));
+ ASSERT_TRUE(db_->SyncWAL().IsNotSupported());
+ std::string db_id2;
+ db_->GetDbIdentity(db_id2);
+ ASSERT_EQ(db_id1, db_id2);
+}
+
+TEST_F(DBBasicTest, CompactedDB) {
+ const uint64_t kFileSize = 1 << 20;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = kFileSize;
+ options.target_file_size_base = kFileSize;
+ options.max_bytes_for_level_base = 1 << 30;
+ options.compression = kNoCompression;
+ Reopen(options);
+ // 1 L0 file, use CompactedDB if max_open_files = -1
+ ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, '1')));
+ Flush();
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ Status s = Put("new", "value");
+ ASSERT_EQ(s.ToString(),
+ "Not implemented: Not supported operation in read only mode.");
+ ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa"));
+ Close();
+ options.max_open_files = -1;
+ ASSERT_OK(ReadOnlyReopen(options));
+ s = Put("new", "value");
+ ASSERT_EQ(s.ToString(),
+ "Not implemented: Not supported in compacted db mode.");
+ ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa"));
+ Close();
+ Reopen(options);
+ // Add more L0 files
+ ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, '2')));
+ Flush();
+ ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, 'a')));
+ Flush();
+ ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, 'b')));
+ ASSERT_OK(Put("eee", DummyString(kFileSize / 2, 'e')));
+ Flush();
+ Close();
+
+ ASSERT_OK(ReadOnlyReopen(options));
+ // Fallback to read-only DB
+ s = Put("new", "value");
+ ASSERT_EQ(s.ToString(),
+ "Not implemented: Not supported operation in read only mode.");
+ Close();
+
+ // Full compaction
+ Reopen(options);
+ // Add more keys
+ ASSERT_OK(Put("fff", DummyString(kFileSize / 2, 'f')));
+ ASSERT_OK(Put("hhh", DummyString(kFileSize / 2, 'h')));
+ ASSERT_OK(Put("iii", DummyString(kFileSize / 2, 'i')));
+ ASSERT_OK(Put("jjj", DummyString(kFileSize / 2, 'j')));
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(3, NumTableFilesAtLevel(1));
+ Close();
+
+ // CompactedDB
+ ASSERT_OK(ReadOnlyReopen(options));
+ s = Put("new", "value");
+ ASSERT_EQ(s.ToString(),
+ "Not implemented: Not supported in compacted db mode.");
+ ASSERT_EQ("NOT_FOUND", Get("abc"));
+ ASSERT_EQ(DummyString(kFileSize / 2, 'a'), Get("aaa"));
+ ASSERT_EQ(DummyString(kFileSize / 2, 'b'), Get("bbb"));
+ ASSERT_EQ("NOT_FOUND", Get("ccc"));
+ ASSERT_EQ(DummyString(kFileSize / 2, 'e'), Get("eee"));
+ ASSERT_EQ(DummyString(kFileSize / 2, 'f'), Get("fff"));
+ ASSERT_EQ("NOT_FOUND", Get("ggg"));
+ ASSERT_EQ(DummyString(kFileSize / 2, 'h'), Get("hhh"));
+ ASSERT_EQ(DummyString(kFileSize / 2, 'i'), Get("iii"));
+ ASSERT_EQ(DummyString(kFileSize / 2, 'j'), Get("jjj"));
+ ASSERT_EQ("NOT_FOUND", Get("kkk"));
+
+ // MultiGet
+ std::vector<std::string> values;
+ std::vector<Status> status_list = dbfull()->MultiGet(
+ ReadOptions(),
+ std::vector<Slice>({Slice("aaa"), Slice("ccc"), Slice("eee"),
+ Slice("ggg"), Slice("iii"), Slice("kkk")}),
+ &values);
+ ASSERT_EQ(status_list.size(), static_cast<uint64_t>(6));
+ ASSERT_EQ(values.size(), static_cast<uint64_t>(6));
+ ASSERT_OK(status_list[0]);
+ ASSERT_EQ(DummyString(kFileSize / 2, 'a'), values[0]);
+ ASSERT_TRUE(status_list[1].IsNotFound());
+ ASSERT_OK(status_list[2]);
+ ASSERT_EQ(DummyString(kFileSize / 2, 'e'), values[2]);
+ ASSERT_TRUE(status_list[3].IsNotFound());
+ ASSERT_OK(status_list[4]);
+ ASSERT_EQ(DummyString(kFileSize / 2, 'i'), values[4]);
+ ASSERT_TRUE(status_list[5].IsNotFound());
+
+ Reopen(options);
+ // Add a key
+ ASSERT_OK(Put("fff", DummyString(kFileSize / 2, 'f')));
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ s = Put("new", "value");
+ ASSERT_EQ(s.ToString(),
+ "Not implemented: Not supported operation in read only mode.");
+}
+
+TEST_F(DBBasicTest, LevelLimitReopen) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const std::string value(1024 * 1024, ' ');
+ int i = 0;
+ while (NumTableFilesAtLevel(2, 1) == 0) {
+ ASSERT_OK(Put(1, Key(i++), value));
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ }
+
+ options.num_levels = 1;
+ options.max_bytes_for_level_multiplier_additional.resize(1, 1);
+ Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ(s.IsInvalidArgument(), true);
+ ASSERT_EQ(s.ToString(),
+ "Invalid argument: db has more levels than options.num_levels");
+
+ options.num_levels = 10;
+ options.max_bytes_for_level_multiplier_additional.resize(10, 1);
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBBasicTest, PutDeleteGet) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_OK(Put(1, "foo", "v2"));
+ ASSERT_EQ("v2", Get(1, "foo"));
+ ASSERT_OK(Delete(1, "foo"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+ } while (ChangeOptions());
+}
+
+TEST_F(DBBasicTest, PutSingleDeleteGet) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_OK(Put(1, "foo2", "v2"));
+ ASSERT_EQ("v2", Get(1, "foo2"));
+ ASSERT_OK(SingleDelete(1, "foo"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+ // Skip FIFO and universal compaction because they do not apply to the test
+ // case. Skip MergePut because single delete does not get removed when it
+ // encounters a merge.
+ } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+ kSkipMergePut));
+}
+
+TEST_F(DBBasicTest, EmptyFlush) {
+ // It is possible to produce empty flushes when using single deletes. Tests
+ // whether empty flushes cause issues.
+ do {
+ Random rnd(301);
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Put(1, "a", Slice());
+ SingleDelete(1, "a");
+ ASSERT_OK(Flush(1));
+
+ ASSERT_EQ("[ ]", AllEntriesFor("a", 1));
+ // Skip FIFO and universal compaction as they do not apply to the test
+ // case. Skip MergePut because merges cannot be combined with single
+ // deletions.
+ } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+ kSkipMergePut));
+}
+
+TEST_F(DBBasicTest, GetFromVersions) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
+ } while (ChangeOptions());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBasicTest, GetSnapshot) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
+ // Try with both a short key and a long key
+ for (int i = 0; i < 2; i++) {
+ std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x');
+ ASSERT_OK(Put(1, key, "v1"));
+ const Snapshot* s1 = db_->GetSnapshot();
+ ASSERT_OK(Put(1, key, "v2"));
+ ASSERT_EQ("v2", Get(1, key));
+ ASSERT_EQ("v1", Get(1, key, s1));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ("v2", Get(1, key));
+ ASSERT_EQ("v1", Get(1, key, s1));
+ db_->ReleaseSnapshot(s1);
+ }
+ } while (ChangeOptions());
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBBasicTest, CheckLock) {
+ do {
+ DB* localdb;
+ Options options = CurrentOptions();
+ ASSERT_OK(TryReopen(options));
+
+ // second open should fail
+ ASSERT_TRUE(!(DB::Open(options, dbname_, &localdb)).ok());
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, FlushMultipleMemtable) {
+ do {
+ Options options = CurrentOptions();
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 3;
+ options.max_write_buffer_size_to_maintain = -1;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v1", Get(1, "bar"));
+ ASSERT_OK(Flush(1));
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, FlushEmptyColumnFamily) {
+ // Block flush thread and disable compaction thread
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ env_->SetBackgroundThreads(1, Env::LOW);
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ test::SleepingBackgroundTask sleeping_task_high;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_high, Env::Priority::HIGH);
+
+ Options options = CurrentOptions();
+ // disable compaction
+ options.disable_auto_compactions = true;
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ options.max_write_buffer_number = 2;
+ options.min_write_buffer_number_to_merge = 1;
+ options.max_write_buffer_size_to_maintain =
+ static_cast<int64_t>(options.write_buffer_size);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Flushing the empty column families can still go through even if no
+ // thread can flush the mem table.
+ ASSERT_OK(Flush(0));
+ ASSERT_OK(Flush(1));
+
+ // Insert can go through
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[0], "foo", "v1"));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+ ASSERT_EQ("v1", Get(0, "foo"));
+ ASSERT_EQ("v1", Get(1, "bar"));
+
+ sleeping_task_high.WakeUp();
+ sleeping_task_high.WaitUntilDone();
+
+ // Flush can still go through.
+ ASSERT_OK(Flush(0));
+ ASSERT_OK(Flush(1));
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBBasicTest, FLUSH) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ SetPerfLevel(kEnableTime);
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+ // Flush the write above into an SST file so the Get below reads from
+ // table files.
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+ get_perf_context()->Reset();
+ Get(1, "foo");
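+ // A nonzero get_from_output_files_time shows the Get went to the SST file
+ // written by the flush above; get_read_bytes counts the two bytes of "v1".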
+ ASSERT_TRUE((int)get_perf_context()->get_from_output_files_time > 0);
+ ASSERT_EQ(2, (int)get_perf_context()->get_read_bytes);
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v1", Get(1, "bar"));
+
+ writeOpt.disableWAL = true;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2"));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2"));
+ ASSERT_OK(Flush(1));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("v2", Get(1, "bar"));
+ get_perf_context()->Reset();
+ ASSERT_EQ("v2", Get(1, "foo"));
+ ASSERT_TRUE((int)get_perf_context()->get_from_output_files_time > 0);
+
+ writeOpt.disableWAL = false;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3"));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3"));
+ ASSERT_OK(Flush(1));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ // Both keys should be there: their latest puts had the WAL enabled and
+ // were also flushed before the reopen.
+ ASSERT_EQ("v3", Get(1, "foo"));
+ ASSERT_EQ("v3", Get(1, "bar"));
+
+ SetPerfLevel(kDisable);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, ManifestRollOver) {
+ do {
+ Options options;
+ options.max_manifest_file_size = 10; // 10 bytes
+ options = CurrentOptions(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ {
+ ASSERT_OK(Put(1, "manifest_key1", std::string(1000, '1')));
+ ASSERT_OK(Put(1, "manifest_key2", std::string(1000, '2')));
+ ASSERT_OK(Put(1, "manifest_key3", std::string(1000, '3')));
+ uint64_t manifest_before_flush = dbfull()->TEST_Current_Manifest_FileNo();
+ ASSERT_OK(Flush(1)); // This should trigger LogAndApply.
+ uint64_t manifest_after_flush = dbfull()->TEST_Current_Manifest_FileNo();
+ ASSERT_GT(manifest_after_flush, manifest_before_flush);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), manifest_after_flush);
+ // The data written before the manifest roll-overs should still be readable.
+ ASSERT_EQ(std::string(1000, '1'), Get(1, "manifest_key1"));
+ ASSERT_EQ(std::string(1000, '2'), Get(1, "manifest_key2"));
+ ASSERT_EQ(std::string(1000, '3'), Get(1, "manifest_key3"));
+ }
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, IdentityAcrossRestarts1) {
+ do {
+ std::string id1;
+ ASSERT_OK(db_->GetDbIdentity(id1));
+
+ Options options = CurrentOptions();
+ Reopen(options);
+ std::string id2;
+ ASSERT_OK(db_->GetDbIdentity(id2));
+ // id1 should match id2 because identity was not regenerated
+ ASSERT_EQ(id1.compare(id2), 0);
+
+ std::string idfilename = IdentityFileName(dbname_);
+ ASSERT_OK(env_->DeleteFile(idfilename));
+ Reopen(options);
+ std::string id3;
+ ASSERT_OK(db_->GetDbIdentity(id3));
+ if (options.write_dbid_to_manifest) {
+ ASSERT_EQ(id1.compare(id3), 0);
+ } else {
+ // id1 should NOT match id3 because identity was regenerated
+ ASSERT_NE(id1.compare(id3), 0);
+ }
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, IdentityAcrossRestarts2) {
+ do {
+ std::string id1;
+ ASSERT_OK(db_->GetDbIdentity(id1));
+
+ Options options = CurrentOptions();
+ options.write_dbid_to_manifest = true;
+ Reopen(options);
+ std::string id2;
+ ASSERT_OK(db_->GetDbIdentity(id2));
+ // id1 should match id2 because identity was not regenerated
+ ASSERT_EQ(id1.compare(id2), 0);
+
+ std::string idfilename = IdentityFileName(dbname_);
+ ASSERT_OK(env_->DeleteFile(idfilename));
+ Reopen(options);
+ std::string id3;
+ ASSERT_OK(db_->GetDbIdentity(id3));
+ // id1 should still match id3 because, with write_dbid_to_manifest set, the
+ // identity is recovered from the MANIFEST even after the IDENTITY file is
+ // deleted.
+ ASSERT_EQ(id1, id3);
+ } while (ChangeCompactOptions());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBasicTest, Snapshot) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
+ Put(0, "foo", "0v1");
+ Put(1, "foo", "1v1");
+
+ const Snapshot* s1 = db_->GetSnapshot();
+ ASSERT_EQ(1U, GetNumSnapshots());
+ uint64_t time_snap1 = GetTimeOldestSnapshots();
+ ASSERT_GT(time_snap1, 0U);
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+ Put(0, "foo", "0v2");
+ Put(1, "foo", "1v2");
+
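+ // Advance the mock clock so snapshots taken from here on get a later
+ // creation time than s1; the ASSERT_LT on GetTimeOldestSnapshots() after
+ // s1 is released depends on this.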
+ env_->addon_time_.fetch_add(1);
+
+ const Snapshot* s2 = db_->GetSnapshot();
+ ASSERT_EQ(2U, GetNumSnapshots());
+ ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+ Put(0, "foo", "0v3");
+ Put(1, "foo", "1v3");
+
+ {
+ ManagedSnapshot s3(db_);
+ ASSERT_EQ(3U, GetNumSnapshots());
+ ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+
+ Put(0, "foo", "0v4");
+ Put(1, "foo", "1v4");
+ ASSERT_EQ("0v1", Get(0, "foo", s1));
+ ASSERT_EQ("1v1", Get(1, "foo", s1));
+ ASSERT_EQ("0v2", Get(0, "foo", s2));
+ ASSERT_EQ("1v2", Get(1, "foo", s2));
+ ASSERT_EQ("0v3", Get(0, "foo", s3.snapshot()));
+ ASSERT_EQ("1v3", Get(1, "foo", s3.snapshot()));
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+ }
+
+ ASSERT_EQ(2U, GetNumSnapshots());
+ ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+ ASSERT_EQ("0v1", Get(0, "foo", s1));
+ ASSERT_EQ("1v1", Get(1, "foo", s1));
+ ASSERT_EQ("0v2", Get(0, "foo", s2));
+ ASSERT_EQ("1v2", Get(1, "foo", s2));
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+
+ db_->ReleaseSnapshot(s1);
+ ASSERT_EQ("0v2", Get(0, "foo", s2));
+ ASSERT_EQ("1v2", Get(1, "foo", s2));
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+ ASSERT_EQ(1U, GetNumSnapshots());
+ ASSERT_LT(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s2->GetSequenceNumber());
+
+ db_->ReleaseSnapshot(s2);
+ ASSERT_EQ(0U, GetNumSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), 0);
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+ } while (ChangeOptions());
+}
+
+#endif // ROCKSDB_LITE
+
+TEST_F(DBBasicTest, CompactBetweenSnapshots) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ do {
+ Options options = CurrentOptions(options_override);
+ options.disable_auto_compactions = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
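+ // Cover the ["a", "z"] range with table files so the manual compactions
+ // below have overlapping files to rewrite.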
+ FillLevels("a", "z", 1);
+
+ Put(1, "foo", "first");
+ const Snapshot* snapshot1 = db_->GetSnapshot();
+ Put(1, "foo", "second");
+ Put(1, "foo", "third");
+ Put(1, "foo", "fourth");
+ const Snapshot* snapshot2 = db_->GetSnapshot();
+ Put(1, "foo", "fifth");
+ Put(1, "foo", "sixth");
+
+ // All entries (including duplicates) exist
+ // before any compaction or flush is triggered.
+ ASSERT_EQ(AllEntriesFor("foo", 1),
+ "[ sixth, fifth, fourth, third, second, first ]");
+ ASSERT_EQ("sixth", Get(1, "foo"));
+ ASSERT_EQ("fourth", Get(1, "foo", snapshot2));
+ ASSERT_EQ("first", Get(1, "foo", snapshot1));
+
+ // After a flush, "second", "third" and "fifth" should
+ // be removed
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth, first ]");
+
+ // after we release the snapshot1, only two values left
+ db_->ReleaseSnapshot(snapshot1);
+ FillLevels("a", "z", 1);
+ dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr);
+
+ // Only snapshot2 is still valid. Since snapshot1 is not valid anymore,
+ // "first" should be removed by a compaction.
+ ASSERT_EQ("sixth", Get(1, "foo"));
+ ASSERT_EQ("fourth", Get(1, "foo", snapshot2));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth ]");
+
+ // after we release the snapshot2, only one value should be left
+ db_->ReleaseSnapshot(snapshot2);
+ FillLevels("a", "z", 1);
+ dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr);
+ ASSERT_EQ("sixth", Get(1, "foo"));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth ]");
+ } while (ChangeOptions(kSkipFIFOCompaction));
+}
+
+TEST_F(DBBasicTest, DBOpen_Options) {
+ Options options = CurrentOptions();
+ Close();
+ Destroy(options);
+
+ // Does not exist, and create_if_missing == false: error
+ DB* db = nullptr;
+ options.create_if_missing = false;
+ Status s = DB::Open(options, dbname_, &db);
+ ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr);
+ ASSERT_TRUE(db == nullptr);
+
+ // Does not exist, and create_if_missing == true: OK
+ options.create_if_missing = true;
+ s = DB::Open(options, dbname_, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+
+ delete db;
+ db = nullptr;
+
+ // Does exist, and error_if_exists == true: error
+ options.create_if_missing = false;
+ options.error_if_exists = true;
+ s = DB::Open(options, dbname_, &db);
+ ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr);
+ ASSERT_TRUE(db == nullptr);
+
+ // Does exist, and error_if_exists == false: OK
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ s = DB::Open(options, dbname_, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+
+ delete db;
+ db = nullptr;
+}
+
+TEST_F(DBBasicTest, CompactOnFlush) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ do {
+ Options options = CurrentOptions(options_override);
+ options.disable_auto_compactions = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Put(1, "foo", "v1");
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v1 ]");
+
+ // Write two new keys
+ Put(1, "a", "begin");
+ Put(1, "z", "end");
+ Flush(1);
+
+ // Case 1: Delete followed by a put
+ Delete(1, "foo");
+ Put(1, "foo", "v2");
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]");
+
+ // After the current memtable is flushed, the DEL should
+ // have been removed
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
+
+ dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr);
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]");
+
+ // Case 2: Delete followed by another delete
+ Delete(1, "foo");
+ Delete(1, "foo");
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, DEL, v2 ]");
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v2 ]");
+ dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr);
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+ // Case 3: Put followed by a delete
+ Put(1, "foo", "v3");
+ Delete(1, "foo");
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v3 ]");
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL ]");
+ dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr);
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+ // Case 4: Put followed by another Put
+ Put(1, "foo", "v4");
+ Put(1, "foo", "v5");
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5, v4 ]");
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]");
+ dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr);
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]");
+
+ // clear database
+ Delete(1, "foo");
+ dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr);
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+ // Case 5: Put followed by snapshot followed by another Put
+ // Both puts should remain.
+ Put(1, "foo", "v6");
+ const Snapshot* snapshot = db_->GetSnapshot();
+ Put(1, "foo", "v7");
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v7, v6 ]");
+ db_->ReleaseSnapshot(snapshot);
+
+ // clear database
+ Delete(1, "foo");
+ dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr);
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+ // Case 6: snapshot followed by a put followed by another Put
+ // Only the last put should remain.
+ const Snapshot* snapshot1 = db_->GetSnapshot();
+ Put(1, "foo", "v8");
+ Put(1, "foo", "v9");
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v9 ]");
+ db_->ReleaseSnapshot(snapshot1);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, FlushOneColumnFamily) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
+ "alyosha", "popovich"},
+ options);
+
+ ASSERT_OK(Put(0, "Default", "Default"));
+ ASSERT_OK(Put(1, "pikachu", "pikachu"));
+ ASSERT_OK(Put(2, "ilya", "ilya"));
+ ASSERT_OK(Put(3, "muromec", "muromec"));
+ ASSERT_OK(Put(4, "dobrynia", "dobrynia"));
+ ASSERT_OK(Put(5, "nikitich", "nikitich"));
+ ASSERT_OK(Put(6, "alyosha", "alyosha"));
+ ASSERT_OK(Put(7, "popovich", "popovich"));
+
+ for (int i = 0; i < 8; ++i) {
+ Flush(i);
+ auto tables = ListTableFiles(env_, dbname_);
+ ASSERT_EQ(tables.size(), i + 1U);
+ }
+}
+
+TEST_F(DBBasicTest, MultiGetSimple) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ SetPerfLevel(kEnableCount);
+ ASSERT_OK(Put(1, "k1", "v1"));
+ ASSERT_OK(Put(1, "k2", "v2"));
+ ASSERT_OK(Put(1, "k3", "v3"));
+ ASSERT_OK(Put(1, "k4", "v4"));
+ ASSERT_OK(Delete(1, "k4"));
+ ASSERT_OK(Put(1, "k5", "v5"));
+ ASSERT_OK(Delete(1, "no_key"));
+
+ std::vector<Slice> keys({"k1", "k2", "k3", "k4", "k5", "no_key"});
+
+ std::vector<std::string> values(20, "Temporary data to be overwritten");
+ std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+
+ get_perf_context()->Reset();
+ std::vector<Status> s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+ ASSERT_EQ(values.size(), keys.size());
+ ASSERT_EQ(values[0], "v1");
+ ASSERT_EQ(values[1], "v2");
+ ASSERT_EQ(values[2], "v3");
+ ASSERT_EQ(values[4], "v5");
+ // four kv pairs * two bytes per value
+ ASSERT_EQ(8, (int)get_perf_context()->multiget_read_bytes);
+
+ ASSERT_OK(s[0]);
+ ASSERT_OK(s[1]);
+ ASSERT_OK(s[2]);
+ ASSERT_TRUE(s[3].IsNotFound());
+ ASSERT_OK(s[4]);
+ ASSERT_TRUE(s[5].IsNotFound());
+ SetPerfLevel(kDisable);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, MultiGetEmpty) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ // Empty Key Set
+ std::vector<Slice> keys;
+ std::vector<std::string> values;
+ std::vector<ColumnFamilyHandle*> cfs;
+ std::vector<Status> s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+ ASSERT_EQ(s.size(), 0U);
+
+ // Empty Database, Empty Key Set
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+ ASSERT_EQ(s.size(), 0U);
+
+ // Empty Database, Search for Keys
+ keys.resize(2);
+ keys[0] = "a";
+ keys[1] = "b";
+ cfs.push_back(handles_[0]);
+ cfs.push_back(handles_[1]);
+ s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+ ASSERT_EQ(static_cast<int>(s.size()), 2);
+ ASSERT_TRUE(s[0].IsNotFound() && s[1].IsNotFound());
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, ChecksumTest) {
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+ // change when new checksum type added
+ int max_checksum = static_cast<int>(kxxHash64);
+ const int kNumPerFile = 2;
+
+ // generate one table with each type of checksum
+ for (int i = 0; i <= max_checksum; ++i) {
+ table_options.checksum = static_cast<ChecksumType>(i);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ for (int j = 0; j < kNumPerFile; ++j) {
+ ASSERT_OK(Put(Key(i * kNumPerFile + j), Key(i * kNumPerFile + j)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // with each valid checksum type setting...
+ for (int i = 0; i <= max_checksum; ++i) {
+ table_options.checksum = static_cast<ChecksumType>(i);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ // verify every table file can be read back regardless of which checksum
+ // type is currently configured
+ for (int j = 0; j < (max_checksum + 1) * kNumPerFile; ++j) {
+ ASSERT_EQ(Key(j), Get(Key(j)));
+ }
+ }
+}
+
+// On Windows a file can be opened either memory mapped or with unbuffered
+// access, but not both, so this test would assert and does not make sense
+// to run there.
+#ifndef OS_WIN
+TEST_F(DBBasicTest, MmapAndBufferOptions) {
+ if (!IsMemoryMappedAccessSupported()) {
+ return;
+ }
+ Options options = CurrentOptions();
+
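+ // Direct (unbuffered) reads and mmap reads cannot be combined, so opening
+ // with both enabled is expected to fail.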
+ options.use_direct_reads = true;
+ options.allow_mmap_reads = true;
+ ASSERT_NOK(TryReopen(options));
+
+ // All other combinations are acceptable
+ options.use_direct_reads = false;
+ ASSERT_OK(TryReopen(options));
+
+ if (IsDirectIOSupported()) {
+ options.use_direct_reads = true;
+ options.allow_mmap_reads = false;
+ ASSERT_OK(TryReopen(options));
+ }
+
+ options.use_direct_reads = false;
+ ASSERT_OK(TryReopen(options));
+}
+#endif
+
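+// Env wrapper whose loggers count Close() calls and always fail the close
+// with Status::IOError(), so the DBClose test below can check whether
+// DB::Close() closes the info log and how close errors propagate.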
+class TestEnv : public EnvWrapper {
+ public:
+ explicit TestEnv(Env* base_env) : EnvWrapper(base_env), close_count(0) {}
+
+ class TestLogger : public Logger {
+ public:
+ using Logger::Logv;
+ explicit TestLogger(TestEnv* env_ptr) : Logger() { env = env_ptr; }
+ ~TestLogger() override {
+ if (!closed_) {
+ CloseHelper();
+ }
+ }
+ void Logv(const char* /*format*/, va_list /*ap*/) override {}
+
+ protected:
+ Status CloseImpl() override { return CloseHelper(); }
+
+ private:
+ Status CloseHelper() {
+ env->CloseCountInc();
+ return Status::IOError();
+ }
+ TestEnv* env;
+ };
+
+ void CloseCountInc() { close_count++; }
+
+ int GetCloseCount() { return close_count; }
+
+ Status NewLogger(const std::string& /*fname*/,
+ std::shared_ptr<Logger>* result) override {
+ result->reset(new TestLogger(this));
+ return Status::OK();
+ }
+
+ private:
+ int close_count;
+};
+
+TEST_F(DBBasicTest, DBClose) {
+ Options options = GetDefaultOptions();
+ std::string dbname = test::PerThreadDBPath("db_close_test");
+ ASSERT_OK(DestroyDB(dbname, options));
+
+ DB* db = nullptr;
+ TestEnv* env = new TestEnv(env_);
+ std::unique_ptr<TestEnv> local_env_guard(env);
+ options.create_if_missing = true;
+ options.env = env;
+ Status s = DB::Open(options, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+
+ s = db->Close();
+ ASSERT_EQ(env->GetCloseCount(), 1);
+ ASSERT_EQ(s, Status::IOError());
+
+ delete db;
+ ASSERT_EQ(env->GetCloseCount(), 1);
+
+ // Do not call DB::Close() and ensure our logger Close() still gets called
+ s = DB::Open(options, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+ delete db;
+ ASSERT_EQ(env->GetCloseCount(), 2);
+
+ // Provide our own logger and ensure DB::Close() does not close it
+ options.info_log.reset(new TestEnv::TestLogger(env));
+ options.create_if_missing = false;
+ s = DB::Open(options, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+
+ s = db->Close();
+ ASSERT_EQ(s, Status::OK());
+ delete db;
+ ASSERT_EQ(env->GetCloseCount(), 2);
+ options.info_log.reset();
+ ASSERT_EQ(env->GetCloseCount(), 3);
+}
+
+TEST_F(DBBasicTest, DBCloseFlushError) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.manual_wal_flush = true;
+ options.write_buffer_size = 100;
+ options.env = fault_injection_env.get();
+
+ Reopen(options);
+ ASSERT_OK(Put("key1", "value1"));
+ ASSERT_OK(Put("key2", "value2"));
+ ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+ ASSERT_OK(Put("key3", "value3"));
+ fault_injection_env->SetFilesystemActive(false);
+ Status s = dbfull()->Close();
+ fault_injection_env->SetFilesystemActive(true);
+ ASSERT_NE(s, Status::OK());
+
+ Destroy(options);
+}
+
+class DBMultiGetTestWithParam : public DBBasicTest,
+ public testing::WithParamInterface<bool> {};
+
+TEST_P(DBMultiGetTestWithParam, MultiGetMultiCF) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
+ "alyosha", "popovich"},
+ options);
+ // <CF, key, value> tuples
+ std::vector<std::tuple<int, std::string, std::string>> cf_kv_vec;
+ static const int num_keys = 24;
+ cf_kv_vec.reserve(num_keys);
+
+ for (int i = 0; i < num_keys; ++i) {
+ int cf = i / 3;
+ int cf_key = i % 3;
+ cf_kv_vec.emplace_back(std::make_tuple(
+ cf, "cf" + std::to_string(cf) + "_key_" + std::to_string(cf_key),
+ "cf" + std::to_string(cf) + "_val_" + std::to_string(cf_key)));
+ ASSERT_OK(Put(std::get<0>(cf_kv_vec[i]), std::get<1>(cf_kv_vec[i]),
+ std::get<2>(cf_kv_vec[i])));
+ }
+
+ int get_sv_count = 0;
+ ROCKSDB_NAMESPACE::DBImpl* db = reinterpret_cast<DBImpl*>(db_);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) {
+ if (++get_sv_count == 2) {
+ // After MultiGet refs a couple of CFs, flush all CFs so MultiGet
+ // is forced to repeat the process
+ for (int i = 0; i < num_keys; ++i) {
+ int cf = i / 3;
+ int cf_key = i % 8;
+ if (cf_key == 0) {
+ ASSERT_OK(Flush(cf));
+ }
+ ASSERT_OK(Put(std::get<0>(cf_kv_vec[i]), std::get<1>(cf_kv_vec[i]),
+ std::get<2>(cf_kv_vec[i]) + "_2"));
+ }
+ }
+ if (get_sv_count == 11) {
+ for (int i = 0; i < 8; ++i) {
+ auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(
+ db->GetColumnFamilyHandle(i))
+ ->cfd();
+ ASSERT_EQ(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<int> cfs;
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+
+ for (int i = 0; i < num_keys; ++i) {
+ cfs.push_back(std::get<0>(cf_kv_vec[i]));
+ keys.push_back(std::get<1>(cf_kv_vec[i]));
+ }
+
+ values = MultiGet(cfs, keys, nullptr, GetParam());
+ ASSERT_EQ(values.size(), num_keys);
+ for (unsigned int j = 0; j < values.size(); ++j) {
+ ASSERT_EQ(values[j], std::get<2>(cf_kv_vec[j]) + "_2");
+ }
+
+ keys.clear();
+ cfs.clear();
+ cfs.push_back(std::get<0>(cf_kv_vec[0]));
+ keys.push_back(std::get<1>(cf_kv_vec[0]));
+ cfs.push_back(std::get<0>(cf_kv_vec[3]));
+ keys.push_back(std::get<1>(cf_kv_vec[3]));
+ cfs.push_back(std::get<0>(cf_kv_vec[4]));
+ keys.push_back(std::get<1>(cf_kv_vec[4]));
+ values = MultiGet(cfs, keys, nullptr, GetParam());
+ ASSERT_EQ(values[0], std::get<2>(cf_kv_vec[0]) + "_2");
+ ASSERT_EQ(values[1], std::get<2>(cf_kv_vec[3]) + "_2");
+ ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[4]) + "_2");
+
+ keys.clear();
+ cfs.clear();
+ cfs.push_back(std::get<0>(cf_kv_vec[7]));
+ keys.push_back(std::get<1>(cf_kv_vec[7]));
+ cfs.push_back(std::get<0>(cf_kv_vec[6]));
+ keys.push_back(std::get<1>(cf_kv_vec[6]));
+ cfs.push_back(std::get<0>(cf_kv_vec[1]));
+ keys.push_back(std::get<1>(cf_kv_vec[1]));
+ values = MultiGet(cfs, keys, nullptr, GetParam());
+ ASSERT_EQ(values[0], std::get<2>(cf_kv_vec[7]) + "_2");
+ ASSERT_EQ(values[1], std::get<2>(cf_kv_vec[6]) + "_2");
+ ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[1]) + "_2");
+
+ for (int cf = 0; cf < 8; ++cf) {
+ auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(
+ reinterpret_cast<DBImpl*>(db_)->GetColumnFamilyHandle(cf))
+ ->cfd();
+ ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
+ ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVObsolete);
+ }
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFMutex) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
+ "alyosha", "popovich"},
+ options);
+
+ for (int i = 0; i < 8; ++i) {
+ ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key",
+ "cf" + std::to_string(i) + "_val"));
+ }
+
+ int get_sv_count = 0;
+ int retries = 0;
+ bool last_try = false;
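+ // Every time MultiGet has referenced super versions for two column
+ // families, flush and overwrite all CFs so the acquired super versions
+ // become stale. This forces MultiGet to keep retrying until it falls back
+ // to its last-try path (under the DB mutex), signalled by the
+ // "DBImpl::MultiGet::LastTry" sync point.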
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::MultiGet::LastTry", [&](void* /*arg*/) {
+ last_try = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) {
+ if (last_try) {
+ return;
+ }
+ if (++get_sv_count == 2) {
+ ++retries;
+ get_sv_count = 0;
+ for (int i = 0; i < 8; ++i) {
+ ASSERT_OK(Flush(i));
+ ASSERT_OK(Put(
+ i, "cf" + std::to_string(i) + "_key",
+ "cf" + std::to_string(i) + "_val" + std::to_string(retries)));
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<int> cfs;
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+
+ for (int i = 0; i < 8; ++i) {
+ cfs.push_back(i);
+ keys.push_back("cf" + std::to_string(i) + "_key");
+ }
+
+ values = MultiGet(cfs, keys, nullptr, GetParam());
+ ASSERT_TRUE(last_try);
+ ASSERT_EQ(values.size(), 8);
+ for (unsigned int j = 0; j < values.size(); ++j) {
+ ASSERT_EQ(values[j],
+ "cf" + std::to_string(j) + "_val" + std::to_string(retries));
+ }
+ for (int i = 0; i < 8; ++i) {
+ auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(
+ reinterpret_cast<DBImpl*>(db_)->GetColumnFamilyHandle(i))
+ ->cfd();
+ ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
+ }
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFSnapshot) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
+ "alyosha", "popovich"},
+ options);
+
+ for (int i = 0; i < 8; ++i) {
+ ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key",
+ "cf" + std::to_string(i) + "_val"));
+ }
+
+ int get_sv_count = 0;
+ ROCKSDB_NAMESPACE::DBImpl* db = reinterpret_cast<DBImpl*>(db_);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) {
+ if (++get_sv_count == 2) {
+ for (int i = 0; i < 8; ++i) {
+ ASSERT_OK(Flush(i));
+ ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key",
+ "cf" + std::to_string(i) + "_val2"));
+ }
+ }
+ if (get_sv_count == 8) {
+ for (int i = 0; i < 8; ++i) {
+ auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(
+ db->GetColumnFamilyHandle(i))
+ ->cfd();
+ ASSERT_TRUE(
+ (cfd->TEST_GetLocalSV()->Get() == SuperVersion::kSVInUse) ||
+ (cfd->TEST_GetLocalSV()->Get() == SuperVersion::kSVObsolete));
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<int> cfs;
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+
+ for (int i = 0; i < 8; ++i) {
+ cfs.push_back(i);
+ keys.push_back("cf" + std::to_string(i) + "_key");
+ }
+
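+ // Reading at an explicit snapshot does not require a consistent set of
+ // super versions across column families, so MultiGet should not retry and
+ // the values read should be the original "_val" ones despite the flush and
+ // overwrite in the callback above.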
+ const Snapshot* snapshot = db_->GetSnapshot();
+ values = MultiGet(cfs, keys, snapshot, GetParam());
+ db_->ReleaseSnapshot(snapshot);
+ ASSERT_EQ(values.size(), 8);
+ for (unsigned int j = 0; j < values.size(); ++j) {
+ ASSERT_EQ(values[j], "cf" + std::to_string(j) + "_val");
+ }
+ for (int i = 0; i < 8; ++i) {
+ auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(
+ reinterpret_cast<DBImpl*>(db_)->GetColumnFamilyHandle(i))
+ ->cfd();
+ ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(DBMultiGetTestWithParam, DBMultiGetTestWithParam,
+ testing::Bool());
+
+TEST_F(DBBasicTest, MultiGetBatchedSimpleUnsorted) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ SetPerfLevel(kEnableCount);
+ ASSERT_OK(Put(1, "k1", "v1"));
+ ASSERT_OK(Put(1, "k2", "v2"));
+ ASSERT_OK(Put(1, "k3", "v3"));
+ ASSERT_OK(Put(1, "k4", "v4"));
+ ASSERT_OK(Delete(1, "k4"));
+ ASSERT_OK(Put(1, "k5", "v5"));
+ ASSERT_OK(Delete(1, "no_key"));
+
+ get_perf_context()->Reset();
+
+ std::vector<Slice> keys({"no_key", "k5", "k4", "k3", "k2", "k1"});
+ std::vector<PinnableSlice> values(keys.size());
+ std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+ std::vector<Status> s(keys.size());
+
+ db_->MultiGet(ReadOptions(), handles_[1], keys.size(), keys.data(),
+ values.data(), s.data(), false);
+
+ ASSERT_EQ(values.size(), keys.size());
+ ASSERT_EQ(std::string(values[5].data(), values[5].size()), "v1");
+ ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v2");
+ ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v3");
+ ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5");
+ // four kv pairs * two bytes per value
+ ASSERT_EQ(8, (int)get_perf_context()->multiget_read_bytes);
+
+ ASSERT_TRUE(s[0].IsNotFound());
+ ASSERT_OK(s[1]);
+ ASSERT_TRUE(s[2].IsNotFound());
+ ASSERT_OK(s[3]);
+ ASSERT_OK(s[4]);
+ ASSERT_OK(s[5]);
+
+ SetPerfLevel(kDisable);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, MultiGetBatchedSimpleSorted) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ SetPerfLevel(kEnableCount);
+ ASSERT_OK(Put(1, "k1", "v1"));
+ ASSERT_OK(Put(1, "k2", "v2"));
+ ASSERT_OK(Put(1, "k3", "v3"));
+ ASSERT_OK(Put(1, "k4", "v4"));
+ ASSERT_OK(Delete(1, "k4"));
+ ASSERT_OK(Put(1, "k5", "v5"));
+ ASSERT_OK(Delete(1, "no_key"));
+
+ get_perf_context()->Reset();
+
+ std::vector<Slice> keys({"k1", "k2", "k3", "k4", "k5", "no_key"});
+ std::vector<PinnableSlice> values(keys.size());
+ std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+ std::vector<Status> s(keys.size());
+
+ db_->MultiGet(ReadOptions(), handles_[1], keys.size(), keys.data(),
+ values.data(), s.data(), true);
+
+ ASSERT_EQ(values.size(), keys.size());
+ ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v1");
+ ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v2");
+ ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v3");
+ ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v5");
+ // four kv pairs * two bytes per value
+ ASSERT_EQ(8, (int)get_perf_context()->multiget_read_bytes);
+
+ ASSERT_OK(s[0]);
+ ASSERT_OK(s[1]);
+ ASSERT_OK(s[2]);
+ ASSERT_TRUE(s[3].IsNotFound());
+ ASSERT_OK(s[4]);
+ ASSERT_TRUE(s[5].IsNotFound());
+
+ SetPerfLevel(kDisable);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, MultiGetBatchedMultiLevel) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ Reopen(options);
+ int num_keys = 0;
+
+ for (int i = 0; i < 128; ++i) {
+ ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ Flush();
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ Flush();
+ num_keys = 0;
+ }
+ MoveFilesToLevel(2);
+
+ for (int i = 0; i < 128; i += 3) {
+ ASSERT_OK(Put("key_" + std::to_string(i), "val_l1_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ Flush();
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ Flush();
+ num_keys = 0;
+ }
+ MoveFilesToLevel(1);
+
+ for (int i = 0; i < 128; i += 5) {
+ ASSERT_OK(Put("key_" + std::to_string(i), "val_l0_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ Flush();
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ Flush();
+ num_keys = 0;
+ }
+ ASSERT_EQ(0, num_keys);
+
+ for (int i = 0; i < 128; i += 9) {
+ ASSERT_OK(Put("key_" + std::to_string(i), "val_mem_" + std::to_string(i)));
+ }
+
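+ // Resulting layout: keys divisible by 9 have their newest value in the
+ // memtable, by 5 in L0, by 3 in L1, and all other keys only in L2. The
+ // checks below verify MultiGet returns the value from the newest level.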
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+
+ for (int i = 64; i < 80; ++i) {
+ keys.push_back("key_" + std::to_string(i));
+ }
+
+ values = MultiGet(keys, nullptr);
+ ASSERT_EQ(values.size(), 16);
+ for (unsigned int j = 0; j < values.size(); ++j) {
+ int key = j + 64;
+ if (key % 9 == 0) {
+ ASSERT_EQ(values[j], "val_mem_" + std::to_string(key));
+ } else if (key % 5 == 0) {
+ ASSERT_EQ(values[j], "val_l0_" + std::to_string(key));
+ } else if (key % 3 == 0) {
+ ASSERT_EQ(values[j], "val_l1_" + std::to_string(key));
+ } else {
+ ASSERT_EQ(values[j], "val_l2_" + std::to_string(key));
+ }
+ }
+}
+
+TEST_F(DBBasicTest, MultiGetBatchedMultiLevelMerge) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+ int num_keys = 0;
+
+ for (int i = 0; i < 128; ++i) {
+ ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ Flush();
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ Flush();
+ num_keys = 0;
+ }
+ MoveFilesToLevel(2);
+
+ for (int i = 0; i < 128; i += 3) {
+ ASSERT_OK(Merge("key_" + std::to_string(i), "val_l1_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ Flush();
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ Flush();
+ num_keys = 0;
+ }
+ MoveFilesToLevel(1);
+
+ for (int i = 0; i < 128; i += 5) {
+ ASSERT_OK(Merge("key_" + std::to_string(i), "val_l0_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ Flush();
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ Flush();
+ num_keys = 0;
+ }
+ ASSERT_EQ(0, num_keys);
+
+ for (int i = 0; i < 128; i += 9) {
+ ASSERT_OK(Merge("key_" + std::to_string(i), "val_mem_" + std::to_string(i)));
+ }
+
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+
+ for (int i = 32; i < 80; ++i) {
+ keys.push_back("key_" + std::to_string(i));
+ }
+
+ values = MultiGet(keys, nullptr);
+ ASSERT_EQ(values.size(), keys.size());
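+ // The string-append merge operator joins operands with ',', so each
+ // expected value is the L2 base value followed by whichever of the L1, L0
+ // and memtable operands apply to that key.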
+ for (unsigned int j = 0; j < 48; ++j) {
+ int key = j + 32;
+ std::string value;
+ value.append("val_l2_" + std::to_string(key));
+ if (key % 3 == 0) {
+ value.append(",");
+ value.append("val_l1_" + std::to_string(key));
+ }
+ if (key % 5 == 0) {
+ value.append(",");
+ value.append("val_l0_" + std::to_string(key));
+ }
+ if (key % 9 == 0) {
+ value.append(",");
+ value.append("val_mem_" + std::to_string(key));
+ }
+ ASSERT_EQ(values[j], value);
+ }
+}
+
+// Test class for batched MultiGet with prefix extractor
+// Param bool - If true, use partitioned filters
+// If false, use full filter block
+class MultiGetPrefixExtractorTest : public DBBasicTest,
+ public ::testing::WithParamInterface<bool> {
+};
+
+TEST_P(MultiGetPrefixExtractorTest, Batched) {
+ Options options = CurrentOptions();
+ options.prefix_extractor.reset(NewFixedPrefixTransform(2));
+ options.memtable_prefix_bloom_size_ratio = 10;
+ BlockBasedTableOptions bbto;
+ if (GetParam()) {
+ bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ bbto.partition_filters = true;
+ }
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = false;
+ bbto.cache_index_and_filter_blocks = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+
+ SetPerfLevel(kEnableCount);
+ get_perf_context()->Reset();
+
+ // First key is not in the prefix_extractor domain
+ ASSERT_OK(Put("k", "v0"));
+ ASSERT_OK(Put("kk1", "v1"));
+ ASSERT_OK(Put("kk2", "v2"));
+ ASSERT_OK(Put("kk3", "v3"));
+ ASSERT_OK(Put("kk4", "v4"));
+ std::vector<std::string> mem_keys(
+ {"k", "kk1", "kk2", "kk3", "kk4", "rofl", "lmho"});
+ std::vector<std::string> inmem_values;
+ inmem_values = MultiGet(mem_keys, nullptr);
+ ASSERT_EQ(inmem_values[0], "v0");
+ ASSERT_EQ(inmem_values[1], "v1");
+ ASSERT_EQ(inmem_values[2], "v2");
+ ASSERT_EQ(inmem_values[3], "v3");
+ ASSERT_EQ(inmem_values[4], "v4");
+ ASSERT_EQ(get_perf_context()->bloom_memtable_miss_count, 2);
+ ASSERT_EQ(get_perf_context()->bloom_memtable_hit_count, 5);
+ ASSERT_OK(Flush());
+
+ std::vector<std::string> keys({"k", "kk1", "kk2", "kk3", "kk4"});
+ std::vector<std::string> values;
+ get_perf_context()->Reset();
+ values = MultiGet(keys, nullptr);
+ ASSERT_EQ(values[0], "v0");
+ ASSERT_EQ(values[1], "v1");
+ ASSERT_EQ(values[2], "v2");
+ ASSERT_EQ(values[3], "v3");
+ ASSERT_EQ(values[4], "v4");
+ // Filter hits for 4 in-domain keys
+ ASSERT_EQ(get_perf_context()->bloom_sst_hit_count, 4);
+}
+
+INSTANTIATE_TEST_CASE_P(MultiGetPrefix, MultiGetPrefixExtractorTest,
+ ::testing::Bool());
+
+#ifndef ROCKSDB_LITE
+class DBMultiGetRowCacheTest : public DBBasicTest,
+ public ::testing::WithParamInterface<bool> {};
+
+TEST_P(DBMultiGetRowCacheTest, MultiGetBatched) {
+ do {
+ option_config_ = kRowCache;
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ SetPerfLevel(kEnableCount);
+ ASSERT_OK(Put(1, "k1", "v1"));
+ ASSERT_OK(Put(1, "k2", "v2"));
+ ASSERT_OK(Put(1, "k3", "v3"));
+ ASSERT_OK(Put(1, "k4", "v4"));
+ Flush(1);
+ ASSERT_OK(Put(1, "k5", "v5"));
+ const Snapshot* snap1 = dbfull()->GetSnapshot();
+ ASSERT_OK(Delete(1, "k4"));
+ Flush(1);
+ const Snapshot* snap2 = dbfull()->GetSnapshot();
+
+ get_perf_context()->Reset();
+
+ std::vector<Slice> keys({"no_key", "k5", "k4", "k3", "k1"});
+ std::vector<PinnableSlice> values(keys.size());
+ std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+ std::vector<Status> s(keys.size());
+
+ ReadOptions ro;
+ bool use_snapshots = GetParam();
+ if (use_snapshots) {
+ ro.snapshot = snap2;
+ }
+ db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
+ s.data(), false);
+
+ ASSERT_EQ(values.size(), keys.size());
+ ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v1");
+ ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v3");
+ ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5");
+ // three found kv pairs * two bytes per value
+ ASSERT_EQ(6, (int)get_perf_context()->multiget_read_bytes);
+
+ ASSERT_TRUE(s[0].IsNotFound());
+ ASSERT_OK(s[1]);
+ ASSERT_TRUE(s[2].IsNotFound());
+ ASSERT_OK(s[3]);
+ ASSERT_OK(s[4]);
+
+ // Call MultiGet() again with some intersection with the previous set of
+ // keys. Those should already be in the row cache.
+ keys.assign({"no_key", "k5", "k3", "k2"});
+ for (size_t i = 0; i < keys.size(); ++i) {
+ values[i].Reset();
+ s[i] = Status::OK();
+ }
+ get_perf_context()->Reset();
+
+ if (use_snapshots) {
+ ro.snapshot = snap1;
+ }
+ db_->MultiGet(ReadOptions(), handles_[1], keys.size(), keys.data(),
+ values.data(), s.data(), false);
+
+ ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v2");
+ ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v3");
+ ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5");
+ // three found kv pairs * two bytes per value
+ ASSERT_EQ(6, (int)get_perf_context()->multiget_read_bytes);
+
+ ASSERT_TRUE(s[0].IsNotFound());
+ ASSERT_OK(s[1]);
+ ASSERT_OK(s[2]);
+ ASSERT_OK(s[3]);
+ if (use_snapshots) {
+ // Only reads from the first SST file would have been cached, since
+ // snapshot seq no is > fd.largest_seqno
+ ASSERT_EQ(1, TestGetTickerCount(options, ROW_CACHE_HIT));
+ } else {
+ ASSERT_EQ(2, TestGetTickerCount(options, ROW_CACHE_HIT));
+ }
+
+ SetPerfLevel(kDisable);
+ dbfull()->ReleaseSnapshot(snap1);
+ dbfull()->ReleaseSnapshot(snap2);
+ } while (ChangeCompactOptions());
+}
+
+INSTANTIATE_TEST_CASE_P(DBMultiGetRowCacheTest, DBMultiGetRowCacheTest,
+ testing::Values(true, false));
+
+TEST_F(DBBasicTest, GetAllKeyVersions) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_EQ(2, handles_.size());
+ const size_t kNumInserts = 4;
+ const size_t kNumDeletes = 4;
+ const size_t kNumUpdates = 4;
+
+ // Check default column family
+ for (size_t i = 0; i != kNumInserts; ++i) {
+ ASSERT_OK(Put(std::to_string(i), "value"));
+ }
+ for (size_t i = 0; i != kNumUpdates; ++i) {
+ ASSERT_OK(Put(std::to_string(i), "value1"));
+ }
+ for (size_t i = 0; i != kNumDeletes; ++i) {
+ ASSERT_OK(Delete(std::to_string(i)));
+ }
+ std::vector<KeyVersion> key_versions;
+ ASSERT_OK(ROCKSDB_NAMESPACE::GetAllKeyVersions(
+ db_, Slice(), Slice(), std::numeric_limits<size_t>::max(),
+ &key_versions));
+ ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size());
+ ASSERT_OK(ROCKSDB_NAMESPACE::GetAllKeyVersions(
+ db_, handles_[0], Slice(), Slice(), std::numeric_limits<size_t>::max(),
+ &key_versions));
+ ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size());
+
+ // Check non-default column family
+ for (size_t i = 0; i != kNumInserts - 1; ++i) {
+ ASSERT_OK(Put(1, std::to_string(i), "value"));
+ }
+ for (size_t i = 0; i != kNumUpdates - 1; ++i) {
+ ASSERT_OK(Put(1, std::to_string(i), "value1"));
+ }
+ for (size_t i = 0; i != kNumDeletes - 1; ++i) {
+ ASSERT_OK(Delete(1, std::to_string(i)));
+ }
+ ASSERT_OK(ROCKSDB_NAMESPACE::GetAllKeyVersions(
+ db_, handles_[1], Slice(), Slice(), std::numeric_limits<size_t>::max(),
+ &key_versions));
+ ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates - 3, key_versions.size());
+}
+#endif // !ROCKSDB_LITE
+
+TEST_F(DBBasicTest, MultiGetIOBufferOverrun) {
+ Options options = CurrentOptions();
+ Random rnd(301);
+ BlockBasedTableOptions table_options;
+ table_options.pin_l0_filter_and_index_blocks_in_cache = true;
+ table_options.block_size = 16 * 1024;
+ assert(table_options.block_size >
+ BlockBasedTable::kMultiGetReadStackBufSize);
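+ // A block size larger than the stack buffer used by MultiGet forces the
+ // read path to fall back to a heap-allocated buffer, which is the case
+ // being exercised here.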
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ Reopen(options);
+
+ std::string zero_str(128, '\0');
+ for (int i = 0; i < 100; ++i) {
+ // Make the value compressible. A purely random string doesn't compress
+ // and the resultant data block will not be compressed
+ std::string value(RandomString(&rnd, 128) + zero_str);
+ assert(Put(Key(i), value) == Status::OK());
+ }
+ Flush();
+
+ std::vector<std::string> key_data(10);
+ std::vector<Slice> keys;
+ // We cannot resize a PinnableSlice vector, so just set initial size to
+ // largest we think we will need
+ std::vector<PinnableSlice> values(10);
+ std::vector<Status> statuses;
+ ReadOptions ro;
+
+ // Warm up the cache first
+ key_data.emplace_back(Key(0));
+ keys.emplace_back(Slice(key_data.back()));
+ key_data.emplace_back(Key(50));
+ keys.emplace_back(Slice(key_data.back()));
+ statuses.resize(keys.size());
+
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+}
+
+class DBBasicTestWithParallelIO
+ : public DBTestBase,
+ public testing::WithParamInterface<std::tuple<bool, bool, bool, bool>> {
+ public:
+ DBBasicTestWithParallelIO() : DBTestBase("/db_basic_test_with_parallel_io") {
+ bool compressed_cache = std::get<0>(GetParam());
+ bool uncompressed_cache = std::get<1>(GetParam());
+ compression_enabled_ = std::get<2>(GetParam());
+ fill_cache_ = std::get<3>(GetParam());
+
+ if (compressed_cache) {
+ std::shared_ptr<Cache> cache = NewLRUCache(1048576);
+ compressed_cache_ = std::make_shared<MyBlockCache>(cache);
+ }
+ if (uncompressed_cache) {
+ std::shared_ptr<Cache> cache = NewLRUCache(1048576);
+ uncompressed_cache_ = std::make_shared<MyBlockCache>(cache);
+ }
+
+ env_->count_random_reads_ = true;
+
+ Options options = CurrentOptions();
+ Random rnd(301);
+ BlockBasedTableOptions table_options;
+
+#ifndef ROCKSDB_LITE
+ if (compression_enabled_) {
+ std::vector<CompressionType> compression_types;
+ compression_types = GetSupportedCompressions();
+ // Not every platform may have compression libraries available, so
+ // dynamically pick based on what's available
+ if (compression_types.size() == 0) {
+ compression_enabled_ = false;
+ } else {
+ options.compression = compression_types[0];
+ }
+ }
+#else
+ // GetSupportedCompressions() is not available in LITE build
+ if (!Snappy_Supported()) {
+ compression_enabled_ = false;
+ }
+#endif  // ROCKSDB_LITE
+
+ table_options.block_cache = uncompressed_cache_;
+ if (table_options.block_cache == nullptr) {
+ table_options.no_block_cache = true;
+ } else {
+ table_options.pin_l0_filter_and_index_blocks_in_cache = true;
+ }
+ table_options.block_cache_compressed = compressed_cache_;
+ table_options.flush_block_policy_factory.reset(
+ new MyFlushBlockPolicyFactory());
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ if (!compression_enabled_) {
+ options.compression = kNoCompression;
+ }
+ Reopen(options);
+
+ std::string zero_str(128, '\0');
+ for (int i = 0; i < 100; ++i) {
+ // Make the value compressible. A purely random string doesn't compress
+ // and the resultant data block will not be compressed
+ values_.emplace_back(RandomString(&rnd, 128) + zero_str);
+ assert(Put(Key(i), values_[i]) == Status::OK());
+ }
+ Flush();
+
+ for (int i = 0; i < 100; ++i) {
+ // block cannot gain space by compression
+ uncompressable_values_.emplace_back(RandomString(&rnd, 256) + '\0');
+ std::string tmp_key = "a" + Key(i);
+ assert(Put(tmp_key, uncompressable_values_[i]) == Status::OK());
+ }
+ Flush();
+ }
+
+ bool CheckValue(int i, const std::string& value) {
+ if (values_[i].compare(value) == 0) {
+ return true;
+ }
+ return false;
+ }
+
+ bool CheckUncompressableValue(int i, const std::string& value) {
+ if (uncompressable_values_[i].compare(value) == 0) {
+ return true;
+ }
+ return false;
+ }
+
+ int num_lookups() { return uncompressed_cache_->num_lookups(); }
+ int num_found() { return uncompressed_cache_->num_found(); }
+ int num_inserts() { return uncompressed_cache_->num_inserts(); }
+
+ int num_lookups_compressed() { return compressed_cache_->num_lookups(); }
+ int num_found_compressed() { return compressed_cache_->num_found(); }
+ int num_inserts_compressed() { return compressed_cache_->num_inserts(); }
+
+ bool fill_cache() { return fill_cache_; }
+ bool compression_enabled() { return compression_enabled_; }
+ bool has_compressed_cache() { return compressed_cache_ != nullptr; }
+ bool has_uncompressed_cache() { return uncompressed_cache_ != nullptr; }
+
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ private:
+ class MyFlushBlockPolicyFactory : public FlushBlockPolicyFactory {
+ public:
+ MyFlushBlockPolicyFactory() {}
+
+ virtual const char* Name() const override {
+ return "MyFlushBlockPolicyFactory";
+ }
+
+ virtual FlushBlockPolicy* NewFlushBlockPolicy(
+ const BlockBasedTableOptions& /*table_options*/,
+ const BlockBuilder& data_block_builder) const override {
+ return new MyFlushBlockPolicy(data_block_builder);
+ }
+ };
+
+ class MyFlushBlockPolicy : public FlushBlockPolicy {
+ public:
+ explicit MyFlushBlockPolicy(const BlockBuilder& data_block_builder)
+ : num_keys_(0), data_block_builder_(data_block_builder) {}
+
+ bool Update(const Slice& /*key*/, const Slice& /*value*/) override {
+ if (data_block_builder_.empty()) {
+ // First key in this block
+ num_keys_ = 1;
+ return false;
+ }
+ // Flush every 10 keys
+ if (num_keys_ == 10) {
+ num_keys_ = 1;
+ return true;
+ }
+ num_keys_++;
+ return false;
+ }
+
+ private:
+ int num_keys_;
+ const BlockBuilder& data_block_builder_;
+ };
+
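+ // Cache wrapper that forwards every operation to the wrapped cache while
+ // counting lookups, hits and inserts, so the tests can tell how many blocks
+ // were served from cache versus read from the table files.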
+ class MyBlockCache : public Cache {
+ public:
+ explicit MyBlockCache(std::shared_ptr<Cache>& target)
+ : target_(target), num_lookups_(0), num_found_(0), num_inserts_(0) {}
+
+ virtual const char* Name() const override { return "MyBlockCache"; }
+
+ virtual Status Insert(const Slice& key, void* value, size_t charge,
+ void (*deleter)(const Slice& key, void* value),
+ Handle** handle = nullptr,
+ Priority priority = Priority::LOW) override {
+ num_inserts_++;
+ return target_->Insert(key, value, charge, deleter, handle, priority);
+ }
+
+ virtual Handle* Lookup(const Slice& key,
+ Statistics* stats = nullptr) override {
+ num_lookups_++;
+ Handle* handle = target_->Lookup(key, stats);
+ if (handle != nullptr) {
+ num_found_++;
+ }
+ return handle;
+ }
+
+ virtual bool Ref(Handle* handle) override { return target_->Ref(handle); }
+
+ virtual bool Release(Handle* handle, bool force_erase = false) override {
+ return target_->Release(handle, force_erase);
+ }
+
+ virtual void* Value(Handle* handle) override {
+ return target_->Value(handle);
+ }
+
+ virtual void Erase(const Slice& key) override { target_->Erase(key); }
+ virtual uint64_t NewId() override { return target_->NewId(); }
+
+ virtual void SetCapacity(size_t capacity) override {
+ target_->SetCapacity(capacity);
+ }
+
+ virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override {
+ target_->SetStrictCapacityLimit(strict_capacity_limit);
+ }
+
+ virtual bool HasStrictCapacityLimit() const override {
+ return target_->HasStrictCapacityLimit();
+ }
+
+ virtual size_t GetCapacity() const override {
+ return target_->GetCapacity();
+ }
+
+ virtual size_t GetUsage() const override { return target_->GetUsage(); }
+
+ virtual size_t GetUsage(Handle* handle) const override {
+ return target_->GetUsage(handle);
+ }
+
+ virtual size_t GetPinnedUsage() const override {
+ return target_->GetPinnedUsage();
+ }
+
+ virtual size_t GetCharge(Handle* /*handle*/) const override { return 0; }
+
+ virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+ bool thread_safe) override {
+ return target_->ApplyToAllCacheEntries(callback, thread_safe);
+ }
+
+ virtual void EraseUnRefEntries() override {
+ return target_->EraseUnRefEntries();
+ }
+
+ int num_lookups() { return num_lookups_; }
+
+ int num_found() { return num_found_; }
+
+ int num_inserts() { return num_inserts_; }
+
+ private:
+ std::shared_ptr<Cache> target_;
+ int num_lookups_;
+ int num_found_;
+ int num_inserts_;
+ };
+
+ std::shared_ptr<MyBlockCache> compressed_cache_;
+ std::shared_ptr<MyBlockCache> uncompressed_cache_;
+ bool compression_enabled_;
+ std::vector<std::string> values_;
+ std::vector<std::string> uncompressable_values_;
+ bool fill_cache_;
+};
+
+TEST_P(DBBasicTestWithParallelIO, MultiGet) {
+ std::vector<std::string> key_data(10);
+ std::vector<Slice> keys;
+ // We cannot resize a PinnableSlice vector, so just set initial size to
+ // largest we think we will need
+ std::vector<PinnableSlice> values(10);
+ std::vector<Status> statuses;
+ ReadOptions ro;
+ ro.fill_cache = fill_cache();
+
+ // Warm up the cache first
+ key_data.emplace_back(Key(0));
+ keys.emplace_back(Slice(key_data.back()));
+ key_data.emplace_back(Key(50));
+ keys.emplace_back(Slice(key_data.back()));
+ statuses.resize(keys.size());
+
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ ASSERT_TRUE(CheckValue(0, values[0].ToString()));
+ ASSERT_TRUE(CheckValue(50, values[1].ToString()));
+
+ int random_reads = env_->random_read_counter_.Read();
+ key_data[0] = Key(1);
+ key_data[1] = Key(51);
+ keys[0] = Slice(key_data[0]);
+ keys[1] = Slice(key_data[1]);
+ values[0].Reset();
+ values[1].Reset();
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ ASSERT_TRUE(CheckValue(1, values[0].ToString()));
+ ASSERT_TRUE(CheckValue(51, values[1].ToString()));
+
+ bool read_from_cache = false;
+ if (fill_cache()) {
+ if (has_uncompressed_cache()) {
+ read_from_cache = true;
+ } else if (has_compressed_cache() && compression_enabled()) {
+ read_from_cache = true;
+ }
+ }
+
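+ // Keys 1 and 51 live in the same data blocks as the warm-up keys 0 and 50
+ // (the flush block policy packs 10 keys per block), so no extra file reads
+ // are expected if the warm-up populated a usable cache; otherwise two more
+ // block reads are needed.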
+ int expected_reads = random_reads + (read_from_cache ? 0 : 2);
+ ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+
+ keys.resize(10);
+ statuses.resize(10);
+ std::vector<int> key_ints{1, 2, 15, 16, 55, 81, 82, 83, 84, 85};
+ for (size_t i = 0; i < key_ints.size(); ++i) {
+ key_data[i] = Key(key_ints[i]);
+ keys[i] = Slice(key_data[i]);
+ statuses[i] = Status::OK();
+ values[i].Reset();
+ }
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ for (size_t i = 0; i < key_ints.size(); ++i) {
+ ASSERT_OK(statuses[i]);
+ ASSERT_TRUE(CheckValue(key_ints[i], values[i].ToString()));
+ }
+ if (compression_enabled() && !has_compressed_cache()) {
+ expected_reads += (read_from_cache ? 2 : 3);
+ } else {
+ expected_reads += (read_from_cache ? 2 : 4);
+ }
+ ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+
+ keys.resize(10);
+ statuses.resize(10);
+ std::vector<int> key_uncmp{1, 2, 15, 16, 55, 81, 82, 83, 84, 85};
+ for (size_t i = 0; i < key_uncmp.size(); ++i) {
+ key_data[i] = "a" + Key(key_uncmp[i]);
+ keys[i] = Slice(key_data[i]);
+ statuses[i] = Status::OK();
+ values[i].Reset();
+ }
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ for (size_t i = 0; i < key_uncmp.size(); ++i) {
+ ASSERT_OK(statuses[i]);
+ ASSERT_TRUE(CheckUncompressableValue(key_uncmp[i], values[i].ToString()));
+ }
+ if (compression_enabled() && !has_compressed_cache()) {
+ expected_reads += (read_from_cache ? 3 : 3);
+ } else {
+ expected_reads += (read_from_cache ? 4 : 4);
+ }
+ ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+
+ keys.resize(5);
+ statuses.resize(5);
+ std::vector<int> key_tr{1, 2, 15, 16, 55};
+ for (size_t i = 0; i < key_tr.size(); ++i) {
+ key_data[i] = "a" + Key(key_tr[i]);
+ keys[i] = Slice(key_data[i]);
+ statuses[i] = Status::OK();
+ values[i].Reset();
+ }
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ for (size_t i = 0; i < key_tr.size(); ++i) {
+ ASSERT_OK(statuses[i]);
+ ASSERT_TRUE(CheckUncompressableValue(key_tr[i], values[i].ToString()));
+ }
+ if (compression_enabled() && !has_compressed_cache()) {
+ expected_reads += (read_from_cache ? 0 : 2);
+ ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+ } else {
+ if (has_uncompressed_cache()) {
+ expected_reads += (read_from_cache ? 0 : 3);
+ ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+ } else {
+ // A rare case: even with block compression enabled, some data blocks
+ // are not compressed because of their content. If only the compressed
+ // cache is enabled, those uncompressed blocks are not cached, so block
+ // reads are triggered. The number of reads depends on the compression
+ // algorithm.
+ ASSERT_TRUE(env_->random_read_counter_.Read() >= expected_reads);
+ }
+ }
+}
+
+TEST_P(DBBasicTestWithParallelIO, MultiGetWithChecksumMismatch) {
+ std::vector<std::string> key_data(10);
+ std::vector<Slice> keys;
+ // We cannot resize a PinnableSlice vector, so just set initial size to
+ // largest we think we will need
+ std::vector<PinnableSlice> values(10);
+ std::vector<Status> statuses;
+ int read_count = 0;
+ ReadOptions ro;
+ ro.fill_cache = fill_cache();
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "RetrieveMultipleBlocks:VerifyChecksum", [&](void *status) {
+ Status* s = static_cast<Status*>(status);
+ read_count++;
+ if (read_count == 2) {
+ *s = Status::Corruption();
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // Warm up the cache first
+ key_data.emplace_back(Key(0));
+ keys.emplace_back(Slice(key_data.back()));
+ key_data.emplace_back(Key(50));
+ keys.emplace_back(Slice(key_data.back()));
+ statuses.resize(keys.size());
+
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ ASSERT_TRUE(CheckValue(0, values[0].ToString()));
+ // values[1] is not checked because its block read had Status::Corruption
+ // injected by the sync point callback above.
+ // ASSERT_TRUE(CheckValue(50, values[1].ToString()));
+ ASSERT_EQ(statuses[0], Status::OK());
+ ASSERT_EQ(statuses[1], Status::Corruption());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBBasicTestWithParallelIO, MultiGetWithMissingFile) {
+ std::vector<std::string> key_data(10);
+ std::vector<Slice> keys;
+ // We cannot resize a PinnableSlice vector, so just set initial size to
+ // largest we think we will need
+ std::vector<PinnableSlice> values(10);
+ std::vector<Status> statuses;
+ ReadOptions ro;
+ ro.fill_cache = fill_cache();
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "TableCache::MultiGet:FindTable", [&](void *status) {
+ Status* s = static_cast<Status*>(status);
+ *s = Status::IOError();
+ });
+ // DB open will create table readers unless we reduce the table cache
+ // capacity.
+ // SanitizeOptions will set max_open_files to minimum of 20. Table cache
+ // is allocated with max_open_files - 10 as capacity. So override
+ // max_open_files to 11 so table cache capacity will become 1. This will
+ // prevent file open during DB open and force the file to be opened
+ // during MultiGet
+ SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void *arg) {
+ int* max_open_files = (int*)arg;
+ *max_open_files = 11;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Reopen(CurrentOptions());
+
+ // Warm up the cache first
+ key_data.emplace_back(Key(0));
+ keys.emplace_back(Slice(key_data.back()));
+ key_data.emplace_back(Key(50));
+ keys.emplace_back(Slice(key_data.back()));
+ statuses.resize(keys.size());
+
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ ASSERT_EQ(statuses[0], Status::IOError());
+ ASSERT_EQ(statuses[1], Status::IOError());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(
+ ParallelIO, DBBasicTestWithParallelIO,
+ // Params are as follows -
+ // Param 0 - Compressed cache enabled
+ // Param 1 - Uncompressed cache enabled
+ // Param 2 - Data compression enabled
+ // Param 3 - ReadOptions::fill_cache
+ ::testing::Combine(::testing::Bool(), ::testing::Bool(),
+ ::testing::Bool(), ::testing::Bool()));
+
+class DBBasicTestWithTimestampBase : public DBTestBase {
+ public:
+ explicit DBBasicTestWithTimestampBase(const std::string& dbname)
+ : DBTestBase(dbname) {}
+
+ protected:
+ class TestComparatorBase : public Comparator {
+ public:
+ explicit TestComparatorBase(size_t ts_sz) : Comparator(ts_sz) {}
+
+ const char* Name() const override { return "TestComparator"; }
+
+ void FindShortSuccessor(std::string*) const override {}
+
+ void FindShortestSeparator(std::string*, const Slice&) const override {}
+
+ int Compare(const Slice& a, const Slice& b) const override {
+ int r = CompareWithoutTimestamp(a, b);
+ if (r != 0 || 0 == timestamp_size()) {
+ return r;
+ }
+ return CompareTimestamp(
+ Slice(a.data() + a.size() - timestamp_size(), timestamp_size()),
+ Slice(b.data() + b.size() - timestamp_size(), timestamp_size()));
+ }
+
+ virtual int CompareImpl(const Slice& a, const Slice& b) const = 0;
+
+ int CompareWithoutTimestamp(const Slice& a, const Slice& b) const override {
+ assert(a.size() >= timestamp_size());
+ assert(b.size() >= timestamp_size());
+ Slice k1 = StripTimestampFromUserKey(a, timestamp_size());
+ Slice k2 = StripTimestampFromUserKey(b, timestamp_size());
+
+ return CompareImpl(k1, k2);
+ }
+
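+    // Timestamps are two fixed64 values (low, high), compared in descending
+    // order: a larger timestamp sorts first, so newer versions come before
+    // older ones.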
+ int CompareTimestamp(const Slice& ts1, const Slice& ts2) const override {
+ if (!ts1.data() && !ts2.data()) {
+ return 0;
+ } else if (ts1.data() && !ts2.data()) {
+ return 1;
+ } else if (!ts1.data() && ts2.data()) {
+ return -1;
+ }
+ assert(ts1.size() == ts2.size());
+ uint64_t low1 = 0;
+ uint64_t low2 = 0;
+ uint64_t high1 = 0;
+ uint64_t high2 = 0;
+ auto* ptr1 = const_cast<Slice*>(&ts1);
+ auto* ptr2 = const_cast<Slice*>(&ts2);
+ if (!GetFixed64(ptr1, &low1) || !GetFixed64(ptr1, &high1) ||
+ !GetFixed64(ptr2, &low2) || !GetFixed64(ptr2, &high2)) {
+ assert(false);
+ }
+ if (high1 < high2) {
+ return 1;
+ } else if (high1 > high2) {
+ return -1;
+ }
+ if (low1 < low2) {
+ return 1;
+ } else if (low1 > low2) {
+ return -1;
+ }
+ return 0;
+ }
+ };
+
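+  // Encode (low, high) as two fixed64 values into *ts and return a Slice
+  // over the resulting 16-byte timestamp.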
+ Slice EncodeTimestamp(uint64_t low, uint64_t high, std::string* ts) {
+ assert(nullptr != ts);
+ ts->clear();
+ PutFixed64(ts, low);
+ PutFixed64(ts, high);
+ assert(ts->size() == sizeof(low) + sizeof(high));
+ return Slice(*ts);
+ }
+};
+
+class DBBasicTestWithTimestamp : public DBBasicTestWithTimestampBase {
+ public:
+ DBBasicTestWithTimestamp()
+ : DBBasicTestWithTimestampBase("/db_basic_test_with_timestamp") {}
+
+ protected:
+ class TestComparator : public TestComparatorBase {
+ public:
+ const int kKeyPrefixLength =
+ 3; // 3: length of "key" in generated keys ("key" + std::to_string(j))
+ explicit TestComparator(size_t ts_sz) : TestComparatorBase(ts_sz) {}
+
+ int CompareImpl(const Slice& a, const Slice& b) const override {
+ int n1 = atoi(
+ std::string(a.data() + kKeyPrefixLength, a.size() - kKeyPrefixLength)
+ .c_str());
+ int n2 = atoi(
+ std::string(b.data() + kKeyPrefixLength, b.size() - kKeyPrefixLength)
+ .c_str());
+ return (n1 < n2) ? -1 : (n1 > n2) ? 1 : 0;
+ }
+ };
+};
+
+#ifndef ROCKSDB_LITE
+// A class which remembers the name of each flushed file.
+class FlushedFileCollector : public EventListener {
+ public:
+ FlushedFileCollector() {}
+ ~FlushedFileCollector() override {}
+
+ void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+ InstrumentedMutexLock lock(&mutex_);
+ flushed_files_.push_back(info.file_path);
+ }
+
+ std::vector<std::string> GetFlushedFiles() {
+ std::vector<std::string> result;
+ {
+ InstrumentedMutexLock lock(&mutex_);
+ result = flushed_files_;
+ }
+ return result;
+ }
+
+ void ClearFlushedFiles() {
+ InstrumentedMutexLock lock(&mutex_);
+ flushed_files_.clear();
+ }
+
+ private:
+ std::vector<std::string> flushed_files_;
+ InstrumentedMutex mutex_;
+};
+
+TEST_F(DBBasicTestWithTimestamp, PutAndGetWithCompaction) {
+ const int kNumKeysPerFile = 8192;
+ const size_t kNumTimestamps = 2;
+ const size_t kNumKeysPerTimestamp = (kNumKeysPerFile - 1) / kNumTimestamps;
+ const size_t kSplitPosBase = kNumKeysPerTimestamp / 2;
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.env = env_;
+ options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile));
+
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ std::string tmp;
+ size_t ts_sz = EncodeTimestamp(0, 0, &tmp).size();
+ TestComparator test_cmp(ts_sz);
+ options.comparator = &test_cmp;
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(
+ 10 /*bits_per_key*/, false /*use_block_based_builder*/));
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(2, num_cfs);
+ std::vector<std::string> write_ts_strs(kNumTimestamps);
+ std::vector<std::string> read_ts_strs(kNumTimestamps);
+ std::vector<Slice> write_ts_list;
+ std::vector<Slice> read_ts_list;
+
+ for (size_t i = 0; i != kNumTimestamps; ++i) {
+ write_ts_list.emplace_back(EncodeTimestamp(i * 2, 0, &write_ts_strs[i]));
+ read_ts_list.emplace_back(EncodeTimestamp(1 + i * 2, 0, &read_ts_strs[i]));
+ const Slice& write_ts = write_ts_list.back();
+ WriteOptions wopts;
+ wopts.timestamp = &write_ts;
+ for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+ for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) {
+ ASSERT_OK(Put(cf, "key" + std::to_string(j),
+ "value_" + std::to_string(j) + "_" + std::to_string(i),
+ wopts));
+ if (j == kSplitPosBase + i || j == kNumKeysPerTimestamp - 1) {
+ // flush all keys with the same timestamp to two sst files, split at
+ // incremental positions such that lowerlevel[1].smallest.userkey ==
+ // higherlevel[0].largest.userkey
+ ASSERT_OK(Flush(cf));
+
+          // Compact files (2 at each level) to a lower level such that all
+          // keys with the same timestamp are at one level, with newer
+          // versions at higher levels.
+ CompactionOptions compact_opt;
+ compact_opt.compression = kNoCompression;
+ db_->CompactFiles(compact_opt, handles_[cf],
+ collector->GetFlushedFiles(),
+ static_cast<int>(kNumTimestamps - i));
+ collector->ClearFlushedFiles();
+ }
+ }
+ }
+ }
+ const auto& verify_db_func = [&]() {
+ for (size_t i = 0; i != kNumTimestamps; ++i) {
+ ReadOptions ropts;
+ ropts.timestamp = &read_ts_list[i];
+ for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+ ColumnFamilyHandle* cfh = handles_[cf];
+ for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) {
+ std::string value;
+ ASSERT_OK(db_->Get(ropts, cfh, "key" + std::to_string(j), &value));
+ ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i),
+ value);
+ }
+ }
+ }
+ };
+ verify_db_func();
+}
+#endif // !ROCKSDB_LITE
+
+class DBBasicTestWithTimestampWithParam
+ : public DBBasicTestWithTimestampBase,
+ public testing::WithParamInterface<bool> {
+ public:
+ DBBasicTestWithTimestampWithParam()
+ : DBBasicTestWithTimestampBase(
+ "/db_basic_test_with_timestamp_with_param") {}
+
+ protected:
+ class TestComparator : public TestComparatorBase {
+ private:
+ const Comparator* cmp_without_ts_;
+
+ public:
+ explicit TestComparator(size_t ts_sz)
+ : TestComparatorBase(ts_sz), cmp_without_ts_(nullptr) {
+ cmp_without_ts_ = BytewiseComparator();
+ }
+
+ int CompareImpl(const Slice& a, const Slice& b) const override {
+ return cmp_without_ts_->Compare(a, b);
+ }
+ };
+};
+
+TEST_P(DBBasicTestWithTimestampWithParam, PutAndGet) {
+ const int kNumKeysPerFile = 8192;
+ const size_t kNumTimestamps = 6;
+ bool memtable_only = GetParam();
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.env = env_;
+ options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile));
+ std::string tmp;
+ size_t ts_sz = EncodeTimestamp(0, 0, &tmp).size();
+ TestComparator test_cmp(ts_sz);
+ options.comparator = &test_cmp;
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(
+ 10 /*bits_per_key*/, false /*use_block_based_builder*/));
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ std::vector<CompressionType> compression_types;
+ compression_types.push_back(kNoCompression);
+ if (Zlib_Supported()) {
+ compression_types.push_back(kZlibCompression);
+ }
+#if LZ4_VERSION_NUMBER >= 10400 // r124+
+ compression_types.push_back(kLZ4Compression);
+ compression_types.push_back(kLZ4HCCompression);
+#endif // LZ4_VERSION_NUMBER >= 10400
+ if (ZSTD_Supported()) {
+ compression_types.push_back(kZSTD);
+ }
+
+ // Switch compression dictionary on/off to check key extraction
+ // correctness in kBuffered state
+ std::vector<uint32_t> max_dict_bytes_list = {0, 1 << 14}; // 0 or 16KB
+
+ for (auto compression_type : compression_types) {
+ for (uint32_t max_dict_bytes : max_dict_bytes_list) {
+ options.compression = compression_type;
+ options.compression_opts.max_dict_bytes = max_dict_bytes;
+ if (compression_type == kZSTD) {
+ options.compression_opts.zstd_max_train_bytes = max_dict_bytes;
+ }
+ options.target_file_size_base = 1 << 26; // 64MB
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(2, num_cfs);
+ std::vector<std::string> write_ts_strs(kNumTimestamps);
+ std::vector<std::string> read_ts_strs(kNumTimestamps);
+ std::vector<Slice> write_ts_list;
+ std::vector<Slice> read_ts_list;
+
+ for (size_t i = 0; i != kNumTimestamps; ++i) {
+ write_ts_list.emplace_back(
+ EncodeTimestamp(i * 2, 0, &write_ts_strs[i]));
+ read_ts_list.emplace_back(
+ EncodeTimestamp(1 + i * 2, 0, &read_ts_strs[i]));
+ const Slice& write_ts = write_ts_list.back();
+ WriteOptions wopts;
+ wopts.timestamp = &write_ts;
+ for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+ for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) {
+ ASSERT_OK(Put(
+ cf, "key" + std::to_string(j),
+ "value_" + std::to_string(j) + "_" + std::to_string(i), wopts));
+ }
+ if (!memtable_only) {
+ ASSERT_OK(Flush(cf));
+ }
+ }
+ }
+ const auto& verify_db_func = [&]() {
+ for (size_t i = 0; i != kNumTimestamps; ++i) {
+ ReadOptions ropts;
+ ropts.timestamp = &read_ts_list[i];
+ for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+ ColumnFamilyHandle* cfh = handles_[cf];
+ for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps;
+ ++j) {
+ std::string value;
+ ASSERT_OK(
+ db_->Get(ropts, cfh, "key" + std::to_string(j), &value));
+ ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i),
+ value);
+ }
+ }
+ }
+ };
+ verify_db_func();
+ }
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(Timestamp, DBBasicTestWithTimestampWithParam,
+ ::testing::Bool());
+
+} // namespace ROCKSDB_NAMESPACE
+
+#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+extern "C" {
+void RegisterCustomObjects(int argc, char** argv);
+}
+#else
+void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {}
+#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_blob_index_test.cc b/src/rocksdb/db/db_blob_index_test.cc
new file mode 100644
index 000000000..24862f771
--- /dev/null
+++ b/src/rocksdb/db/db_blob_index_test.cc
@@ -0,0 +1,436 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <functional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/column_family.h"
+#include "db/db_iter.h"
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "db/write_batch_internal.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// kTypeBlobIndex is a value type used only by BlobDB. The base rocksdb
+// should accept the value type on write, and report NotSupported on reads
+// unless the caller requests the blob index explicitly. The base rocksdb
+// doesn't understand the format of the actual blob index (the value).
+class DBBlobIndexTest : public DBTestBase {
+ public:
+ enum Tier {
+ kMemtable = 0,
+ kImmutableMemtables = 1,
+ kL0SstFile = 2,
+ kLnSstFile = 3,
+ };
+ const std::vector<Tier> kAllTiers = {Tier::kMemtable,
+ Tier::kImmutableMemtables,
+ Tier::kL0SstFile, Tier::kLnSstFile};
+
+ DBBlobIndexTest() : DBTestBase("/db_blob_index_test") {}
+
+ ColumnFamilyHandle* cfh() { return dbfull()->DefaultColumnFamily(); }
+
+ ColumnFamilyData* cfd() {
+ return reinterpret_cast<ColumnFamilyHandleImpl*>(cfh())->cfd();
+ }
+
+ Status PutBlobIndex(WriteBatch* batch, const Slice& key,
+ const Slice& blob_index) {
+ return WriteBatchInternal::PutBlobIndex(batch, cfd()->GetID(), key,
+ blob_index);
+ }
+
+ Status Write(WriteBatch* batch) {
+ return dbfull()->Write(WriteOptions(), batch);
+ }
+
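+  // Read via DBImpl::GetImpl and map the result to a string: the value on
+  // success, or the "NOT_FOUND" / "NOT_SUPPORTED" sentinels checked by the
+  // tests below.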
+ std::string GetImpl(const Slice& key, bool* is_blob_index = nullptr,
+ const Snapshot* snapshot = nullptr) {
+ ReadOptions read_options;
+ read_options.snapshot = snapshot;
+ PinnableSlice value;
+ DBImpl::GetImplOptions get_impl_options;
+ get_impl_options.column_family = cfh();
+ get_impl_options.value = &value;
+ get_impl_options.is_blob_index = is_blob_index;
+ auto s = dbfull()->GetImpl(read_options, key, get_impl_options);
+ if (s.IsNotFound()) {
+ return "NOT_FOUND";
+ }
+ if (s.IsNotSupported()) {
+ return "NOT_SUPPORTED";
+ }
+ if (!s.ok()) {
+ return s.ToString();
+ }
+ return value.ToString();
+ }
+
+ std::string GetBlobIndex(const Slice& key,
+ const Snapshot* snapshot = nullptr) {
+ bool is_blob_index = false;
+ std::string value = GetImpl(key, &is_blob_index, snapshot);
+ if (!is_blob_index) {
+ return "NOT_BLOB";
+ }
+ return value;
+ }
+
+ ArenaWrappedDBIter* GetBlobIterator() {
+ return dbfull()->NewIteratorImpl(
+ ReadOptions(), cfd(), dbfull()->GetLatestSequenceNumber(),
+ nullptr /*read_callback*/, true /*allow_blob*/);
+ }
+
+ Options GetTestOptions() {
+ Options options;
+ options.create_if_missing = true;
+ options.num_levels = 2;
+ options.disable_auto_compactions = true;
+ // Disable auto flushes.
+ options.max_write_buffer_number = 10;
+ options.min_write_buffer_number_to_merge = 10;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ return options;
+ }
+
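+  // Move the data written so far into the requested tier: keep it in the
+  // active memtable, switch to an immutable memtable, flush to L0, or flush
+  // and compact into the last level (L1 here, since num_levels == 2).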
+ void MoveDataTo(Tier tier) {
+ switch (tier) {
+ case Tier::kMemtable:
+ break;
+ case Tier::kImmutableMemtables:
+ ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+ break;
+ case Tier::kL0SstFile:
+ ASSERT_OK(Flush());
+ break;
+ case Tier::kLnSstFile:
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("a", "dummy"));
+ ASSERT_OK(Put("z", "dummy"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,1", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+ break;
+ }
+ }
+};
+
+// Should be able to write kTypeBlobIndex to memtables and SST files.
+TEST_F(DBBlobIndexTest, Write) {
+ for (auto tier : kAllTiers) {
+ DestroyAndReopen(GetTestOptions());
+ for (int i = 1; i <= 5; i++) {
+ std::string index = ToString(i);
+ WriteBatch batch;
+ ASSERT_OK(PutBlobIndex(&batch, "key" + index, "blob" + index));
+ ASSERT_OK(Write(&batch));
+ }
+ MoveDataTo(tier);
+ for (int i = 1; i <= 5; i++) {
+ std::string index = ToString(i);
+ ASSERT_EQ("blob" + index, GetBlobIndex("key" + index));
+ }
+ }
+}
+
+// Get should be able to return the blob index if is_blob_index is provided;
+// otherwise it should return a Status::NotSupported status.
+TEST_F(DBBlobIndexTest, Get) {
+ for (auto tier : kAllTiers) {
+ DestroyAndReopen(GetTestOptions());
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("key", "value"));
+ ASSERT_OK(PutBlobIndex(&batch, "blob_key", "blob_index"));
+ ASSERT_OK(Write(&batch));
+ MoveDataTo(tier);
+ // Verify normal value
+ bool is_blob_index = false;
+ PinnableSlice value;
+ ASSERT_EQ("value", Get("key"));
+ ASSERT_EQ("value", GetImpl("key"));
+ ASSERT_EQ("value", GetImpl("key", &is_blob_index));
+ ASSERT_FALSE(is_blob_index);
+ // Verify blob index
+ ASSERT_TRUE(Get("blob_key", &value).IsNotSupported());
+ ASSERT_EQ("NOT_SUPPORTED", GetImpl("blob_key"));
+ ASSERT_EQ("blob_index", GetImpl("blob_key", &is_blob_index));
+ ASSERT_TRUE(is_blob_index);
+ }
+}
+
+// Get should NOT return Status::NotSupported if the blob index has been
+// overwritten with a normal value.
+TEST_F(DBBlobIndexTest, Updated) {
+ for (auto tier : kAllTiers) {
+ DestroyAndReopen(GetTestOptions());
+ WriteBatch batch;
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(PutBlobIndex(&batch, "key" + ToString(i), "blob_index"));
+ }
+ ASSERT_OK(Write(&batch));
+    // Take a snapshot to prevent the blob values from being purged.
+ const Snapshot* snapshot = dbfull()->GetSnapshot();
+ ASSERT_OK(Put("key1", "new_value"));
+ ASSERT_OK(Merge("key2", "a"));
+ ASSERT_OK(Merge("key2", "b"));
+ ASSERT_OK(Merge("key2", "c"));
+ ASSERT_OK(Delete("key3"));
+ ASSERT_OK(SingleDelete("key4"));
+ ASSERT_OK(Delete("key5"));
+ ASSERT_OK(Merge("key5", "a"));
+ ASSERT_OK(Merge("key5", "b"));
+ ASSERT_OK(Merge("key5", "c"));
+ ASSERT_OK(dbfull()->DeleteRange(WriteOptions(), cfh(), "key6", "key9"));
+ MoveDataTo(tier);
+ for (int i = 0; i < 10; i++) {
+ ASSERT_EQ("blob_index", GetBlobIndex("key" + ToString(i), snapshot));
+ }
+ ASSERT_EQ("new_value", Get("key1"));
+ ASSERT_EQ("NOT_SUPPORTED", GetImpl("key2"));
+ ASSERT_EQ("NOT_FOUND", Get("key3"));
+ ASSERT_EQ("NOT_FOUND", Get("key4"));
+ ASSERT_EQ("a,b,c", GetImpl("key5"));
+ for (int i = 6; i < 9; i++) {
+ ASSERT_EQ("NOT_FOUND", Get("key" + ToString(i)));
+ }
+ ASSERT_EQ("blob_index", GetBlobIndex("key9"));
+ dbfull()->ReleaseSnapshot(snapshot);
+ }
+}
+
+// The iterator should return the blob value if the allow_blob flag is set;
+// otherwise it should return a Status::NotSupported status.
+TEST_F(DBBlobIndexTest, Iterate) {
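+  // Per-key sequences of writes, listed newest first; the fill loop below
+  // applies them in reverse so that data[i][0] ends up as the newest version
+  // of key i.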
+ const std::vector<std::vector<ValueType>> data = {
+ /*00*/ {kTypeValue},
+ /*01*/ {kTypeBlobIndex},
+ /*02*/ {kTypeValue},
+ /*03*/ {kTypeBlobIndex, kTypeValue},
+ /*04*/ {kTypeValue},
+ /*05*/ {kTypeValue, kTypeBlobIndex},
+ /*06*/ {kTypeValue},
+ /*07*/ {kTypeDeletion, kTypeBlobIndex},
+ /*08*/ {kTypeValue},
+ /*09*/ {kTypeSingleDeletion, kTypeBlobIndex},
+ /*10*/ {kTypeValue},
+ /*11*/ {kTypeMerge, kTypeMerge, kTypeMerge, kTypeBlobIndex},
+ /*12*/ {kTypeValue},
+ /*13*/
+ {kTypeMerge, kTypeMerge, kTypeMerge, kTypeDeletion, kTypeBlobIndex},
+ /*14*/ {kTypeValue},
+ /*15*/ {kTypeBlobIndex},
+ /*16*/ {kTypeValue},
+ };
+
+ auto get_key = [](int index) {
+ char buf[20];
+ snprintf(buf, sizeof(buf), "%02d", index);
+ return "key" + std::string(buf);
+ };
+
+ auto get_value = [&](int index, int version) {
+ return get_key(index) + "_value" + ToString(version);
+ };
+
+ auto check_iterator = [&](Iterator* iterator, Status::Code expected_status,
+ const Slice& expected_value) {
+ ASSERT_EQ(expected_status, iterator->status().code());
+ if (expected_status == Status::kOk) {
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ(expected_value, iterator->value());
+ } else {
+ ASSERT_FALSE(iterator->Valid());
+ }
+ };
+
+ auto create_normal_iterator = [&]() -> Iterator* {
+ return dbfull()->NewIterator(ReadOptions());
+ };
+
+ auto create_blob_iterator = [&]() -> Iterator* { return GetBlobIterator(); };
+
+ auto check_is_blob = [&](bool is_blob) {
+ return [is_blob](Iterator* iterator) {
+ ASSERT_EQ(is_blob,
+ reinterpret_cast<ArenaWrappedDBIter*>(iterator)->IsBlob());
+ };
+ };
+
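+  // Exercise Seek, Next, SeekForPrev and Prev around the key at `index` and
+  // check the resulting iterator status and value in each direction.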
+ auto verify = [&](int index, Status::Code expected_status,
+ const Slice& forward_value, const Slice& backward_value,
+ std::function<Iterator*()> create_iterator,
+ std::function<void(Iterator*)> extra_check = nullptr) {
+ // Seek
+ auto* iterator = create_iterator();
+ ASSERT_OK(iterator->Refresh());
+ iterator->Seek(get_key(index));
+ check_iterator(iterator, expected_status, forward_value);
+ if (extra_check) {
+ extra_check(iterator);
+ }
+ delete iterator;
+
+ // Next
+ iterator = create_iterator();
+ ASSERT_OK(iterator->Refresh());
+ iterator->Seek(get_key(index - 1));
+ ASSERT_TRUE(iterator->Valid());
+ iterator->Next();
+ check_iterator(iterator, expected_status, forward_value);
+ if (extra_check) {
+ extra_check(iterator);
+ }
+ delete iterator;
+
+ // SeekForPrev
+ iterator = create_iterator();
+ ASSERT_OK(iterator->Refresh());
+ iterator->SeekForPrev(get_key(index));
+ check_iterator(iterator, expected_status, backward_value);
+ if (extra_check) {
+ extra_check(iterator);
+ }
+ delete iterator;
+
+ // Prev
+ iterator = create_iterator();
+ iterator->Seek(get_key(index + 1));
+ ASSERT_TRUE(iterator->Valid());
+ iterator->Prev();
+ check_iterator(iterator, expected_status, backward_value);
+ if (extra_check) {
+ extra_check(iterator);
+ }
+ delete iterator;
+ };
+
+ for (auto tier : {Tier::kMemtable} /*kAllTiers*/) {
+    // Take snapshots to prevent the values from being purged.
+ std::vector<const Snapshot*> snapshots;
+ DestroyAndReopen(GetTestOptions());
+
+ // fill data
+ for (int i = 0; i < static_cast<int>(data.size()); i++) {
+ for (int j = static_cast<int>(data[i].size()) - 1; j >= 0; j--) {
+ std::string key = get_key(i);
+ std::string value = get_value(i, j);
+ WriteBatch batch;
+ switch (data[i][j]) {
+ case kTypeValue:
+ ASSERT_OK(Put(key, value));
+ break;
+ case kTypeDeletion:
+ ASSERT_OK(Delete(key));
+ break;
+ case kTypeSingleDeletion:
+ ASSERT_OK(SingleDelete(key));
+ break;
+ case kTypeMerge:
+ ASSERT_OK(Merge(key, value));
+ break;
+ case kTypeBlobIndex:
+ ASSERT_OK(PutBlobIndex(&batch, key, value));
+ ASSERT_OK(Write(&batch));
+ break;
+ default:
+ assert(false);
+ };
+ }
+ snapshots.push_back(dbfull()->GetSnapshot());
+ }
+ ASSERT_OK(
+ dbfull()->DeleteRange(WriteOptions(), cfh(), get_key(15), get_key(16)));
+ snapshots.push_back(dbfull()->GetSnapshot());
+ MoveDataTo(tier);
+
+ // Normal iterator
+ verify(1, Status::kNotSupported, "", "", create_normal_iterator);
+ verify(3, Status::kNotSupported, "", "", create_normal_iterator);
+ verify(5, Status::kOk, get_value(5, 0), get_value(5, 0),
+ create_normal_iterator);
+ verify(7, Status::kOk, get_value(8, 0), get_value(6, 0),
+ create_normal_iterator);
+ verify(9, Status::kOk, get_value(10, 0), get_value(8, 0),
+ create_normal_iterator);
+ verify(11, Status::kNotSupported, "", "", create_normal_iterator);
+ verify(13, Status::kOk,
+ get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+ get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+ create_normal_iterator);
+ verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
+ create_normal_iterator);
+
+ // Iterator with blob support
+ verify(1, Status::kOk, get_value(1, 0), get_value(1, 0),
+ create_blob_iterator, check_is_blob(true));
+ verify(3, Status::kOk, get_value(3, 0), get_value(3, 0),
+ create_blob_iterator, check_is_blob(true));
+ verify(5, Status::kOk, get_value(5, 0), get_value(5, 0),
+ create_blob_iterator, check_is_blob(false));
+ verify(7, Status::kOk, get_value(8, 0), get_value(6, 0),
+ create_blob_iterator, check_is_blob(false));
+ verify(9, Status::kOk, get_value(10, 0), get_value(8, 0),
+ create_blob_iterator, check_is_blob(false));
+ verify(11, Status::kNotSupported, "", "", create_blob_iterator);
+ verify(13, Status::kOk,
+ get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+ get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+ create_blob_iterator, check_is_blob(false));
+ verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
+ create_blob_iterator, check_is_blob(false));
+
+#ifndef ROCKSDB_LITE
+ // Iterator with blob support and using seek.
+ ASSERT_OK(dbfull()->SetOptions(
+ cfh(), {{"max_sequential_skip_in_iterations", "0"}}));
+ verify(1, Status::kOk, get_value(1, 0), get_value(1, 0),
+ create_blob_iterator, check_is_blob(true));
+ verify(3, Status::kOk, get_value(3, 0), get_value(3, 0),
+ create_blob_iterator, check_is_blob(true));
+ verify(5, Status::kOk, get_value(5, 0), get_value(5, 0),
+ create_blob_iterator, check_is_blob(false));
+ verify(7, Status::kOk, get_value(8, 0), get_value(6, 0),
+ create_blob_iterator, check_is_blob(false));
+ verify(9, Status::kOk, get_value(10, 0), get_value(8, 0),
+ create_blob_iterator, check_is_blob(false));
+ verify(11, Status::kNotSupported, "", "", create_blob_iterator);
+ verify(13, Status::kOk,
+ get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+ get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+ create_blob_iterator, check_is_blob(false));
+ verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
+ create_blob_iterator, check_is_blob(false));
+#endif // !ROCKSDB_LITE
+
+ for (auto* snapshot : snapshots) {
+ dbfull()->ReleaseSnapshot(snapshot);
+ }
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_block_cache_test.cc b/src/rocksdb/db/db_block_cache_test.cc
new file mode 100644
index 000000000..3031e56bb
--- /dev/null
+++ b/src/rocksdb/db/db_block_cache_test.cc
@@ -0,0 +1,761 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <cstdlib>
+#include "cache/lru_cache.h"
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "util/compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBBlockCacheTest : public DBTestBase {
+ private:
+ size_t miss_count_ = 0;
+ size_t hit_count_ = 0;
+ size_t insert_count_ = 0;
+ size_t failure_count_ = 0;
+ size_t compression_dict_miss_count_ = 0;
+ size_t compression_dict_hit_count_ = 0;
+ size_t compression_dict_insert_count_ = 0;
+ size_t compressed_miss_count_ = 0;
+ size_t compressed_hit_count_ = 0;
+ size_t compressed_insert_count_ = 0;
+ size_t compressed_failure_count_ = 0;
+
+ public:
+ const size_t kNumBlocks = 10;
+ const size_t kValueSize = 100;
+
+ DBBlockCacheTest() : DBTestBase("/db_block_cache_test") {}
+
+ BlockBasedTableOptions GetTableOptions() {
+ BlockBasedTableOptions table_options;
+ // Set a small enough block size so that each key-value get its own block.
+ table_options.block_size = 1;
+ return table_options;
+ }
+
+ Options GetOptions(const BlockBasedTableOptions& table_options) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.avoid_flush_during_recovery = false;
+ // options.compression = kNoCompression;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ return options;
+ }
+
+ void InitTable(const Options& /*options*/) {
+ std::string value(kValueSize, 'a');
+ for (size_t i = 0; i < kNumBlocks; i++) {
+ ASSERT_OK(Put(ToString(i), value.c_str()));
+ }
+ }
+
+ void RecordCacheCounters(const Options& options) {
+ miss_count_ = TestGetTickerCount(options, BLOCK_CACHE_MISS);
+ hit_count_ = TestGetTickerCount(options, BLOCK_CACHE_HIT);
+ insert_count_ = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ failure_count_ = TestGetTickerCount(options, BLOCK_CACHE_ADD_FAILURES);
+ compressed_miss_count_ =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS);
+ compressed_hit_count_ =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT);
+ compressed_insert_count_ =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD);
+ compressed_failure_count_ =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
+ }
+
+ void RecordCacheCountersForCompressionDict(const Options& options) {
+ compression_dict_miss_count_ =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS);
+ compression_dict_hit_count_ =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_HIT);
+ compression_dict_insert_count_ =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD);
+ }
+
+ void CheckCacheCounters(const Options& options, size_t expected_misses,
+ size_t expected_hits, size_t expected_inserts,
+ size_t expected_failures) {
+ size_t new_miss_count = TestGetTickerCount(options, BLOCK_CACHE_MISS);
+ size_t new_hit_count = TestGetTickerCount(options, BLOCK_CACHE_HIT);
+ size_t new_insert_count = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ size_t new_failure_count =
+ TestGetTickerCount(options, BLOCK_CACHE_ADD_FAILURES);
+ ASSERT_EQ(miss_count_ + expected_misses, new_miss_count);
+ ASSERT_EQ(hit_count_ + expected_hits, new_hit_count);
+ ASSERT_EQ(insert_count_ + expected_inserts, new_insert_count);
+ ASSERT_EQ(failure_count_ + expected_failures, new_failure_count);
+ miss_count_ = new_miss_count;
+ hit_count_ = new_hit_count;
+ insert_count_ = new_insert_count;
+ failure_count_ = new_failure_count;
+ }
+
+ void CheckCacheCountersForCompressionDict(
+ const Options& options, size_t expected_compression_dict_misses,
+ size_t expected_compression_dict_hits,
+ size_t expected_compression_dict_inserts) {
+ size_t new_compression_dict_miss_count =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS);
+ size_t new_compression_dict_hit_count =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_HIT);
+ size_t new_compression_dict_insert_count =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD);
+ ASSERT_EQ(compression_dict_miss_count_ + expected_compression_dict_misses,
+ new_compression_dict_miss_count);
+ ASSERT_EQ(compression_dict_hit_count_ + expected_compression_dict_hits,
+ new_compression_dict_hit_count);
+ ASSERT_EQ(
+ compression_dict_insert_count_ + expected_compression_dict_inserts,
+ new_compression_dict_insert_count);
+ compression_dict_miss_count_ = new_compression_dict_miss_count;
+ compression_dict_hit_count_ = new_compression_dict_hit_count;
+ compression_dict_insert_count_ = new_compression_dict_insert_count;
+ }
+
+ void CheckCompressedCacheCounters(const Options& options,
+ size_t expected_misses,
+ size_t expected_hits,
+ size_t expected_inserts,
+ size_t expected_failures) {
+ size_t new_miss_count =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS);
+ size_t new_hit_count =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT);
+ size_t new_insert_count =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD);
+ size_t new_failure_count =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
+ ASSERT_EQ(compressed_miss_count_ + expected_misses, new_miss_count);
+ ASSERT_EQ(compressed_hit_count_ + expected_hits, new_hit_count);
+ ASSERT_EQ(compressed_insert_count_ + expected_inserts, new_insert_count);
+ ASSERT_EQ(compressed_failure_count_ + expected_failures, new_failure_count);
+ compressed_miss_count_ = new_miss_count;
+ compressed_hit_count_ = new_hit_count;
+ compressed_insert_count_ = new_insert_count;
+ compressed_failure_count_ = new_failure_count;
+ }
+};
+
+TEST_F(DBBlockCacheTest, IteratorBlockCacheUsage) {
+ ReadOptions read_options;
+ read_options.fill_cache = false;
+ auto table_options = GetTableOptions();
+ auto options = GetOptions(table_options);
+ InitTable(options);
+
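+  // Use a zero-capacity LRU cache so that cache usage reflects only the
+  // blocks pinned by the live iterator.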
+ std::shared_ptr<Cache> cache = NewLRUCache(0, 0, false);
+ table_options.block_cache = cache;
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ Reopen(options);
+ RecordCacheCounters(options);
+
+ std::vector<std::unique_ptr<Iterator>> iterators(kNumBlocks - 1);
+ Iterator* iter = nullptr;
+
+ ASSERT_EQ(0, cache->GetUsage());
+ iter = db_->NewIterator(read_options);
+ iter->Seek(ToString(0));
+ ASSERT_LT(0, cache->GetUsage());
+ delete iter;
+ iter = nullptr;
+ ASSERT_EQ(0, cache->GetUsage());
+}
+
+TEST_F(DBBlockCacheTest, TestWithoutCompressedBlockCache) {
+ ReadOptions read_options;
+ auto table_options = GetTableOptions();
+ auto options = GetOptions(table_options);
+ InitTable(options);
+
+ std::shared_ptr<Cache> cache = NewLRUCache(0, 0, false);
+ table_options.block_cache = cache;
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ Reopen(options);
+ RecordCacheCounters(options);
+
+ std::vector<std::unique_ptr<Iterator>> iterators(kNumBlocks - 1);
+ Iterator* iter = nullptr;
+
+ // Load blocks into cache.
+ for (size_t i = 0; i < kNumBlocks - 1; i++) {
+ iter = db_->NewIterator(read_options);
+ iter->Seek(ToString(i));
+ ASSERT_OK(iter->status());
+ CheckCacheCounters(options, 1, 0, 1, 0);
+ iterators[i].reset(iter);
+ }
+ size_t usage = cache->GetUsage();
+ ASSERT_LT(0, usage);
+ cache->SetCapacity(usage);
+ ASSERT_EQ(usage, cache->GetPinnedUsage());
+
+ // Test with strict capacity limit.
+ cache->SetStrictCapacityLimit(true);
+ iter = db_->NewIterator(read_options);
+ iter->Seek(ToString(kNumBlocks - 1));
+ ASSERT_TRUE(iter->status().IsIncomplete());
+ CheckCacheCounters(options, 1, 0, 0, 1);
+ delete iter;
+ iter = nullptr;
+
+ // Release iterators and access cache again.
+ for (size_t i = 0; i < kNumBlocks - 1; i++) {
+ iterators[i].reset();
+ CheckCacheCounters(options, 0, 0, 0, 0);
+ }
+ ASSERT_EQ(0, cache->GetPinnedUsage());
+ for (size_t i = 0; i < kNumBlocks - 1; i++) {
+ iter = db_->NewIterator(read_options);
+ iter->Seek(ToString(i));
+ ASSERT_OK(iter->status());
+ CheckCacheCounters(options, 0, 1, 0, 0);
+ iterators[i].reset(iter);
+ }
+}
+
+#ifdef SNAPPY
+TEST_F(DBBlockCacheTest, TestWithCompressedBlockCache) {
+ ReadOptions read_options;
+ auto table_options = GetTableOptions();
+ auto options = GetOptions(table_options);
+ options.compression = CompressionType::kSnappyCompression;
+ InitTable(options);
+
+ std::shared_ptr<Cache> cache = NewLRUCache(0, 0, false);
+ std::shared_ptr<Cache> compressed_cache = NewLRUCache(1 << 25, 0, false);
+ table_options.block_cache = cache;
+ table_options.block_cache_compressed = compressed_cache;
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ Reopen(options);
+ RecordCacheCounters(options);
+
+ std::vector<std::unique_ptr<Iterator>> iterators(kNumBlocks - 1);
+ Iterator* iter = nullptr;
+
+ // Load blocks into cache.
+ for (size_t i = 0; i < kNumBlocks - 1; i++) {
+ iter = db_->NewIterator(read_options);
+ iter->Seek(ToString(i));
+ ASSERT_OK(iter->status());
+ CheckCacheCounters(options, 1, 0, 1, 0);
+ CheckCompressedCacheCounters(options, 1, 0, 1, 0);
+ iterators[i].reset(iter);
+ }
+ size_t usage = cache->GetUsage();
+ ASSERT_LT(0, usage);
+ ASSERT_EQ(usage, cache->GetPinnedUsage());
+ size_t compressed_usage = compressed_cache->GetUsage();
+ ASSERT_LT(0, compressed_usage);
+ // Compressed block cache cannot be pinned.
+ ASSERT_EQ(0, compressed_cache->GetPinnedUsage());
+
+ // Set strict capacity limit flag. Now block will only load into compressed
+ // block cache.
+ cache->SetCapacity(usage);
+ cache->SetStrictCapacityLimit(true);
+ ASSERT_EQ(usage, cache->GetPinnedUsage());
+ iter = db_->NewIterator(read_options);
+ iter->Seek(ToString(kNumBlocks - 1));
+ ASSERT_TRUE(iter->status().IsIncomplete());
+ CheckCacheCounters(options, 1, 0, 0, 1);
+ CheckCompressedCacheCounters(options, 1, 0, 1, 0);
+ delete iter;
+ iter = nullptr;
+
+ // Clear strict capacity limit flag. This time we shall hit compressed block
+ // cache.
+ cache->SetStrictCapacityLimit(false);
+ iter = db_->NewIterator(read_options);
+ iter->Seek(ToString(kNumBlocks - 1));
+ ASSERT_OK(iter->status());
+ CheckCacheCounters(options, 1, 0, 1, 0);
+ CheckCompressedCacheCounters(options, 0, 1, 0, 0);
+ delete iter;
+ iter = nullptr;
+}
+#endif // SNAPPY
+
+#ifndef ROCKSDB_LITE
+
+// Make sure that when options.block_cache is set, after a new table is
+// created its index/filter blocks are added to block cache.
+TEST_F(DBBlockCacheTest, IndexAndFilterBlocksOfNewTableAddedToCache) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "key", "val"));
+ // Create a new table.
+ ASSERT_OK(Flush(1));
+
+ // index/filter blocks added to block cache right after table creation.
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(2, /* only index/filter were added */
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
+ uint64_t int_num;
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_EQ(int_num, 0U);
+
+ // Make sure filter block is in cache.
+ std::string value;
+ ReadOptions ropt;
+ db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value);
+
+ // Miss count should remain the same.
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+ db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value);
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+ // Make sure index block is in cache.
+ auto index_block_hit = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT);
+ value = Get(1, "key");
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(index_block_hit + 1,
+ TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+ value = Get(1, "key");
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(index_block_hit + 2,
+ TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+}
+
+// With fill_cache = false, fill up the cache and then iterate over the entire
+// db, verifying that the dummy entries inserted in
+// `BlockBasedTable::NewDataBlockIterator` do not cause heap-use-after-free
+// errors in COMPILE_WITH_ASAN=1 runs.
+TEST_F(DBBlockCacheTest, FillCacheAndIterateDB) {
+ ReadOptions read_options;
+ read_options.fill_cache = false;
+ auto table_options = GetTableOptions();
+ auto options = GetOptions(table_options);
+ InitTable(options);
+
+ std::shared_ptr<Cache> cache = NewLRUCache(10, 0, true);
+ table_options.block_cache = cache;
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_OK(Put("key1", "val1"));
+ ASSERT_OK(Put("key2", "val2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("key3", "val3"));
+ ASSERT_OK(Put("key4", "val4"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("key5", "val5"));
+ ASSERT_OK(Put("key6", "val6"));
+ ASSERT_OK(Flush());
+
+ Iterator* iter = nullptr;
+
+ iter = db_->NewIterator(read_options);
+ iter->Seek(ToString(0));
+ while (iter->Valid()) {
+ iter->Next();
+ }
+ delete iter;
+ iter = nullptr;
+}
+
+TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ LRUCacheOptions co;
+ // 500 bytes are enough to hold the first two blocks
+ co.capacity = 500;
+ co.num_shard_bits = 0;
+ co.strict_capacity_limit = false;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ std::shared_ptr<Cache> cache = NewLRUCache(co);
+ table_options.block_cache = cache;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20, true));
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "longer_key", "val"));
+ // Create a new table
+ ASSERT_OK(Flush(1));
+ size_t index_bytes_insert =
+ TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_INSERT);
+ size_t filter_bytes_insert =
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_INSERT);
+ ASSERT_GT(index_bytes_insert, 0);
+ ASSERT_GT(filter_bytes_insert, 0);
+ ASSERT_EQ(cache->GetUsage(), index_bytes_insert + filter_bytes_insert);
+ // set the cache capacity to the current usage
+ cache->SetCapacity(index_bytes_insert + filter_bytes_insert);
+ // The index and filter eviction statistics were broken by the refactoring
+ // that moved the readers out of the block cache. Disabling these until we can
+ // bring the stats back.
+ // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), 0);
+ // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), 0);
+ // Note that the second key needs to be no longer than the first one.
+ // Otherwise the second index block may not fit in cache.
+ ASSERT_OK(Put(1, "key", "val"));
+ // Create a new table
+ ASSERT_OK(Flush(1));
+  // The cache should have evicted the old index and filter entries.
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_INSERT),
+ index_bytes_insert);
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_INSERT),
+ filter_bytes_insert);
+ // The index and filter eviction statistics were broken by the refactoring
+ // that moved the readers out of the block cache. Disabling these until we can
+ // bring the stats back.
+ // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT),
+ // index_bytes_insert);
+ // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT),
+ // filter_bytes_insert);
+}
+
+namespace {
+
+// A mock cache that wraps LRUCache and records how many entries have been
+// inserted for each priority.
+class MockCache : public LRUCache {
+ public:
+ static uint32_t high_pri_insert_count;
+ static uint32_t low_pri_insert_count;
+
+ MockCache()
+ : LRUCache((size_t)1 << 25 /*capacity*/, 0 /*num_shard_bits*/,
+ false /*strict_capacity_limit*/, 0.0 /*high_pri_pool_ratio*/) {
+ }
+
+ Status Insert(const Slice& key, void* value, size_t charge,
+ void (*deleter)(const Slice& key, void* value), Handle** handle,
+ Priority priority) override {
+ if (priority == Priority::LOW) {
+ low_pri_insert_count++;
+ } else {
+ high_pri_insert_count++;
+ }
+ return LRUCache::Insert(key, value, charge, deleter, handle, priority);
+ }
+};
+
+uint32_t MockCache::high_pri_insert_count = 0;
+uint32_t MockCache::low_pri_insert_count = 0;
+
+} // anonymous namespace
+
+TEST_F(DBBlockCacheTest, IndexAndFilterBlocksCachePriority) {
+ for (auto priority : {Cache::Priority::LOW, Cache::Priority::HIGH}) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.block_cache.reset(new MockCache());
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+ table_options.cache_index_and_filter_blocks_with_high_priority =
+ priority == Cache::Priority::HIGH ? true : false;
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ MockCache::high_pri_insert_count = 0;
+ MockCache::low_pri_insert_count = 0;
+
+ // Create a new table.
+ ASSERT_OK(Put("foo", "value"));
+ ASSERT_OK(Put("bar", "value"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ // index/filter blocks added to block cache right after table creation.
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(2, /* only index/filter were added */
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
+ if (priority == Cache::Priority::LOW) {
+ ASSERT_EQ(0u, MockCache::high_pri_insert_count);
+ ASSERT_EQ(2u, MockCache::low_pri_insert_count);
+ } else {
+ ASSERT_EQ(2u, MockCache::high_pri_insert_count);
+ ASSERT_EQ(0u, MockCache::low_pri_insert_count);
+ }
+
+ // Access data block.
+ ASSERT_EQ("value", Get("foo"));
+
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(3, /*adding data block*/
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
+
+ // Data block should be inserted with low priority.
+ if (priority == Cache::Priority::LOW) {
+ ASSERT_EQ(0u, MockCache::high_pri_insert_count);
+ ASSERT_EQ(3u, MockCache::low_pri_insert_count);
+ } else {
+ ASSERT_EQ(2u, MockCache::high_pri_insert_count);
+ ASSERT_EQ(1u, MockCache::low_pri_insert_count);
+ }
+ }
+}
+
+TEST_F(DBBlockCacheTest, ParanoidFileChecks) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.level0_file_num_compaction_trigger = 2;
+ options.paranoid_file_checks = true;
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = false;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "1_key", "val"));
+ ASSERT_OK(Put(1, "9_key", "val"));
+ // Create a new table.
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(1, /* read and cache data block */
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ ASSERT_OK(Put(1, "1_key2", "val2"));
+ ASSERT_OK(Put(1, "9_key2", "val2"));
+ // Create a new SST file. This will further trigger a compaction
+ // and generate another file.
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(3, /* Totally 3 files created up to now */
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+  // After disabling options.paranoid_file_checks, NO further block is added
+  // when a new file is generated.
+ ASSERT_OK(
+ dbfull()->SetOptions(handles_[1], {{"paranoid_file_checks", "false"}}));
+
+ ASSERT_OK(Put(1, "1_key3", "val3"));
+ ASSERT_OK(Put(1, "9_key3", "val3"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "1_key4", "val4"));
+ ASSERT_OK(Put(1, "9_key4", "val4"));
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(3, /* Totally 3 files created up to now */
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+}
+
+TEST_F(DBBlockCacheTest, CompressedCache) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+ int num_iter = 80;
+
+  // Run this test for four iterations.
+  // Iteration 1: only an uncompressed block cache
+  // Iteration 2: only a compressed block cache
+  // Iteration 3: both block cache and compressed cache
+  // Iteration 4: both block cache and compressed cache, but DB is not
+  //              compressed
+ for (int iter = 0; iter < 4; iter++) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 64 * 1024; // small write buffer
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ BlockBasedTableOptions table_options;
+ switch (iter) {
+ case 0:
+ // only uncompressed block cache
+ table_options.block_cache = NewLRUCache(8 * 1024);
+ table_options.block_cache_compressed = nullptr;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ break;
+ case 1:
+ // no block cache, only compressed cache
+ table_options.no_block_cache = true;
+ table_options.block_cache = nullptr;
+ table_options.block_cache_compressed = NewLRUCache(8 * 1024);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ break;
+ case 2:
+ // both compressed and uncompressed block cache
+ table_options.block_cache = NewLRUCache(1024);
+ table_options.block_cache_compressed = NewLRUCache(8 * 1024);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ break;
+ case 3:
+ // both block cache and compressed cache, but DB is not compressed
+ // also, make block cache sizes bigger, to trigger block cache hits
+ table_options.block_cache = NewLRUCache(1024 * 1024);
+ table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.compression = kNoCompression;
+ break;
+ default:
+ FAIL();
+ }
+ CreateAndReopenWithCF({"pikachu"}, options);
+ // default column family doesn't have block cache
+ Options no_block_cache_opts;
+ no_block_cache_opts.statistics = options.statistics;
+ no_block_cache_opts = CurrentOptions(no_block_cache_opts);
+ BlockBasedTableOptions table_options_no_bc;
+ table_options_no_bc.no_block_cache = true;
+ no_block_cache_opts.table_factory.reset(
+ NewBlockBasedTableFactory(table_options_no_bc));
+ ReopenWithColumnFamilies(
+ {"default", "pikachu"},
+ std::vector<Options>({no_block_cache_opts, options}));
+
+ Random rnd(301);
+
+    // Write 80 values of roughly 1KB each (a new random value every 4 keys)
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ std::vector<std::string> values;
+ std::string str;
+ for (int i = 0; i < num_iter; i++) {
+ if (i % 4 == 0) { // high compression ratio
+ str = RandomString(&rnd, 1000);
+ }
+ values.push_back(str);
+ ASSERT_OK(Put(1, Key(i), values[i]));
+ }
+
+ // flush all data from memtable so that reads are from block cache
+ ASSERT_OK(Flush(1));
+
+ for (int i = 0; i < num_iter; i++) {
+ ASSERT_EQ(Get(1, Key(i)), values[i]);
+ }
+
+ // check that we triggered the appropriate code paths in the cache
+ switch (iter) {
+ case 0:
+ // only uncompressed block cache
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+ break;
+ case 1:
+ // no block cache, only compressed cache
+ ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+ break;
+ case 2:
+ // both compressed and uncompressed block cache
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+ break;
+ case 3:
+ // both compressed and uncompressed block cache
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_HIT), 0);
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+        // the compressed cache doesn't have any hits since blocks are not
+        // compressed on storage
+ ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT), 0);
+ break;
+ default:
+ FAIL();
+ }
+
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ }
+}
+
+TEST_F(DBBlockCacheTest, CacheCompressionDict) {
+ const int kNumFiles = 4;
+ const int kNumEntriesPerFile = 128;
+ const int kNumBytesPerEntry = 1024;
+
+ // Try all the available libraries that support dictionary compression
+ std::vector<CompressionType> compression_types;
+ if (Zlib_Supported()) {
+ compression_types.push_back(kZlibCompression);
+ }
+ if (LZ4_Supported()) {
+ compression_types.push_back(kLZ4Compression);
+ compression_types.push_back(kLZ4HCCompression);
+ }
+ if (ZSTD_Supported()) {
+ compression_types.push_back(kZSTD);
+ } else if (ZSTDNotFinal_Supported()) {
+ compression_types.push_back(kZSTDNotFinalCompression);
+ }
+ Random rnd(301);
+ for (auto compression_type : compression_types) {
+ Options options = CurrentOptions();
+ options.compression = compression_type;
+ options.compression_opts.max_dict_bytes = 4096;
+ options.create_if_missing = true;
+ options.num_levels = 2;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry;
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.block_cache.reset(new MockCache());
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ RecordCacheCountersForCompressionDict(options);
+
+ for (int i = 0; i < kNumFiles; ++i) {
+ ASSERT_EQ(i, NumTableFilesAtLevel(0, 0));
+ for (int j = 0; j < kNumEntriesPerFile; ++j) {
+ std::string value = RandomString(&rnd, kNumBytesPerEntry);
+ ASSERT_OK(Put(Key(j * kNumFiles + i), value.c_str()));
+ }
+ ASSERT_OK(Flush());
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(1));
+
+ // Compression dictionary blocks are preloaded.
+ CheckCacheCountersForCompressionDict(
+ options, kNumFiles /* expected_compression_dict_misses */,
+ 0 /* expected_compression_dict_hits */,
+ kNumFiles /* expected_compression_dict_inserts */);
+
+ // Seek to a key in a file. It should cause the SST's dictionary meta-block
+ // to be read.
+ RecordCacheCounters(options);
+ RecordCacheCountersForCompressionDict(options);
+ ReadOptions read_options;
+ ASSERT_NE("NOT_FOUND", Get(Key(kNumFiles * kNumEntriesPerFile - 1)));
+ // Two block hits: index and dictionary since they are prefetched
+ // One block missed/added: data block
+ CheckCacheCounters(options, 1 /* expected_misses */, 2 /* expected_hits */,
+ 1 /* expected_inserts */, 0 /* expected_failures */);
+ CheckCacheCountersForCompressionDict(
+ options, 0 /* expected_compression_dict_misses */,
+ 1 /* expected_compression_dict_hits */,
+ 0 /* expected_compression_dict_inserts */);
+ }
+}
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_bloom_filter_test.cc b/src/rocksdb/db/db_bloom_filter_test.cc
new file mode 100644
index 000000000..dcad00327
--- /dev/null
+++ b/src/rocksdb/db/db_bloom_filter_test.cc
@@ -0,0 +1,1910 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/perf_context.h"
+#include "table/block_based/filter_policy_internal.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+using BFP = BloomFilterPolicy;
+} // namespace
+
+// DB tests related to bloom filter.
+
+class DBBloomFilterTest : public DBTestBase {
+ public:
+ DBBloomFilterTest() : DBTestBase("/db_bloom_filter_test") {}
+};
+
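+// Parameterized by the Bloom filter implementation (BFP::Mode), whether
+// filters are partitioned, and the table format_version.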
+class DBBloomFilterTestWithParam : public DBTestBase,
+ public testing::WithParamInterface<
+ std::tuple<BFP::Mode, bool, uint32_t>> {
+ // public testing::WithParamInterface<bool> {
+ protected:
+ BFP::Mode bfp_impl_;
+ bool partition_filters_;
+ uint32_t format_version_;
+
+ public:
+ DBBloomFilterTestWithParam() : DBTestBase("/db_bloom_filter_tests") {}
+
+ ~DBBloomFilterTestWithParam() override {}
+
+ void SetUp() override {
+ bfp_impl_ = std::get<0>(GetParam());
+ partition_filters_ = std::get<1>(GetParam());
+ format_version_ = std::get<2>(GetParam());
+ }
+};
+
+class DBBloomFilterTestDefFormatVersion : public DBBloomFilterTestWithParam {};
+
+class SliceTransformLimitedDomainGeneric : public SliceTransform {
+ const char* Name() const override {
+ return "SliceTransformLimitedDomainGeneric";
+ }
+
+ Slice Transform(const Slice& src) const override {
+ return Slice(src.data(), 5);
+ }
+
+ bool InDomain(const Slice& src) const override {
+ // prefix will be x????
+ return src.size() >= 5;
+ }
+
+ bool InRange(const Slice& dst) const override {
+ // prefix will be x????
+ return dst.size() == 5;
+ }
+};
+
+// KeyMayExist can lead to a few false positives, but not false negatives.
+// To make the test deterministic, use a much larger number of bits per key
+// (20) than bits in the key, so that false positives are eliminated.
+TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) {
+ do {
+ ReadOptions ropts;
+ std::string value;
+ anon::OptionsOverride options_override;
+ options_override.filter_policy.reset(new BFP(20, bfp_impl_));
+ options_override.partition_filters = partition_filters_;
+ options_override.metadata_block_size = 32;
+ Options options = CurrentOptions(options_override);
+ if (partition_filters_ &&
+ static_cast<BlockBasedTableOptions*>(
+ options.table_factory->GetOptions())
+ ->index_type != BlockBasedTableOptions::kTwoLevelIndexSearch) {
+ // In the current implementation partitioned filters depend on partitioned
+ // indexes
+ continue;
+ }
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+
+ ASSERT_OK(Put(1, "a", "b"));
+ bool value_found = false;
+ ASSERT_TRUE(
+ db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found));
+ ASSERT_TRUE(value_found);
+ ASSERT_EQ("b", value);
+
+ ASSERT_OK(Flush(1));
+ value.clear();
+
+ uint64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+ uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ ASSERT_TRUE(
+ db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found));
+ ASSERT_TRUE(!value_found);
+ // assert that no new files were opened and no new blocks were
+ // read into block cache.
+ ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+ ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ ASSERT_OK(Delete(1, "a"));
+
+ numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+ cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+ ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+ ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
+ true /* disallow trivial move */);
+
+ numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+ cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+ ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+ ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ ASSERT_OK(Delete(1, "c"));
+
+ numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+ cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "c", &value));
+ ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+ ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    // The KeyMayExist function only checks data in the block cache, which is
+    // not used by the plain table format.
+ } while (
+ ChangeOptions(kSkipPlainTable | kSkipHashIndex | kSkipFIFOCompaction));
+}
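+
+// A minimal sketch of the KeyMayExist() pattern exercised above
+// (illustrative only; `db` and `cf` are assumed to be an open DB and a
+// column family handle):
+//
+//   std::string value;
+//   bool value_found = false;
+//   if (db->KeyMayExist(ReadOptions(), cf, "a", &value, &value_found)) {
+//     // The key may exist; `value` is trustworthy only if value_found is
+//     // true, otherwise a regular Get() is needed to confirm.
+//   } else {
+//     // Definitely absent: KeyMayExist() has no false negatives.
+//   }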
+
+TEST_F(DBBloomFilterTest, GetFilterByPrefixBloomCustomPrefixExtractor) {
+ for (bool partition_filters : {true, false}) {
+ Options options = last_options_;
+ options.prefix_extractor =
+ std::make_shared<SliceTransformLimitedDomainGeneric>();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ get_perf_context()->EnablePerLevelPerfContext();
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ if (partition_filters) {
+ bbto.partition_filters = true;
+ bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ }
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ WriteOptions wo;
+ ReadOptions ro;
+ FlushOptions fo;
+ fo.wait = true;
+ std::string value;
+
+ ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo"));
+ ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2"));
+ ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar"));
+
+ dbfull()->Flush(fo);
+
+ ASSERT_EQ("foo", Get("barbarbar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ ASSERT_EQ(
+ 0,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+ ASSERT_EQ("foo2", Get("barbarbar2"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ ASSERT_EQ(
+ 0,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+ ASSERT_EQ("NOT_FOUND", Get("barbarbar3"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ ASSERT_EQ(
+ 0,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+ ASSERT_EQ("NOT_FOUND", Get("barfoofoo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ(
+ 1,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+ ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
+ ASSERT_EQ(
+ 2,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+ ro.total_order_seek = true;
+ ASSERT_TRUE(db_->Get(ro, "foobarbar", &value).IsNotFound());
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
+ ASSERT_EQ(
+ 2,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+ get_perf_context()->Reset();
+ }
+}
+
+TEST_F(DBBloomFilterTest, GetFilterByPrefixBloom) {
+ for (bool partition_filters : {true, false}) {
+ Options options = last_options_;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ get_perf_context()->EnablePerLevelPerfContext();
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ if (partition_filters) {
+ bbto.partition_filters = true;
+ bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ }
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ WriteOptions wo;
+ ReadOptions ro;
+ FlushOptions fo;
+ fo.wait = true;
+ std::string value;
+
+ ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo"));
+ ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2"));
+ ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar"));
+
+ dbfull()->Flush(fo);
+
+ ASSERT_EQ("foo", Get("barbarbar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ ASSERT_EQ("foo2", Get("barbarbar2"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ ASSERT_EQ("NOT_FOUND", Get("barbarbar3"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+
+ ASSERT_EQ("NOT_FOUND", Get("barfoofoo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+
+ ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
+
+ ro.total_order_seek = true;
+ ASSERT_TRUE(db_->Get(ro, "foobarbar", &value).IsNotFound());
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
+ ASSERT_EQ(
+ 2,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+ get_perf_context()->Reset();
+ }
+}
+
+TEST_F(DBBloomFilterTest, WholeKeyFilterProp) {
+ for (bool partition_filters : {true, false}) {
+ Options options = last_options_;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ get_perf_context()->EnablePerLevelPerfContext();
+
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = false;
+ if (partition_filters) {
+ bbto.partition_filters = true;
+ bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ }
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ WriteOptions wo;
+ ReadOptions ro;
+ FlushOptions fo;
+ fo.wait = true;
+ std::string value;
+
+ ASSERT_OK(dbfull()->Put(wo, "foobar", "foo"));
+    // Need to insert some keys to make sure files are not filtered out by key
+    // ranges.
+ ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+ ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+ dbfull()->Flush(fo);
+
+ Reopen(options);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ ASSERT_EQ("NOT_FOUND", Get("bar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("foo", Get("foobar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+
+    // Reopen with whole key filtering enabled and the prefix extractor
+    // NULL. Bloom filters should be off for both whole key and
+    // prefix bloom.
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.prefix_extractor.reset();
+ Reopen(options);
+
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("NOT_FOUND", Get("bar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("foo", Get("foobar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ // Write DB with only full key filtering.
+ ASSERT_OK(dbfull()->Put(wo, "foobar", "foo"));
+    // Need to insert some keys to make sure files are not filtered out by key
+    // ranges.
+ ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+ ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+    // Reopen with whole key filtering off and the prefix extractor enabled.
+    // Still no bloom filter should be used.
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("NOT_FOUND", Get("bar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("foo", Get("foobar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+
+ // Try to create a DB with mixed files:
+ ASSERT_OK(dbfull()->Put(wo, "foobar", "foo"));
+    // Need to insert some keys to make sure files are not filtered out by key
+    // ranges.
+ ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+ ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ options.prefix_extractor.reset();
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+
+ // Try to create a DB with mixed files.
+ ASSERT_OK(dbfull()->Put(wo, "barfoo", "bar"));
+    // In this case we need to insert some keys to make sure files are not
+    // filtered out by key ranges.
+ ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+ ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+ Flush();
+
+ // Now we have two files:
+ // File 1: An older file with prefix bloom.
+ // File 2: A newer file with whole bloom filter.
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
+ ASSERT_EQ("NOT_FOUND", Get("bar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
+ ASSERT_EQ("foo", Get("foobar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
+ ASSERT_EQ("bar", Get("barfoo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
+
+    // Reopen with the same settings: only whole key filtering is used
+ Reopen(options);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 5);
+ ASSERT_EQ("NOT_FOUND", Get("bar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 6);
+ ASSERT_EQ("foo", Get("foobar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
+ ASSERT_EQ("bar", Get("barfoo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
+
+    // Restart with both filters allowed
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
+    // File 1 will have it filtered out.
+    // File 2 will not, as the prefix `foo` exists in the file.
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 8);
+ ASSERT_EQ("NOT_FOUND", Get("bar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 10);
+ ASSERT_EQ("foo", Get("foobar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+ ASSERT_EQ("bar", Get("barfoo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+
+    // Restart with only the prefix bloom allowed.
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+ ASSERT_EQ("NOT_FOUND", Get("bar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
+ ASSERT_EQ("foo", Get("foobar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
+ ASSERT_EQ("bar", Get("barfoo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
+ uint64_t bloom_filter_useful_all_levels = 0;
+ for (auto& kv : (*(get_perf_context()->level_to_perf_context))) {
+ if (kv.second.bloom_filter_useful > 0) {
+ bloom_filter_useful_all_levels += kv.second.bloom_filter_useful;
+ }
+ }
+ ASSERT_EQ(12, bloom_filter_useful_all_levels);
+ get_perf_context()->Reset();
+ }
+}
+
+TEST_P(DBBloomFilterTestWithParam, BloomFilter) {
+ do {
+ Options options = CurrentOptions();
+ env_->count_random_reads_ = true;
+ options.env = env_;
+ // ChangeCompactOptions() only changes compaction style, which does not
+ // trigger reset of table_factory
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ table_options.filter_policy.reset(new BFP(10, bfp_impl_));
+ table_options.partition_filters = partition_filters_;
+ if (partition_filters_) {
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ }
+ table_options.format_version = format_version_;
+ if (format_version_ >= 4) {
+      // Value delta encoding is challenged more with an index restart interval > 1
+ table_options.index_block_restart_interval = 8;
+ }
+ table_options.metadata_block_size = 32;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Populate multiple layers
+ const int N = 10000;
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ Compact(1, "a", "z");
+ for (int i = 0; i < N; i += 100) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ Flush(1);
+
+ // Prevent auto compactions triggered by seeks
+ env_->delay_sstable_sync_.store(true, std::memory_order_release);
+
+ // Lookup present keys. Should rarely read from small sstable.
+ env_->random_read_counter_.Reset();
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ(Key(i), Get(1, Key(i)));
+ }
+ int reads = env_->random_read_counter_.Read();
+ fprintf(stderr, "%d present => %d reads\n", N, reads);
+ ASSERT_GE(reads, N);
+ if (partition_filters_) {
+      // Without a block cache, we read an extra partition filter per level
+      // per read and a partition index per read
+ ASSERT_LE(reads, 4 * N + 2 * N / 100);
+ } else {
+ ASSERT_LE(reads, N + 2 * N / 100);
+ }
+
+    // Lookup missing keys. Should rarely read from either sstable.
+ env_->random_read_counter_.Reset();
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ("NOT_FOUND", Get(1, Key(i) + ".missing"));
+ }
+ reads = env_->random_read_counter_.Read();
+ fprintf(stderr, "%d missing => %d reads\n", N, reads);
+ if (partition_filters_) {
+      // With partitioned filters we read one extra filter per level for each
+      // missed read.
+ ASSERT_LE(reads, 2 * N + 3 * N / 100);
+ } else {
+ ASSERT_LE(reads, 3 * N / 100);
+ }
+
+ env_->delay_sstable_sync_.store(false, std::memory_order_release);
+ Close();
+ } while (ChangeCompactOptions());
+}
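+
+// A sketch of the arithmetic behind the read bounds asserted above: with
+// N = 10000 present-key lookups and no partitioned filters, the cost is
+// roughly one data-block read per key plus an allowance for the ~N/100 keys
+// that also live in the small sstable, hence the bound
+// N + 2 * N / 100 = 10200. With partitioned filters and no block cache,
+// each read may add a partition filter per level and a partition index,
+// hence the looser bound 4 * N + 2 * N / 100 = 40200.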
+
+#ifndef ROCKSDB_VALGRIND_RUN
+INSTANTIATE_TEST_CASE_P(
+ FormatDef, DBBloomFilterTestDefFormatVersion,
+ ::testing::Values(
+ std::make_tuple(BFP::kDeprecatedBlock, false,
+ test::kDefaultFormatVersion),
+ std::make_tuple(BFP::kAuto, true, test::kDefaultFormatVersion),
+ std::make_tuple(BFP::kAuto, false, test::kDefaultFormatVersion)));
+
+INSTANTIATE_TEST_CASE_P(
+ FormatDef, DBBloomFilterTestWithParam,
+ ::testing::Values(
+ std::make_tuple(BFP::kDeprecatedBlock, false,
+ test::kDefaultFormatVersion),
+ std::make_tuple(BFP::kAuto, true, test::kDefaultFormatVersion),
+ std::make_tuple(BFP::kAuto, false, test::kDefaultFormatVersion)));
+
+INSTANTIATE_TEST_CASE_P(
+ FormatLatest, DBBloomFilterTestWithParam,
+ ::testing::Values(
+ std::make_tuple(BFP::kDeprecatedBlock, false,
+ test::kLatestFormatVersion),
+ std::make_tuple(BFP::kAuto, true, test::kLatestFormatVersion),
+ std::make_tuple(BFP::kAuto, false, test::kLatestFormatVersion)));
+#endif // ROCKSDB_VALGRIND_RUN
+
+TEST_F(DBBloomFilterTest, BloomFilterRate) {
+ while (ChangeFilterOptions()) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ get_perf_context()->EnablePerLevelPerfContext();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const int maxKey = 10000;
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+    // Add a large key to make the file contain a wide range
+ ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+ Flush(1);
+
+ // Check if they can be found
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ(Key(i), Get(1, Key(i)));
+ }
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+
+ // Check if filter is useful
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333)));
+ }
+ ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey * 0.98);
+ ASSERT_GE(
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful,
+ maxKey * 0.98);
+ get_perf_context()->Reset();
+ }
+}
+
+TEST_F(DBBloomFilterTest, BloomFilterCompatibility) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ // Create with block based filter
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const int maxKey = 10000;
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+ Flush(1);
+
+ // Check db with full filter
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ // Check if they can be found
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ(Key(i), Get(1, Key(i)));
+ }
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+
+ // Check db with partitioned full filter
+ table_options.partition_filters = true;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ // Check if they can be found
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ(Key(i), Get(1, Key(i)));
+ }
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+}
+
+TEST_F(DBBloomFilterTest, BloomFilterReverseCompatibility) {
+ for (bool partition_filters : {true, false}) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ if (partition_filters) {
+ table_options.partition_filters = true;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ }
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ // Create with full filter
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const int maxKey = 10000;
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+ Flush(1);
+
+ // Check db with block_based filter
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ // Check if they can be found
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ(Key(i), Get(1, Key(i)));
+ }
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ }
+}
+
+namespace {
+// A wrapped bloom over block-based FilterPolicy
+class TestingWrappedBlockBasedFilterPolicy : public FilterPolicy {
+ public:
+ explicit TestingWrappedBlockBasedFilterPolicy(int bits_per_key)
+ : filter_(NewBloomFilterPolicy(bits_per_key, true)), counter_(0) {}
+
+ ~TestingWrappedBlockBasedFilterPolicy() override { delete filter_; }
+
+ const char* Name() const override {
+ return "TestingWrappedBlockBasedFilterPolicy";
+ }
+
+ void CreateFilter(const ROCKSDB_NAMESPACE::Slice* keys, int n,
+ std::string* dst) const override {
+ std::unique_ptr<ROCKSDB_NAMESPACE::Slice[]> user_keys(
+ new ROCKSDB_NAMESPACE::Slice[n]);
+ for (int i = 0; i < n; ++i) {
+ user_keys[i] = convertKey(keys[i]);
+ }
+ return filter_->CreateFilter(user_keys.get(), n, dst);
+ }
+
+ bool KeyMayMatch(const ROCKSDB_NAMESPACE::Slice& key,
+ const ROCKSDB_NAMESPACE::Slice& filter) const override {
+ counter_++;
+ return filter_->KeyMayMatch(convertKey(key), filter);
+ }
+
+ uint32_t GetCounter() { return counter_; }
+
+ private:
+ const FilterPolicy* filter_;
+ mutable uint32_t counter_;
+
+ ROCKSDB_NAMESPACE::Slice convertKey(
+ const ROCKSDB_NAMESPACE::Slice& key) const {
+ return key;
+ }
+};
+} // namespace
+
+TEST_F(DBBloomFilterTest, WrappedBlockBasedFilterPolicy) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ BlockBasedTableOptions table_options;
+ TestingWrappedBlockBasedFilterPolicy* policy =
+ new TestingWrappedBlockBasedFilterPolicy(10);
+ table_options.filter_policy.reset(policy);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const int maxKey = 10000;
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+  // Add a large key to make the file contain a wide range
+ ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+ ASSERT_EQ(0U, policy->GetCounter());
+ Flush(1);
+
+ // Check if they can be found
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ(Key(i), Get(1, Key(i)));
+ }
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ ASSERT_EQ(1U * maxKey, policy->GetCounter());
+
+ // Check if filter is useful
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333)));
+ }
+ ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey * 0.98);
+ ASSERT_EQ(2U * maxKey, policy->GetCounter());
+}
+
+namespace {
+// NOTE: This class is referenced by HISTORY.md as a model for a wrapper
+// FilterPolicy selecting among configurations based on context.
+class LevelAndStyleCustomFilterPolicy : public FilterPolicy {
+ public:
+ explicit LevelAndStyleCustomFilterPolicy(int bpk_fifo, int bpk_l0_other,
+ int bpk_otherwise)
+ : policy_fifo_(NewBloomFilterPolicy(bpk_fifo)),
+ policy_l0_other_(NewBloomFilterPolicy(bpk_l0_other)),
+ policy_otherwise_(NewBloomFilterPolicy(bpk_otherwise)) {}
+
+ // OK to use built-in policy name because we are deferring to a
+ // built-in builder. We aren't changing the serialized format.
+ const char* Name() const override { return policy_fifo_->Name(); }
+
+ FilterBitsBuilder* GetBuilderWithContext(
+ const FilterBuildingContext& context) const override {
+ if (context.compaction_style == kCompactionStyleFIFO) {
+ return policy_fifo_->GetBuilderWithContext(context);
+ } else if (context.level_at_creation == 0) {
+ return policy_l0_other_->GetBuilderWithContext(context);
+ } else {
+ return policy_otherwise_->GetBuilderWithContext(context);
+ }
+ }
+
+ FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override {
+ // OK to defer to any of them; they all can parse built-in filters
+ // from any settings.
+ return policy_fifo_->GetFilterBitsReader(contents);
+ }
+
+ // Defer just in case configuration uses block-based filter
+ void CreateFilter(const Slice* keys, int n, std::string* dst) const override {
+ policy_otherwise_->CreateFilter(keys, n, dst);
+ }
+ bool KeyMayMatch(const Slice& key, const Slice& filter) const override {
+ return policy_otherwise_->KeyMayMatch(key, filter);
+ }
+
+ private:
+ const std::unique_ptr<const FilterPolicy> policy_fifo_;
+ const std::unique_ptr<const FilterPolicy> policy_l0_other_;
+ const std::unique_ptr<const FilterPolicy> policy_otherwise_;
+};
+
+class TestingContextCustomFilterPolicy
+ : public LevelAndStyleCustomFilterPolicy {
+ public:
+ explicit TestingContextCustomFilterPolicy(int bpk_fifo, int bpk_l0_other,
+ int bpk_otherwise)
+ : LevelAndStyleCustomFilterPolicy(bpk_fifo, bpk_l0_other, bpk_otherwise) {
+ }
+
+ FilterBitsBuilder* GetBuilderWithContext(
+ const FilterBuildingContext& context) const override {
+ test_report_ += "cf=";
+ test_report_ += context.column_family_name;
+ test_report_ += ",cs=";
+ test_report_ +=
+ OptionsHelper::compaction_style_to_string[context.compaction_style];
+ test_report_ += ",lv=";
+ test_report_ += std::to_string(context.level_at_creation);
+ test_report_ += "\n";
+
+ return LevelAndStyleCustomFilterPolicy::GetBuilderWithContext(context);
+ }
+
+ std::string DumpTestReport() {
+ std::string rv;
+ std::swap(rv, test_report_);
+ return rv;
+ }
+
+ private:
+ mutable std::string test_report_;
+};
+} // namespace
+
+TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) {
+ for (bool fifo : {true, false}) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.compaction_style =
+ fifo ? kCompactionStyleFIFO : kCompactionStyleLevel;
+
+ BlockBasedTableOptions table_options;
+ auto policy = std::make_shared<TestingContextCustomFilterPolicy>(15, 8, 5);
+ table_options.filter_policy = policy;
+ table_options.format_version = 5;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ CreateAndReopenWithCF({fifo ? "abe" : "bob"}, options);
+
+ const int maxKey = 10000;
+ for (int i = 0; i < maxKey / 2; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+    // Add a large key to make the file contain a wide range
+ ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+ Flush(1);
+ EXPECT_EQ(policy->DumpTestReport(),
+ fifo ? "cf=abe,cs=kCompactionStyleFIFO,lv=0\n"
+ : "cf=bob,cs=kCompactionStyleLevel,lv=0\n");
+
+ for (int i = maxKey / 2; i < maxKey; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ Flush(1);
+ EXPECT_EQ(policy->DumpTestReport(),
+ fifo ? "cf=abe,cs=kCompactionStyleFIFO,lv=0\n"
+ : "cf=bob,cs=kCompactionStyleLevel,lv=0\n");
+
+ // Check that they can be found
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ(Key(i), Get(1, Key(i)));
+ }
+ // Since we have two tables / two filters, we might have Bloom checks on
+ // our queries, but no more than one "useful" per query on a found key.
+ EXPECT_LE(TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey);
+
+ // Check that we have two filters, each about
+ // fifo: 0.12% FP rate (15 bits per key)
+ // level: 2.3% FP rate (8 bits per key)
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333)));
+ }
+ {
+ auto useful_count =
+ TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL);
+ EXPECT_GE(useful_count, maxKey * 2 * (fifo ? 0.9980 : 0.975));
+ EXPECT_LE(useful_count, maxKey * 2 * (fifo ? 0.9995 : 0.98));
+ }
+
+ if (!fifo) { // FIFO only has L0
+ // Full compaction
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr));
+ EXPECT_EQ(policy->DumpTestReport(),
+ "cf=bob,cs=kCompactionStyleLevel,lv=1\n");
+
+ // Check that we now have one filter, about 9.2% FP rate (5 bits per key)
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333)));
+ }
+ {
+ auto useful_count =
+ TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL);
+ EXPECT_GE(useful_count, maxKey * 0.90);
+ EXPECT_LE(useful_count, maxKey * 0.91);
+ }
+ }
+
+ // Destroy
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ dbfull()->DestroyColumnFamilyHandle(handles_[1]);
+ handles_[1] = nullptr;
+ }
+}
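+
+// A sketch of the arithmetic behind the ranges checked above: each
+// missing-key Get() consults both table filters, so there are about
+// maxKey * 2 = 20000 filter checks. BLOOM_FILTER_USEFUL counts the checks
+// that correctly excluded the key, i.e. roughly (1 - FP rate) * 20000:
+// about 0.9980-0.9995 of the checks for the 15-bits-per-key filters in the
+// FIFO case and about 0.975-0.98 for the 8-bits-per-key filters in the
+// level case, matching the EXPECT_GE / EXPECT_LE bounds.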
+
+class SliceTransformLimitedDomain : public SliceTransform {
+ const char* Name() const override { return "SliceTransformLimitedDomain"; }
+
+ Slice Transform(const Slice& src) const override {
+ return Slice(src.data(), 5);
+ }
+
+ bool InDomain(const Slice& src) const override {
+ // prefix will be x????
+ return src.size() >= 5 && src[0] == 'x';
+ }
+
+ bool InRange(const Slice& dst) const override {
+ // prefix will be x????
+ return dst.size() == 5 && dst[0] == 'x';
+ }
+};
+
+TEST_F(DBBloomFilterTest, PrefixExtractorFullFilter) {
+ BlockBasedTableOptions bbto;
+ // Full Filter Block
+ bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = false;
+
+ Options options = CurrentOptions();
+ options.prefix_extractor = std::make_shared<SliceTransformLimitedDomain>();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("x1111_AAAA", "val1"));
+ ASSERT_OK(Put("x1112_AAAA", "val2"));
+ ASSERT_OK(Put("x1113_AAAA", "val3"));
+ ASSERT_OK(Put("x1114_AAAA", "val4"));
+  // Not in domain, won't be added to the filter
+ ASSERT_OK(Put("zzzzz_AAAA", "val5"));
+
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(Get("x1111_AAAA"), "val1");
+ ASSERT_EQ(Get("x1112_AAAA"), "val2");
+ ASSERT_EQ(Get("x1113_AAAA"), "val3");
+ ASSERT_EQ(Get("x1114_AAAA"), "val4");
+  // Was not added to the filter, but RocksDB will still try to read it from
+  // the filter
+ ASSERT_EQ(Get("zzzzz_AAAA"), "val5");
+}
+
+TEST_F(DBBloomFilterTest, PrefixExtractorBlockFilter) {
+ BlockBasedTableOptions bbto;
+ // Block Filter Block
+ bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, true));
+
+ Options options = CurrentOptions();
+ options.prefix_extractor = std::make_shared<SliceTransformLimitedDomain>();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("x1113_AAAA", "val3"));
+ ASSERT_OK(Put("x1114_AAAA", "val4"));
+  // Not in domain, won't be added to the filter
+ ASSERT_OK(Put("zzzzz_AAAA", "val1"));
+ ASSERT_OK(Put("zzzzz_AAAB", "val2"));
+ ASSERT_OK(Put("zzzzz_AAAC", "val3"));
+ ASSERT_OK(Put("zzzzz_AAAD", "val4"));
+
+ ASSERT_OK(Flush());
+
+ std::vector<std::string> iter_res;
+ auto iter = db_->NewIterator(ReadOptions());
+ // Seek to a key that was not in Domain
+ for (iter->Seek("zzzzz_AAAA"); iter->Valid(); iter->Next()) {
+ iter_res.emplace_back(iter->value().ToString());
+ }
+
+ std::vector<std::string> expected_res = {"val1", "val2", "val3", "val4"};
+ ASSERT_EQ(iter_res, expected_res);
+ delete iter;
+}
+
+TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilter) {
+ // regression test for #2743. the range delete tombstones in memtable should
+ // be added even when Get() skips searching due to its prefix bloom filter
+ const int kMemtableSize = 1 << 20; // 1MB
+ const int kMemtablePrefixFilterSize = 1 << 13; // 8KB
+ const int kPrefixLen = 4;
+ Options options = CurrentOptions();
+ options.memtable_prefix_bloom_size_ratio =
+ static_cast<double>(kMemtablePrefixFilterSize) / kMemtableSize;
+ options.prefix_extractor.reset(
+ ROCKSDB_NAMESPACE::NewFixedPrefixTransform(kPrefixLen));
+ options.write_buffer_size = kMemtableSize;
+ options.memtable_whole_key_filtering = false;
+ Reopen(options);
+ std::string key1("AAAABBBB");
+ std::string key2("AAAACCCC"); // not in DB
+ std::string key3("AAAADDDD");
+ std::string key4("AAAAEEEE");
+ std::string value1("Value1");
+ std::string value3("Value3");
+ std::string value4("Value4");
+
+ ASSERT_OK(Put(key1, value1, WriteOptions()));
+
+ // check memtable bloom stats
+ ASSERT_EQ("NOT_FOUND", Get(key2));
+ ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
+ // same prefix, bloom filter false positive
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+
+ // enable whole key bloom filter
+ options.memtable_whole_key_filtering = true;
+ Reopen(options);
+ // check memtable bloom stats
+ ASSERT_OK(Put(key3, value3, WriteOptions()));
+ ASSERT_EQ("NOT_FOUND", Get(key2));
+ // whole key bloom filter kicks in and determines it's a miss
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count);
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+
+ // verify whole key filtering does not depend on prefix_extractor
+ options.prefix_extractor.reset();
+ Reopen(options);
+ // check memtable bloom stats
+ ASSERT_OK(Put(key4, value4, WriteOptions()));
+ ASSERT_EQ("NOT_FOUND", Get(key2));
+ // whole key bloom filter kicks in and determines it's a miss
+ ASSERT_EQ(2, get_perf_context()->bloom_memtable_miss_count);
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+}
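+
+// A sketch of the sizing arithmetic used above:
+// memtable_prefix_bloom_size_ratio is the fraction of the write buffer
+// reserved for the memtable bloom filter, so here
+//   kMemtablePrefixFilterSize / kMemtableSize = (1 << 13) / (1 << 20)
+//   = 8 KB / 1 MB ~= 0.0078,
+// i.e. roughly 8 KB of bloom bits for the 1 MB memtable.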
+
+TEST_F(DBBloomFilterTest, MemtablePrefixBloomOutOfDomain) {
+ constexpr size_t kPrefixSize = 8;
+ const std::string kKey = "key";
+ assert(kKey.size() < kPrefixSize);
+ Options options = CurrentOptions();
+ options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixSize));
+ options.memtable_prefix_bloom_size_ratio = 0.25;
+ Reopen(options);
+ ASSERT_OK(Put(kKey, "v"));
+ ASSERT_EQ("v", Get(kKey));
+ std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ReadOptions()));
+ iter->Seek(kKey);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(kKey, iter->key());
+ iter->SeekForPrev(kKey);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(kKey, iter->key());
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+namespace BFP2 {
+// Extends BFP::Mode with option to use Plain table
+using PseudoMode = int;
+static constexpr PseudoMode kPlainTable = -1;
+} // namespace BFP2
+} // namespace
+
+class BloomStatsTestWithParam
+ : public DBBloomFilterTest,
+ public testing::WithParamInterface<std::tuple<BFP2::PseudoMode, bool>> {
+ public:
+ BloomStatsTestWithParam() {
+ bfp_impl_ = std::get<0>(GetParam());
+ partition_filters_ = std::get<1>(GetParam());
+
+ options_.create_if_missing = true;
+ options_.prefix_extractor.reset(
+ ROCKSDB_NAMESPACE::NewFixedPrefixTransform(4));
+ options_.memtable_prefix_bloom_size_ratio =
+ 8.0 * 1024.0 / static_cast<double>(options_.write_buffer_size);
+ if (bfp_impl_ == BFP2::kPlainTable) {
+ assert(!partition_filters_); // not supported in plain table
+ PlainTableOptions table_options;
+ options_.table_factory.reset(NewPlainTableFactory(table_options));
+ } else {
+ BlockBasedTableOptions table_options;
+ table_options.hash_index_allow_collision = false;
+ if (partition_filters_) {
+ assert(bfp_impl_ != BFP::kDeprecatedBlock);
+ table_options.partition_filters = partition_filters_;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ }
+ table_options.filter_policy.reset(
+ new BFP(10, static_cast<BFP::Mode>(bfp_impl_)));
+ options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ }
+ options_.env = env_;
+
+ get_perf_context()->Reset();
+ DestroyAndReopen(options_);
+ }
+
+ ~BloomStatsTestWithParam() override {
+ get_perf_context()->Reset();
+ Destroy(options_);
+ }
+
+ // Required if inheriting from testing::WithParamInterface<>
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ BFP2::PseudoMode bfp_impl_;
+ bool partition_filters_;
+ Options options_;
+};
+
+// 1 Insert 2 K-V pairs into DB
+// 2 Call Get() for both keys - expect the memtable bloom hit stat to be 2
+// 3 Call Get() for a nonexistent key - expect the memtable bloom miss stat to be 1
+// 4 Call Flush() to create an SST
+// 5 Call Get() for both keys - expect the SST bloom hit stat to be 2
+// 6 Call Get() for a nonexistent key - expect the SST bloom miss stat to be 1
+// Test both: block and plain SST
+TEST_P(BloomStatsTestWithParam, BloomStatsTest) {
+ std::string key1("AAAA");
+ std::string key2("RXDB"); // not in DB
+ std::string key3("ZBRA");
+ std::string value1("Value1");
+ std::string value3("Value3");
+
+ ASSERT_OK(Put(key1, value1, WriteOptions()));
+ ASSERT_OK(Put(key3, value3, WriteOptions()));
+
+ // check memtable bloom stats
+ ASSERT_EQ(value1, Get(key1));
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+ ASSERT_EQ(value3, Get(key3));
+ ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
+ ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
+
+ ASSERT_EQ("NOT_FOUND", Get(key2));
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count);
+ ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
+
+ // sanity checks
+ ASSERT_EQ(0, get_perf_context()->bloom_sst_hit_count);
+ ASSERT_EQ(0, get_perf_context()->bloom_sst_miss_count);
+
+ Flush();
+
+ // sanity checks
+ ASSERT_EQ(0, get_perf_context()->bloom_sst_hit_count);
+ ASSERT_EQ(0, get_perf_context()->bloom_sst_miss_count);
+
+ // check SST bloom stats
+ ASSERT_EQ(value1, Get(key1));
+ ASSERT_EQ(1, get_perf_context()->bloom_sst_hit_count);
+ ASSERT_EQ(value3, Get(key3));
+ ASSERT_EQ(2, get_perf_context()->bloom_sst_hit_count);
+
+ ASSERT_EQ("NOT_FOUND", Get(key2));
+ ASSERT_EQ(1, get_perf_context()->bloom_sst_miss_count);
+}
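+
+// A minimal sketch of reading these counters outside the test harness
+// (illustrative only; assumes an open DB `db` and that perf counting is
+// enabled, e.g. via SetPerfLevel(PerfLevel::kEnableCount)):
+//
+//   get_perf_context()->Reset();
+//   std::string v;
+//   db->Get(ReadOptions(), "some_key", &v);
+//   uint64_t mem_hits = get_perf_context()->bloom_memtable_hit_count;
+//   uint64_t sst_misses = get_perf_context()->bloom_sst_miss_count;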
+
+// Same scenario as in BloomStatsTest but using an iterator
+TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) {
+ std::string key1("AAAA");
+ std::string key2("RXDB"); // not in DB
+ std::string key3("ZBRA");
+ std::string value1("Value1");
+ std::string value3("Value3");
+
+ ASSERT_OK(Put(key1, value1, WriteOptions()));
+ ASSERT_OK(Put(key3, value3, WriteOptions()));
+
+ std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ReadOptions()));
+
+ // check memtable bloom stats
+ iter->Seek(key1);
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(value1, iter->value().ToString());
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+ ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
+
+ iter->Seek(key3);
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(value3, iter->value().ToString());
+ ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
+ ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
+
+ iter->Seek(key2);
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count);
+ ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
+
+ Flush();
+
+ iter.reset(dbfull()->NewIterator(ReadOptions()));
+
+ // Check SST bloom stats
+ iter->Seek(key1);
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(value1, iter->value().ToString());
+ ASSERT_EQ(1, get_perf_context()->bloom_sst_hit_count);
+
+ iter->Seek(key3);
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(value3, iter->value().ToString());
+  // The seek doesn't check the block-based bloom filter because the last
+  // index key starts with the same prefix we're seeking to.
+ uint64_t expected_hits = bfp_impl_ == BFP::kDeprecatedBlock ? 1 : 2;
+ ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count);
+
+ iter->Seek(key2);
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_EQ(1, get_perf_context()->bloom_sst_miss_count);
+ ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count);
+}
+
+INSTANTIATE_TEST_CASE_P(
+ BloomStatsTestWithParam, BloomStatsTestWithParam,
+ ::testing::Values(std::make_tuple(BFP::kDeprecatedBlock, false),
+ std::make_tuple(BFP::kLegacyBloom, false),
+ std::make_tuple(BFP::kLegacyBloom, true),
+ std::make_tuple(BFP::kFastLocalBloom, false),
+ std::make_tuple(BFP::kFastLocalBloom, true),
+ std::make_tuple(BFP2::kPlainTable, false)));
+
+namespace {
+void PrefixScanInit(DBBloomFilterTest* dbtest) {
+ char buf[100];
+ std::string keystr;
+ const int small_range_sstfiles = 5;
+ const int big_range_sstfiles = 5;
+
+ // Generate 11 sst files with the following prefix ranges.
+ // GROUP 0: [0,10] (level 1)
+ // GROUP 1: [1,2], [2,3], [3,4], [4,5], [5, 6] (level 0)
+ // GROUP 2: [0,6], [0,7], [0,8], [0,9], [0,10] (level 0)
+ //
+ // A seek with the previous API would do 11 random I/Os (to all the
+ // files). With the new API and a prefix filter enabled, we should
+  // only do 2 random I/Os, to the 2 files containing the key.
+
+ // GROUP 0
+ snprintf(buf, sizeof(buf), "%02d______:start", 0);
+ keystr = std::string(buf);
+ ASSERT_OK(dbtest->Put(keystr, keystr));
+ snprintf(buf, sizeof(buf), "%02d______:end", 10);
+ keystr = std::string(buf);
+ ASSERT_OK(dbtest->Put(keystr, keystr));
+ dbtest->Flush();
+ dbtest->dbfull()->CompactRange(CompactRangeOptions(), nullptr,
+ nullptr); // move to level 1
+
+ // GROUP 1
+ for (int i = 1; i <= small_range_sstfiles; i++) {
+ snprintf(buf, sizeof(buf), "%02d______:start", i);
+ keystr = std::string(buf);
+ ASSERT_OK(dbtest->Put(keystr, keystr));
+ snprintf(buf, sizeof(buf), "%02d______:end", i + 1);
+ keystr = std::string(buf);
+ ASSERT_OK(dbtest->Put(keystr, keystr));
+ dbtest->Flush();
+ }
+
+ // GROUP 2
+ for (int i = 1; i <= big_range_sstfiles; i++) {
+ snprintf(buf, sizeof(buf), "%02d______:start", 0);
+ keystr = std::string(buf);
+ ASSERT_OK(dbtest->Put(keystr, keystr));
+ snprintf(buf, sizeof(buf), "%02d______:end", small_range_sstfiles + i + 1);
+ keystr = std::string(buf);
+ ASSERT_OK(dbtest->Put(keystr, keystr));
+ dbtest->Flush();
+ }
+}
+} // namespace
+
+TEST_F(DBBloomFilterTest, PrefixScan) {
+ while (ChangeFilterOptions()) {
+ int count;
+ Slice prefix;
+ Slice key;
+ char buf[100];
+ Iterator* iter;
+ snprintf(buf, sizeof(buf), "03______:");
+ prefix = Slice(buf, 8);
+ key = Slice(buf, 9);
+ ASSERT_EQ(key.difference_offset(prefix), 8);
+ ASSERT_EQ(prefix.difference_offset(key), 8);
+ // db configs
+ env_->count_random_reads_ = true;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+ options.disable_auto_compactions = true;
+ options.max_background_compactions = 2;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(NewHashSkipListRepFactory(16));
+ assert(!options.unordered_write);
+ // It is incompatible with allow_concurrent_memtable_write=false
+ options.allow_concurrent_memtable_write = false;
+
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+ table_options.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ // 11 RAND I/Os
+ DestroyAndReopen(options);
+ PrefixScanInit(this);
+ count = 0;
+ env_->random_read_counter_.Reset();
+ iter = db_->NewIterator(ReadOptions());
+ for (iter->Seek(prefix); iter->Valid(); iter->Next()) {
+ if (!iter->key().starts_with(prefix)) {
+ break;
+ }
+ count++;
+ }
+ ASSERT_OK(iter->status());
+ delete iter;
+ ASSERT_EQ(count, 2);
+ ASSERT_EQ(env_->random_read_counter_.Read(), 2);
+ Close();
+ } // end of while
+}
+
+TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 64 * 1024;
+ options.arena_block_size = 4 * 1024;
+ options.target_file_size_base = 64 * 1024;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 4;
+ options.max_bytes_for_level_base = 256 * 1024;
+ options.max_write_buffer_number = 2;
+ options.max_background_compactions = 8;
+ options.max_background_flushes = 8;
+ options.compression = kNoCompression;
+ options.compaction_style = kCompactionStyleLevel;
+ options.level_compaction_dynamic_level_bytes = true;
+ BlockBasedTableOptions bbto;
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, true));
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.optimize_filters_for_hits = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ get_perf_context()->Reset();
+ get_perf_context()->EnablePerLevelPerfContext();
+ CreateAndReopenWithCF({"mypikachu"}, options);
+
+ int numkeys = 200000;
+
+ // Generate randomly shuffled keys, so the updates are almost
+ // random.
+ std::vector<int> keys;
+ keys.reserve(numkeys);
+ for (int i = 0; i < numkeys; i += 2) {
+ keys.push_back(i);
+ }
+ std::random_shuffle(std::begin(keys), std::end(keys));
+
+ int num_inserted = 0;
+ for (int key : keys) {
+ ASSERT_OK(Put(1, Key(key), "val"));
+ if (++num_inserted % 1000 == 0) {
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ }
+ }
+ ASSERT_OK(Put(1, Key(0), "val"));
+ ASSERT_OK(Put(1, Key(numkeys), "val"));
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+
+ if (NumTableFilesAtLevel(0, 1) == 0) {
+ // No Level 0 file. Create one.
+ ASSERT_OK(Put(1, Key(0), "val"));
+ ASSERT_OK(Put(1, Key(numkeys), "val"));
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+ }
+
+ for (int i = 1; i < numkeys; i += 2) {
+ ASSERT_EQ(Get(1, Key(i)), "NOT_FOUND");
+ }
+
+ ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0));
+ ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1));
+ ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+
+  // Now we have three sorted runs, L0, L5 and L6, with most files in L6
+  // having no bloom filter. Most keys will be checked against bloom filters
+  // twice.
+ ASSERT_GT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 65000 * 2);
+ ASSERT_LT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 120000 * 2);
+ uint64_t bloom_filter_useful_all_levels = 0;
+ for (auto& kv : (*(get_perf_context()->level_to_perf_context))) {
+ if (kv.second.bloom_filter_useful > 0) {
+ bloom_filter_useful_all_levels += kv.second.bloom_filter_useful;
+ }
+ }
+ ASSERT_GT(bloom_filter_useful_all_levels, 65000 * 2);
+ ASSERT_LT(bloom_filter_useful_all_levels, 120000 * 2);
+
+ for (int i = 0; i < numkeys; i += 2) {
+ ASSERT_EQ(Get(1, Key(i)), "val");
+ }
+
+ // Part 2 (read path): rewrite last level with blooms, then verify they get
+ // cached only if !optimize_filters_for_hits
+ options.disable_auto_compactions = true;
+ options.num_levels = 9;
+ options.optimize_filters_for_hits = false;
+ options.statistics = CreateDBStatistics();
+ bbto.block_cache.reset();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+ MoveFilesToLevel(7 /* level */, 1 /* column family index */);
+
+ std::string value = Get(1, Key(0));
+ uint64_t prev_cache_filter_hits =
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+ value = Get(1, Key(0));
+ ASSERT_EQ(prev_cache_filter_hits + 1,
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+ // Now that we know the filter blocks exist in the last level files, see if
+ // filter caching is skipped for this optimization
+ options.optimize_filters_for_hits = true;
+ options.statistics = CreateDBStatistics();
+ bbto.block_cache.reset();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+
+ value = Get(1, Key(0));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(2 /* index and data block */,
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ // Check filter block ignored for files preloaded during DB::Open()
+ options.max_open_files = -1;
+ options.statistics = CreateDBStatistics();
+ bbto.block_cache.reset();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+
+ uint64_t prev_cache_filter_misses =
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+ prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+ Get(1, Key(0));
+ ASSERT_EQ(prev_cache_filter_misses,
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(prev_cache_filter_hits,
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+ // Check filter block ignored for file trivially-moved to bottom level
+ bbto.block_cache.reset();
+ options.max_open_files = 100; // setting > -1 makes it not preload all files
+ options.statistics = CreateDBStatistics();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+
+ ASSERT_OK(Put(1, Key(numkeys + 1), "val"));
+ ASSERT_OK(Flush(1));
+
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ CompactRangeOptions compact_options;
+ compact_options.bottommost_level_compaction =
+ BottommostLevelCompaction::kSkip;
+ compact_options.change_level = true;
+ compact_options.target_level = 7;
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr);
+
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+ prev_cache_filter_misses =
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+ value = Get(1, Key(numkeys + 1));
+ ASSERT_EQ(prev_cache_filter_hits,
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(prev_cache_filter_misses,
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+
+ // Check filter block not cached for iterator
+ bbto.block_cache.reset();
+ options.statistics = CreateDBStatistics();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions(), handles_[1]));
+ iter->SeekToFirst();
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(2 /* index and data block */,
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+ get_perf_context()->Reset();
+}
+
+int CountIter(std::unique_ptr<Iterator>& iter, const Slice& key) {
+ int count = 0;
+ for (iter->Seek(key); iter->Valid() && iter->status() == Status::OK();
+ iter->Next()) {
+ count++;
+ }
+ return count;
+}
+
+// Use iterate_upper_bound to hint compatibility of existing bloom filters.
+// The BF is considered compatible if 1) the upper bound and the seek key
+// transform into the same string, or 2) the transformed seek key is of the
+// same length as the upper bound and the two keys are adjacent according to
+// the comparator.
+TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) {
+ for (auto bfp_impl : BFP::kAllFixedImpls) {
+ int using_full_builder = bfp_impl != BFP::kDeprecatedBlock;
+ Options options;
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewCappedPrefixTransform(4));
+ options.disable_auto_compactions = true;
+ options.statistics = CreateDBStatistics();
+ // Enable prefix bloom for SST files
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.filter_policy.reset(new BFP(10, bfp_impl));
+ table_options.index_shortening = BlockBasedTableOptions::
+ IndexShorteningMode::kShortenSeparatorsAndSuccessor;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("abcdxxx0", "val1"));
+ ASSERT_OK(Put("abcdxxx1", "val2"));
+ ASSERT_OK(Put("abcdxxx2", "val3"));
+ ASSERT_OK(Put("abcdxxx3", "val4"));
+ dbfull()->Flush(FlushOptions());
+ {
+ // prefix_extractor has not changed, BF will always be read
+ Slice upper_bound("abce");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abcd0000"), 4);
+ }
+ {
+ Slice upper_bound("abcdzzzz");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abcd0000"), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 2);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:5"}}));
+ ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(),
+ "rocksdb.FixedPrefix.5"));
+ {
+ // BF changed, [abcdxx00, abce) is a valid bound, will trigger BF read
+ Slice upper_bound("abce");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abcdxx00"), 4);
+ // should check bloom filter since upper bound meets requirement
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 2 + using_full_builder);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ {
+      // [abcdxx01, abcey) is not a valid bound since the upper bound is too
+      // long for the BF in the SST (capped:4)
+ Slice upper_bound("abcey");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abcdxx01"), 4);
+ // should skip bloom filter since upper bound is too long
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 2 + using_full_builder);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ {
+ // [abcdxx02, abcdy) is a valid bound since the prefix is the same
+ Slice upper_bound("abcdy");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abcdxx02"), 4);
+ // should check bloom filter since upper bound matches transformed seek
+ // key
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 2 + using_full_builder * 2);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ {
+ // [aaaaaaaa, abce) is not a valid bound since 1) they don't share the
+ // same prefix, 2) the prefixes are not consecutive
+ Slice upper_bound("abce");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "aaaaaaaa"), 0);
+ // should skip bloom filter since mismatch is found
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 2 + using_full_builder * 2);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:3"}}));
+ {
+ // [abc, abd) is not a valid bound since the upper bound is too short
+ // for BF (capped:4)
+ Slice upper_bound("abd");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abc"), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 2 + using_full_builder * 2);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:4"}}));
+ {
+ // set back to capped:4 and verify BF is always read
+ Slice upper_bound("abd");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abc"), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 3 + using_full_builder * 2);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1);
+ }
+ }
+}
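+
+// A concrete illustration of the upper-bound rule exercised above, using
+// keys from this test: with a capped:4 extractor the seek key "abcd0000"
+// transforms to "abcd". The upper bound "abce" has the same length and is
+// the immediate successor of "abcd", so the prefix filter can be consulted;
+// "abcey" is longer than the prefix, so the filter is skipped for that
+// bound.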
+
+// Create multiple SST files each with a different prefix_extractor config,
+// verify iterators can read all SST files using the latest config.
+TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) {
+ for (auto bfp_impl : BFP::kAllFixedImpls) {
+ int using_full_builder = bfp_impl != BFP::kDeprecatedBlock;
+ Options options;
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.disable_auto_compactions = true;
+ options.statistics = CreateDBStatistics();
+ // Enable prefix bloom for SST files
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(new BFP(10, bfp_impl));
+ table_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ Slice upper_bound("foz90000");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+
+ // first SST with fixed:1 BF
+ ASSERT_OK(Put("foo2", "bar2"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("foq1", "bar1"));
+ ASSERT_OK(Put("fpa", "0"));
+ dbfull()->Flush(FlushOptions());
+ std::unique_ptr<Iterator> iter_old(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter_old, "foo"), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 1);
+
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}}));
+ ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(),
+ "rocksdb.CappedPrefix.3"));
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "foo"), 2);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 1 + using_full_builder);
+ ASSERT_EQ(CountIter(iter, "gpk"), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 1 + using_full_builder);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+
+ // second SST with capped:3 BF
+ ASSERT_OK(Put("foo3", "bar3"));
+ ASSERT_OK(Put("foo4", "bar4"));
+ ASSERT_OK(Put("foq5", "bar5"));
+ ASSERT_OK(Put("fpb", "1"));
+ dbfull()->Flush(FlushOptions());
+ {
+ // BF is capped:3 now
+ std::unique_ptr<Iterator> iter_tmp(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter_tmp, "foo"), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 2 + using_full_builder * 2);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0);
+ // both counters are incremented because the BF is "not changed" for 1 of
+ // the 2 SST files, so the filter is checked once and finds no match.
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 3 + using_full_builder * 2);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1);
+ }
+
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:2"}}));
+ ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(),
+ "rocksdb.FixedPrefix.2"));
+ // third SST with fixed:2 BF
+ ASSERT_OK(Put("foo6", "bar6"));
+ ASSERT_OK(Put("foo7", "bar7"));
+ ASSERT_OK(Put("foq8", "bar8"));
+ ASSERT_OK(Put("fpc", "2"));
+ dbfull()->Flush(FlushOptions());
+ {
+ // BF is fixed:2 now
+ std::unique_ptr<Iterator> iter_tmp(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter_tmp, "foo"), 9);
+ // the first and last BF are checked
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 4 + using_full_builder * 3);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1);
+ ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0);
+ // only last BF is checked and not found
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 5 + using_full_builder * 3);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2);
+ }
+
+ // iter_old can only see the first SST, so the checked counter goes up by 1
+ ASSERT_EQ(CountIter(iter_old, "foo"), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 6 + using_full_builder * 3);
+ // iter was created after the first SetOptions() call, so the filter is
+ // checked only when the full filter builder is used
+ ASSERT_EQ(CountIter(iter, "foo"), 2);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 6 + using_full_builder * 4);
+
+ {
+ // keys in all three SSTs are visible to the iterator.
+ // The range [foo, foz90000) is compatible with (fixed:1) and (fixed:2),
+ // so the checked counter goes up by 2
+ std::unique_ptr<Iterator> iter_all(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter_all, "foo"), 9);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 7 + using_full_builder * 5);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2);
+ ASSERT_EQ(CountIter(iter_all, "gpk"), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 8 + using_full_builder * 5);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3);
+ }
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}}));
+ ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(),
+ "rocksdb.CappedPrefix.3"));
+ {
+ std::unique_ptr<Iterator> iter_all(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter_all, "foo"), 6);
+ // all three SSTs are checked because the current option is the same as
+ // the one used by the remaining SST (capped:3)
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 9 + using_full_builder * 7);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3);
+ ASSERT_EQ(CountIter(iter_all, "gpk"), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 10 + using_full_builder * 7);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 4);
+ }
+ // TODO(Zhongyi): Maybe also need to add Get calls to test point lookup?
+ }
+}
+
+// Create a new column family in a running DB, change prefix_extractor
+// dynamically, verify the iterator created on the new column family behaves
+// as expected
+TEST_F(DBBloomFilterTest, DynamicBloomFilterNewColumnFamily) {
+ int iteration = 0;
+ for (auto bfp_impl : BFP::kAllFixedImpls) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.disable_auto_compactions = true;
+ options.statistics = CreateDBStatistics();
+ // Enable prefix bloom for SST files
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.filter_policy.reset(new BFP(10, bfp_impl));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu" + std::to_string(iteration)}, options);
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ // create a new CF and set prefix_extractor dynamically
+ options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+ CreateColumnFamilies({"ramen_dojo_" + std::to_string(iteration)}, options);
+ ASSERT_EQ(0,
+ strcmp(dbfull()->GetOptions(handles_[2]).prefix_extractor->Name(),
+ "rocksdb.CappedPrefix.3"));
+ ASSERT_OK(Put(2, "foo3", "bar3"));
+ ASSERT_OK(Put(2, "foo4", "bar4"));
+ ASSERT_OK(Put(2, "foo5", "bar5"));
+ ASSERT_OK(Put(2, "foq6", "bar6"));
+ ASSERT_OK(Put(2, "fpq7", "bar7"));
+ dbfull()->Flush(FlushOptions());
+ {
+ std::unique_ptr<Iterator> iter(
+ db_->NewIterator(read_options, handles_[2]));
+ ASSERT_EQ(CountIter(iter, "foo"), 3);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ ASSERT_OK(
+ dbfull()->SetOptions(handles_[2], {{"prefix_extractor", "fixed:2"}}));
+ ASSERT_EQ(0,
+ strcmp(dbfull()->GetOptions(handles_[2]).prefix_extractor->Name(),
+ "rocksdb.FixedPrefix.2"));
+ {
+ std::unique_ptr<Iterator> iter(
+ db_->NewIterator(read_options, handles_[2]));
+ ASSERT_EQ(CountIter(iter, "foo"), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
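+ // Drop and destroy the extra column families so the next loop iteration
+ // starts from a clean state.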
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[2]));
+ dbfull()->DestroyColumnFamilyHandle(handles_[2]);
+ handles_[2] = nullptr;
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ dbfull()->DestroyColumnFamilyHandle(handles_[1]);
+ handles_[1] = nullptr;
+ iteration++;
+ }
+}
+
+// Verify it's possible to change prefix_extractor at runtime and iterators
+// behave as expected
+TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) {
+ for (auto bfp_impl : BFP::kAllFixedImpls) {
+ Options options;
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.disable_auto_compactions = true;
+ options.statistics = CreateDBStatistics();
+ // Enable prefix bloom for SST files
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.filter_policy.reset(new BFP(10, bfp_impl));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo2", "bar2"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("foo1", "bar1"));
+ ASSERT_OK(Put("fpa", "0"));
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Put("foo3", "bar3"));
+ ASSERT_OK(Put("foo4", "bar4"));
+ ASSERT_OK(Put("foo5", "bar5"));
+ ASSERT_OK(Put("fpb", "1"));
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Put("foo6", "bar6"));
+ ASSERT_OK(Put("foo7", "bar7"));
+ ASSERT_OK(Put("foo8", "bar8"));
+ ASSERT_OK(Put("fpc", "2"));
+ dbfull()->Flush(FlushOptions());
+
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "foo"), 12);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 3);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ std::unique_ptr<Iterator> iter_old(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter_old, "foo"), 12);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}}));
+ ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(),
+ "rocksdb.CappedPrefix.3"));
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ // "fp*" should be skipped
+ ASSERT_EQ(CountIter(iter, "foo"), 9);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+
+ // the iterator created before should not be affected and still sees all keys
+ ASSERT_EQ(CountIter(iter_old, "foo"), 12);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 9);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ ASSERT_EQ(CountIter(iter_old, "abc"), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 12);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3);
+ }
+}
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_compaction_filter_test.cc b/src/rocksdb/db/db_compaction_filter_test.cc
new file mode 100644
index 000000000..a708c0b1a
--- /dev/null
+++ b/src/rocksdb/db/db_compaction_filter_test.cc
@@ -0,0 +1,872 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
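+// Counters bumped by the compaction filters below; individual tests reset
+// them before triggering compactions and then assert on their values.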
+static int cfilter_count = 0;
+static int cfilter_skips = 0;
+
+// This is a static filter used for filtering
+// kvs during the compaction process.
+static std::string NEW_VALUE = "NewValue";
+
+class DBTestCompactionFilter : public DBTestBase {
+ public:
+ DBTestCompactionFilter() : DBTestBase("/db_compaction_filter_test") {}
+};
+
+// Param variant of DBTestBase::ChangeCompactOptions
+class DBTestCompactionFilterWithCompactParam
+ : public DBTestCompactionFilter,
+ public ::testing::WithParamInterface<DBTestBase::OptionConfig> {
+ public:
+ DBTestCompactionFilterWithCompactParam() : DBTestCompactionFilter() {
+ option_config_ = GetParam();
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ if (option_config_ == kDefault || option_config_ == kUniversalCompaction ||
+ option_config_ == kUniversalCompactionMultiLevel) {
+ options.create_if_missing = true;
+ }
+ if (option_config_ == kLevelSubcompactions ||
+ option_config_ == kUniversalSubcompactions) {
+ assert(options.max_subcompactions > 1);
+ }
+ TryReopen(options);
+ }
+};
+
+#ifndef ROCKSDB_VALGRIND_RUN
+INSTANTIATE_TEST_CASE_P(
+ CompactionFilterWithOption, DBTestCompactionFilterWithCompactParam,
+ ::testing::Values(DBTestBase::OptionConfig::kDefault,
+ DBTestBase::OptionConfig::kUniversalCompaction,
+ DBTestBase::OptionConfig::kUniversalCompactionMultiLevel,
+ DBTestBase::OptionConfig::kLevelSubcompactions,
+ DBTestBase::OptionConfig::kUniversalSubcompactions));
+#else
+// Run fewer cases in valgrind
+INSTANTIATE_TEST_CASE_P(CompactionFilterWithOption,
+ DBTestCompactionFilterWithCompactParam,
+ ::testing::Values(DBTestBase::OptionConfig::kDefault));
+#endif // ROCKSDB_VALGRIND_RUN
+
+class KeepFilter : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ cfilter_count++;
+ return false;
+ }
+
+ const char* Name() const override { return "KeepFilter"; }
+};
+
+class DeleteFilter : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ cfilter_count++;
+ return true;
+ }
+
+ const char* Name() const override { return "DeleteFilter"; }
+};
+
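+// Deletes keys whose integer value is in (5, 105]; snapshots are ignored.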
+class DeleteISFilter : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice& key, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ cfilter_count++;
+ int i = std::stoi(key.ToString());
+ if (i > 5 && i <= 105) {
+ return true;
+ }
+ return false;
+ }
+
+ bool IgnoreSnapshots() const override { return true; }
+
+ const char* Name() const override { return "DeleteFilter"; }
+};
+
+// Skip x if floor(x/10) is even, use range skips. Requires that keys are
+// zero-padded to length 10.
+class SkipEvenFilter : public CompactionFilter {
+ public:
+ Decision FilterV2(int /*level*/, const Slice& key, ValueType /*value_type*/,
+ const Slice& /*existing_value*/, std::string* /*new_value*/,
+ std::string* skip_until) const override {
+ cfilter_count++;
+ int i = std::stoi(key.ToString());
+ if (i / 10 % 2 == 0) {
+ char key_str[100];
+ snprintf(key_str, sizeof(key_str), "%010d", i / 10 * 10 + 10);
+ *skip_until = key_str;
+ ++cfilter_skips;
+ return Decision::kRemoveAndSkipUntil;
+ }
+ return Decision::kKeep;
+ }
+
+ bool IgnoreSnapshots() const override { return true; }
+
+ const char* Name() const override { return "DeleteFilter"; }
+};
+
+class DelayFilter : public CompactionFilter {
+ public:
+ explicit DelayFilter(DBTestBase* d) : db_test(d) {}
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ db_test->env_->addon_time_.fetch_add(1000);
+ return true;
+ }
+
+ const char* Name() const override { return "DelayFilter"; }
+
+ private:
+ DBTestBase* db_test;
+};
+
+class ConditionalFilter : public CompactionFilter {
+ public:
+ explicit ConditionalFilter(const std::string* filtered_value)
+ : filtered_value_(filtered_value) {}
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& value,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ return value.ToString() == *filtered_value_;
+ }
+
+ const char* Name() const override { return "ConditionalFilter"; }
+
+ private:
+ const std::string* filtered_value_;
+};
+
+class ChangeFilter : public CompactionFilter {
+ public:
+ explicit ChangeFilter() {}
+
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* new_value, bool* value_changed) const override {
+ assert(new_value != nullptr);
+ *new_value = NEW_VALUE;
+ *value_changed = true;
+ return false;
+ }
+
+ const char* Name() const override { return "ChangeFilter"; }
+};
+
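+// Creates KeepFilter instances; optionally checks that the compaction
+// context (full/manual compaction flags, column family id) matches the
+// values the test expects.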
+class KeepFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit KeepFilterFactory(bool check_context = false,
+ bool check_context_cf_id = false)
+ : check_context_(check_context),
+ check_context_cf_id_(check_context_cf_id),
+ compaction_filter_created_(false) {}
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ if (check_context_) {
+ EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction);
+ EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction);
+ }
+ if (check_context_cf_id_) {
+ EXPECT_EQ(expect_cf_id_.load(), context.column_family_id);
+ }
+ compaction_filter_created_ = true;
+ return std::unique_ptr<CompactionFilter>(new KeepFilter());
+ }
+
+ bool compaction_filter_created() const { return compaction_filter_created_; }
+
+ const char* Name() const override { return "KeepFilterFactory"; }
+ bool check_context_;
+ bool check_context_cf_id_;
+ std::atomic_bool expect_full_compaction_;
+ std::atomic_bool expect_manual_compaction_;
+ std::atomic<uint32_t> expect_cf_id_;
+ bool compaction_filter_created_;
+};
+
+class DeleteFilterFactory : public CompactionFilterFactory {
+ public:
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ if (context.is_manual_compaction) {
+ return std::unique_ptr<CompactionFilter>(new DeleteFilter());
+ } else {
+ return std::unique_ptr<CompactionFilter>(nullptr);
+ }
+ }
+
+ const char* Name() const override { return "DeleteFilterFactory"; }
+};
+
+// Delete Filter Factory which ignores snapshots
+class DeleteISFilterFactory : public CompactionFilterFactory {
+ public:
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ if (context.is_manual_compaction) {
+ return std::unique_ptr<CompactionFilter>(new DeleteISFilter());
+ } else {
+ return std::unique_ptr<CompactionFilter>(nullptr);
+ }
+ }
+
+ const char* Name() const override { return "DeleteFilterFactory"; }
+};
+
+class SkipEvenFilterFactory : public CompactionFilterFactory {
+ public:
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ if (context.is_manual_compaction) {
+ return std::unique_ptr<CompactionFilter>(new SkipEvenFilter());
+ } else {
+ return std::unique_ptr<CompactionFilter>(nullptr);
+ }
+ }
+
+ const char* Name() const override { return "SkipEvenFilterFactory"; }
+};
+
+class DelayFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {}
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& /*context*/) override {
+ return std::unique_ptr<CompactionFilter>(new DelayFilter(db_test));
+ }
+
+ const char* Name() const override { return "DelayFilterFactory"; }
+
+ private:
+ DBTestBase* db_test;
+};
+
+class ConditionalFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit ConditionalFilterFactory(const Slice& filtered_value)
+ : filtered_value_(filtered_value.ToString()) {}
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& /*context*/) override {
+ return std::unique_ptr<CompactionFilter>(
+ new ConditionalFilter(&filtered_value_));
+ }
+
+ const char* Name() const override { return "ConditionalFilterFactory"; }
+
+ private:
+ std::string filtered_value_;
+};
+
+class ChangeFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit ChangeFilterFactory() {}
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& /*context*/) override {
+ return std::unique_ptr<CompactionFilter>(new ChangeFilter());
+ }
+
+ const char* Name() const override { return "ChangeFilterFactory"; }
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTestCompactionFilter, CompactionFilter) {
+ Options options = CurrentOptions();
+ options.max_open_files = -1;
+ options.num_levels = 3;
+ options.compaction_filter_factory = std::make_shared<KeepFilterFactory>();
+ options = CurrentOptions(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Write 100K keys, these are written to a few files in L0.
+ const std::string value(10, 'x');
+ for (int i = 0; i < 100000; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%010d", i);
+ Put(1, key, value);
+ }
+ ASSERT_OK(Flush(1));
+
+ // Push all files to the highest level L2. Verify that
+ // the compaction at each level invokes the filter for
+ // all the keys in that level.
+ cfilter_count = 0;
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+ ASSERT_EQ(cfilter_count, 100000);
+ cfilter_count = 0;
+ dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+ ASSERT_EQ(cfilter_count, 100000);
+
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+ ASSERT_NE(NumTableFilesAtLevel(2, 1), 0);
+ cfilter_count = 0;
+
+ // All the files are in the lowest level.
+ // Verify that every record now has sequence number zero; the compaction
+ // to the bottom level zeroes out sequence numbers since no snapshot
+ // needs them.
+ int count = 0;
+ int total = 0;
+ Arena arena;
+ {
+ InternalKeyComparator icmp(options.comparator);
+ ReadRangeDelAggregator range_del_agg(&icmp,
+ kMaxSequenceNumber /* upper_bound */);
+ ScopedArenaIterator iter(dbfull()->NewInternalIterator(
+ &arena, &range_del_agg, kMaxSequenceNumber, handles_[1]));
+ iter->SeekToFirst();
+ ASSERT_OK(iter->status());
+ while (iter->Valid()) {
+ ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+ ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+ total++;
+ if (ikey.sequence != 0) {
+ count++;
+ }
+ iter->Next();
+ }
+ }
+ ASSERT_EQ(total, 100000);
+ ASSERT_EQ(count, 0);
+
+ // overwrite all the 100K keys once again.
+ for (int i = 0; i < 100000; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%010d", i);
+ ASSERT_OK(Put(1, key, value));
+ }
+ ASSERT_OK(Flush(1));
+
+ // push all files to the highest level L2. This
+ // means that all keys should pass through the compaction filter
+ // at least once
+ cfilter_count = 0;
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+ ASSERT_EQ(cfilter_count, 100000);
+ cfilter_count = 0;
+ dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+ ASSERT_EQ(cfilter_count, 100000);
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+ ASSERT_NE(NumTableFilesAtLevel(2, 1), 0);
+
+ // create a new database with the compaction
+ // filter in such a way that it deletes all keys
+ options.compaction_filter_factory = std::make_shared<DeleteFilterFactory>();
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // write all the keys once again.
+ for (int i = 0; i < 100000; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%010d", i);
+ ASSERT_OK(Put(1, key, value));
+ }
+ ASSERT_OK(Flush(1));
+ ASSERT_NE(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2, 1), 0);
+
+ // Push all files to the highest level L2. This
+ // triggers the compaction filter to delete all keys,
+ // verify that at the end of the compaction process,
+ // nothing is left.
+ cfilter_count = 0;
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+ ASSERT_EQ(cfilter_count, 100000);
+ cfilter_count = 0;
+ dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+ ASSERT_EQ(cfilter_count, 0);
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+
+ {
+ // Scan the entire database to ensure that nothing is left
+ std::unique_ptr<Iterator> iter(
+ db_->NewIterator(ReadOptions(), handles_[1]));
+ iter->SeekToFirst();
+ count = 0;
+ while (iter->Valid()) {
+ count++;
+ iter->Next();
+ }
+ ASSERT_EQ(count, 0);
+ }
+
+ // The sequence number of the remaining record
+ // is not zeroed out even though it is at the
+ // level Lmax because this record is at the tip
+ count = 0;
+ {
+ InternalKeyComparator icmp(options.comparator);
+ ReadRangeDelAggregator range_del_agg(&icmp,
+ kMaxSequenceNumber /* upper_bound */);
+ ScopedArenaIterator iter(dbfull()->NewInternalIterator(
+ &arena, &range_del_agg, kMaxSequenceNumber, handles_[1]));
+ iter->SeekToFirst();
+ ASSERT_OK(iter->status());
+ while (iter->Valid()) {
+ ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+ ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+ ASSERT_NE(ikey.sequence, (unsigned)0);
+ count++;
+ iter->Next();
+ }
+ ASSERT_EQ(count, 0);
+ }
+}
+
+// Tests the edge case where compaction does not produce any output -- all
+// entries are deleted. The compaction should create a bunch of 'DeleteFile'
+// entries in VersionEdit, but none of the 'AddFile's.
+TEST_F(DBTestCompactionFilter, CompactionFilterDeletesAll) {
+ Options options = CurrentOptions();
+ options.compaction_filter_factory = std::make_shared<DeleteFilterFactory>();
+ options.disable_auto_compactions = true;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ // put some data
+ for (int table = 0; table < 4; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ Put(ToString(table * 100 + i), "val");
+ }
+ Flush();
+ }
+
+ // this will produce empty file (delete compaction filter)
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(0U, CountLiveFiles());
+
+ Reopen(options);
+
+ Iterator* itr = db_->NewIterator(ReadOptions());
+ itr->SeekToFirst();
+ // empty db
+ ASSERT_TRUE(!itr->Valid());
+
+ delete itr;
+}
+#endif // ROCKSDB_LITE
+
+TEST_P(DBTestCompactionFilterWithCompactParam,
+ CompactionFilterWithValueChange) {
+ Options options = CurrentOptions();
+ options.num_levels = 3;
+ options.compaction_filter_factory = std::make_shared<ChangeFilterFactory>();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Write 100K+1 keys; these are written to a few files in L0. We do this
+ // so that the current snapshot points to the 100001st key. The compaction
+ // filter is not invoked on keys that are visible via a snapshot because
+ // we cannot delete them anyway.
+ const std::string value(10, 'x');
+ for (int i = 0; i < 100001; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%010d", i);
+ Put(1, key, value);
+ }
+
+ // push all files to lower levels
+ ASSERT_OK(Flush(1));
+ if (option_config_ != kUniversalCompactionMultiLevel &&
+ option_config_ != kUniversalSubcompactions) {
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+ dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+ } else {
+ dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr);
+ }
+
+ // re-write all data again
+ for (int i = 0; i < 100001; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%010d", i);
+ Put(1, key, value);
+ }
+
+ // push all files to lower levels. This should
+ // invoke the compaction filter for all 100000 keys.
+ ASSERT_OK(Flush(1));
+ if (option_config_ != kUniversalCompactionMultiLevel &&
+ option_config_ != kUniversalSubcompactions) {
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+ dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+ } else {
+ dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr);
+ }
+
+ // verify that all keys now have the new value that
+ // was set by the compaction process.
+ for (int i = 0; i < 100001; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%010d", i);
+ std::string newvalue = Get(1, key);
+ ASSERT_EQ(newvalue.compare(NEW_VALUE), 0);
+ }
+}
+
+TEST_F(DBTestCompactionFilter, CompactionFilterWithMergeOperator) {
+ std::string one, two, three, four;
+ PutFixed64(&one, 1);
+ PutFixed64(&two, 2);
+ PutFixed64(&three, 3);
+ PutFixed64(&four, 4);
+
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ options.num_levels = 3;
+ // Filter out keys whose value is 2.
+ options.compaction_filter_factory =
+ std::make_shared<ConditionalFilterFactory>(two);
+ DestroyAndReopen(options);
+
+ // In the same compaction, a value-type entry needs to be deleted based on
+ // the compaction filter, and there is a merge-type entry for the key. The
+ // compaction filter result is ignored.
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", two));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->Merge(WriteOptions(), "foo", one));
+ ASSERT_OK(Flush());
+ std::string newvalue = Get("foo");
+ ASSERT_EQ(newvalue, three);
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ newvalue = Get("foo");
+ ASSERT_EQ(newvalue, three);
+
+ // A value-type entry can be deleted based on the compaction filter,
+ // leaving only merge-type entries.
+ ASSERT_OK(db_->Put(WriteOptions(), "bar", two));
+ ASSERT_OK(Flush());
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ newvalue = Get("bar");
+ ASSERT_EQ("NOT_FOUND", newvalue);
+ ASSERT_OK(db_->Merge(WriteOptions(), "bar", two));
+ ASSERT_OK(Flush());
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ newvalue = Get("bar");
+ ASSERT_EQ(newvalue, two);
+
+ // Compaction filter never applies to merge keys.
+ ASSERT_OK(db_->Put(WriteOptions(), "foobar", one));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->Merge(WriteOptions(), "foobar", two));
+ ASSERT_OK(Flush());
+ newvalue = Get("foobar");
+ ASSERT_EQ(newvalue, three);
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ newvalue = Get("foobar");
+ ASSERT_EQ(newvalue, three);
+
+ // In the same compaction, both the value-type and merge-type entries for
+ // the key need to be deleted based on the compaction filter. For both
+ // entries, the compaction filter results are ignored.
+ ASSERT_OK(db_->Put(WriteOptions(), "barfoo", two));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->Merge(WriteOptions(), "barfoo", two));
+ ASSERT_OK(Flush());
+ newvalue = Get("barfoo");
+ ASSERT_EQ(newvalue, four);
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ newvalue = Get("barfoo");
+ ASSERT_EQ(newvalue, four);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTestCompactionFilter, CompactionFilterContextManual) {
+ KeepFilterFactory* filter = new KeepFilterFactory(true, true);
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_filter_factory.reset(filter);
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 8;
+ Reopen(options);
+ int num_keys_per_file = 400;
+ for (int j = 0; j < 3; j++) {
+ // Write several keys.
+ const std::string value(10, 'x');
+ for (int i = 0; i < num_keys_per_file; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%08d%02d", i, j);
+ Put(key, value);
+ }
+ dbfull()->TEST_FlushMemTable();
+ // Make sure next file is much smaller so automatic compaction will not
+ // be triggered.
+ num_keys_per_file /= 2;
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ // Force a manual compaction
+ cfilter_count = 0;
+ filter->expect_manual_compaction_.store(true);
+ filter->expect_full_compaction_.store(true);
+ filter->expect_cf_id_.store(0);
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(cfilter_count, 700);
+ ASSERT_EQ(NumSortedRuns(0), 1);
+ ASSERT_TRUE(filter->compaction_filter_created());
+
+ // Verify total number of keys is correct after manual compaction.
+ {
+ int count = 0;
+ int total = 0;
+ Arena arena;
+ InternalKeyComparator icmp(options.comparator);
+ ReadRangeDelAggregator range_del_agg(&icmp,
+ kMaxSequenceNumber /* upper_bound */);
+ ScopedArenaIterator iter(dbfull()->NewInternalIterator(
+ &arena, &range_del_agg, kMaxSequenceNumber));
+ iter->SeekToFirst();
+ ASSERT_OK(iter->status());
+ while (iter->Valid()) {
+ ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+ ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+ total++;
+ if (ikey.sequence != 0) {
+ count++;
+ }
+ iter->Next();
+ }
+ ASSERT_EQ(total, 700);
+ ASSERT_EQ(count, 0);
+ }
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTestCompactionFilter, CompactionFilterContextCfId) {
+ KeepFilterFactory* filter = new KeepFilterFactory(false, true);
+ filter->expect_cf_id_.store(1);
+
+ Options options = CurrentOptions();
+ options.compaction_filter_factory.reset(filter);
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 2;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ int num_keys_per_file = 400;
+ for (int j = 0; j < 3; j++) {
+ // Write several keys.
+ const std::string value(10, 'x');
+ for (int i = 0; i < num_keys_per_file; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%08d%02d", i, j);
+ Put(1, key, value);
+ }
+ Flush(1);
+ // Make sure next file is much smaller so automatic compaction will not
+ // be triggered.
+ num_keys_per_file /= 2;
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_TRUE(filter->compaction_filter_created());
+}
+
+#ifndef ROCKSDB_LITE
+// Compaction filters apply to all records, regardless of snapshots.
+TEST_F(DBTestCompactionFilter, CompactionFilterIgnoreSnapshot) {
+ std::string five = ToString(5);
+ Options options = CurrentOptions();
+ options.compaction_filter_factory = std::make_shared<DeleteISFilterFactory>();
+ options.disable_auto_compactions = true;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ // Put some data.
+ const Snapshot* snapshot = nullptr;
+ for (int table = 0; table < 4; ++table) {
+ for (int i = 0; i < 10; ++i) {
+ Put(ToString(table * 100 + i), "val");
+ }
+ Flush();
+
+ if (table == 0) {
+ snapshot = db_->GetSnapshot();
+ }
+ }
+ assert(snapshot != nullptr);
+
+ cfilter_count = 0;
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ // The filter should be invoked for all 40 records.
+ ASSERT_EQ(40, cfilter_count);
+
+ {
+ // Scan the entire database as of the snapshot to ensure
+ // that nothing is left
+ ReadOptions read_options;
+ read_options.snapshot = snapshot;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ iter->SeekToFirst();
+ int count = 0;
+ while (iter->Valid()) {
+ count++;
+ iter->Next();
+ }
+ ASSERT_EQ(count, 6);
+ read_options.snapshot = nullptr;
+ std::unique_ptr<Iterator> iter1(db_->NewIterator(read_options));
+ iter1->SeekToFirst();
+ count = 0;
+ while (iter1->Valid()) {
+ count++;
+ iter1->Next();
+ }
+ // We have deleted 10 keys from 40 using the compaction filter
+ // Keys 6-9 before the snapshot and 100-105 after the snapshot
+ ASSERT_EQ(count, 30);
+ }
+
+ // Release the snapshot and compact again -> now all records should be
+ // removed.
+ db_->ReleaseSnapshot(snapshot);
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTestCompactionFilter, SkipUntil) {
+ Options options = CurrentOptions();
+ options.compaction_filter_factory = std::make_shared<SkipEvenFilterFactory>();
+ options.disable_auto_compactions = true;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ // Write keys to a few files in L0.
+ for (int table = 0; table < 4; ++table) {
+ // Key ranges in tables are [0, 38], [106, 149], [212, 260], [318, 371].
+ for (int i = table * 6; i < 39 + table * 11; ++i) {
+ char key[100];
+ snprintf(key, sizeof(key), "%010d", table * 100 + i);
+ Put(key, std::to_string(table * 1000 + i));
+ }
+ Flush();
+ }
+
+ cfilter_skips = 0;
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ // Number of skips in tables: 2, 3, 3, 3.
+ ASSERT_EQ(11, cfilter_skips);
+
+ for (int table = 0; table < 4; ++table) {
+ for (int i = table * 6; i < 39 + table * 11; ++i) {
+ int k = table * 100 + i;
+ char key[100];
+ snprintf(key, sizeof(key), "%010d", table * 100 + i);
+ auto expected = std::to_string(table * 1000 + i);
+ std::string val;
+ Status s = db_->Get(ReadOptions(), key, &val);
+ if (k / 10 % 2 == 0) {
+ ASSERT_TRUE(s.IsNotFound());
+ } else {
+ ASSERT_OK(s);
+ ASSERT_EQ(expected, val);
+ }
+ }
+ }
+}
+
+TEST_F(DBTestCompactionFilter, SkipUntilWithBloomFilter) {
+ BlockBasedTableOptions table_options;
+ table_options.whole_key_filtering = false;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(100, false));
+
+ Options options = CurrentOptions();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewCappedPrefixTransform(9));
+ options.compaction_filter_factory = std::make_shared<SkipEvenFilterFactory>();
+ options.disable_auto_compactions = true;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ Put("0000000010", "v10");
+ Put("0000000020", "v20"); // skipped
+ Put("0000000050", "v50");
+ Flush();
+
+ cfilter_skips = 0;
+ EXPECT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ EXPECT_EQ(1, cfilter_skips);
+
+ Status s;
+ std::string val;
+
+ s = db_->Get(ReadOptions(), "0000000010", &val);
+ ASSERT_OK(s);
+ EXPECT_EQ("v10", val);
+
+ s = db_->Get(ReadOptions(), "0000000020", &val);
+ EXPECT_TRUE(s.IsNotFound());
+
+ s = db_->Get(ReadOptions(), "0000000050", &val);
+ ASSERT_OK(s);
+ EXPECT_EQ("v50", val);
+}
+
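+// A filter that does not ignore snapshots; the test below expects manual
+// compaction to fail with Status::NotSupported for such a filter.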
+class TestNotSupportedFilter : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ return true;
+ }
+
+ const char* Name() const override { return "NotSupported"; }
+ bool IgnoreSnapshots() const override { return false; }
+};
+
+TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalse) {
+ Options options = CurrentOptions();
+ options.compaction_filter = new TestNotSupportedFilter();
+ DestroyAndReopen(options);
+
+ Put("a", "v10");
+ Put("z", "v20");
+ Flush();
+
+ Put("a", "v10");
+ Put("z", "v20");
+ Flush();
+
+ // Compaction should fail because IgnoreSnapshots() = false
+ EXPECT_TRUE(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)
+ .IsNotSupported());
+
+ delete options.compaction_filter;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_compaction_test.cc b/src/rocksdb/db/db_compaction_test.cc
new file mode 100644
index 000000000..635aca135
--- /dev/null
+++ b/src/rocksdb/db/db_compaction_test.cc
@@ -0,0 +1,5167 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/concurrent_task_limiter.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/sst_file_writer.h"
+#include "rocksdb/utilities/convenience.h"
+#include "test_util/fault_injection_test_env.h"
+#include "test_util/sync_point.h"
+#include "util/concurrent_task_limiter_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// SYNC_POINT is not supported in released Windows mode.
+#if !defined(ROCKSDB_LITE)
+
+class DBCompactionTest : public DBTestBase {
+ public:
+ DBCompactionTest() : DBTestBase("/db_compaction_test") {}
+};
+
+class DBCompactionTestWithParam
+ : public DBTestBase,
+ public testing::WithParamInterface<std::tuple<uint32_t, bool>> {
+ public:
+ DBCompactionTestWithParam() : DBTestBase("/db_compaction_test") {
+ max_subcompactions_ = std::get<0>(GetParam());
+ exclusive_manual_compaction_ = std::get<1>(GetParam());
+ }
+
+ // Required if inheriting from testing::WithParamInterface<>
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ uint32_t max_subcompactions_;
+ bool exclusive_manual_compaction_;
+};
+
+class DBCompactionDirectIOTest : public DBCompactionTest,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ DBCompactionDirectIOTest() : DBCompactionTest() {}
+};
+
+namespace {
+
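+// Listener that records the paths of all SST files produced by flushes.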
+class FlushedFileCollector : public EventListener {
+ public:
+ FlushedFileCollector() {}
+ ~FlushedFileCollector() override {}
+
+ void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+ std::lock_guard<std::mutex> lock(mutex_);
+ flushed_files_.push_back(info.file_path);
+ }
+
+ std::vector<std::string> GetFlushedFiles() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ std::vector<std::string> result;
+ for (auto fname : flushed_files_) {
+ result.push_back(fname);
+ }
+ return result;
+ }
+
+ void ClearFlushedFiles() { flushed_files_.clear(); }
+
+ private:
+ std::vector<std::string> flushed_files_;
+ std::mutex mutex_;
+};
+
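+// Listener that counts completed compactions, flushes and external file
+// ingestions per CompactionReason.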
+class CompactionStatsCollector : public EventListener {
+public:
+ CompactionStatsCollector()
+ : compaction_completed_(static_cast<int>(CompactionReason::kNumOfReasons)) {
+ for (auto& v : compaction_completed_) {
+ v.store(0);
+ }
+ }
+
+ ~CompactionStatsCollector() override {}
+
+ void OnCompactionCompleted(DB* /* db */,
+ const CompactionJobInfo& info) override {
+ int k = static_cast<int>(info.compaction_reason);
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ assert(k >= 0 && k < num_of_reasons);
+ compaction_completed_[k]++;
+ }
+
+ void OnExternalFileIngested(
+ DB* /* db */, const ExternalFileIngestionInfo& /* info */) override {
+ int k = static_cast<int>(CompactionReason::kExternalSstIngestion);
+ compaction_completed_[k]++;
+ }
+
+ void OnFlushCompleted(DB* /* db */, const FlushJobInfo& /* info */) override {
+ int k = static_cast<int>(CompactionReason::kFlush);
+ compaction_completed_[k]++;
+ }
+
+ int NumberOfCompactions(CompactionReason reason) const {
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ int k = static_cast<int>(reason);
+ assert(k >= 0 && k < num_of_reasons);
+ return compaction_completed_.at(k).load();
+ }
+
+private:
+ std::vector<std::atomic<int>> compaction_completed_;
+};
+
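+// Listener that counts how many SST file creations have started.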
+class SstStatsCollector : public EventListener {
+ public:
+ SstStatsCollector() : num_ssts_creation_started_(0) {}
+
+ void OnTableFileCreationStarted(
+ const TableFileCreationBriefInfo& /* info */) override {
+ ++num_ssts_creation_started_;
+ }
+
+ int num_ssts_creation_started() { return num_ssts_creation_started_; }
+
+ private:
+ std::atomic<int> num_ssts_creation_started_;
+};
+
+static const int kCDTValueSize = 1000;
+static const int kCDTKeysPerBuffer = 4;
+static const int kCDTNumLevels = 8;
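+// Options with tiny write buffers and an aggressive level-0 trigger so that
+// deletions quickly lead to compactions in the deletion-trigger tests.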
+Options DeletionTriggerOptions(Options options) {
+ options.compression = kNoCompression;
+ options.write_buffer_size = kCDTKeysPerBuffer * (kCDTValueSize + 24);
+ options.min_write_buffer_number_to_merge = 1;
+ options.max_write_buffer_size_to_maintain = 0;
+ options.num_levels = kCDTNumLevels;
+ options.level0_file_num_compaction_trigger = 1;
+ options.target_file_size_base = options.write_buffer_size * 2;
+ options.target_file_size_multiplier = 2;
+ options.max_bytes_for_level_base =
+ options.target_file_size_base * options.target_file_size_multiplier;
+ options.max_bytes_for_level_multiplier = 2;
+ options.disable_auto_compactions = false;
+ return options;
+}
+
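+// Returns true if the key ranges of SST files "a" and "b" overlap under
+// comparator "c".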
+bool HaveOverlappingKeyRanges(
+ const Comparator* c,
+ const SstFileMetaData& a, const SstFileMetaData& b) {
+ if (c->Compare(a.smallestkey, b.smallestkey) >= 0) {
+ if (c->Compare(a.smallestkey, b.largestkey) <= 0) {
+ // b.smallestkey <= a.smallestkey <= b.largestkey
+ return true;
+ }
+ } else if (c->Compare(a.largestkey, b.smallestkey) >= 0) {
+ // a.smallestkey < b.smallestkey <= a.largestkey
+ return true;
+ }
+ if (c->Compare(a.largestkey, b.largestkey) <= 0) {
+ if (c->Compare(a.largestkey, b.smallestkey) >= 0) {
+ // b.smallestkey <= a.largestkey <= b.largestkey
+ return true;
+ }
+ } else if (c->Compare(a.smallestkey, b.largestkey) <= 0) {
+ // a.smallestkey <= b.largestkey < a.largestkey
+ return true;
+ }
+ return false;
+}
+
+// Identifies all files between level "min_level" and "max_level"
+// which have an overlapping key range with "input_file_meta".
+void GetOverlappingFileNumbersForLevelCompaction(
+ const ColumnFamilyMetaData& cf_meta,
+ const Comparator* comparator,
+ int min_level, int max_level,
+ const SstFileMetaData* input_file_meta,
+ std::set<std::string>* overlapping_file_names) {
+ std::set<const SstFileMetaData*> overlapping_files;
+ overlapping_files.insert(input_file_meta);
+ for (int m = min_level; m <= max_level; ++m) {
+ for (auto& file : cf_meta.levels[m].files) {
+ for (auto* included_file : overlapping_files) {
+ if (HaveOverlappingKeyRanges(
+ comparator, *included_file, file)) {
+ overlapping_files.insert(&file);
+ overlapping_file_names->insert(file.name);
+ break;
+ }
+ }
+ }
+ }
+}
+
+void VerifyCompactionResult(
+ const ColumnFamilyMetaData& cf_meta,
+ const std::set<std::string>& overlapping_file_numbers) {
+#ifndef NDEBUG
+ for (auto& level : cf_meta.levels) {
+ for (auto& file : level.files) {
+ assert(overlapping_file_numbers.find(file.name) ==
+ overlapping_file_numbers.end());
+ }
+ }
+#endif
+}
+
+/*
+ * Verifies compaction stats of cfd are valid.
+ *
+ * For each level of cfd, its compaction stats are valid if
+ * 1) sum(stat.counts) == stat.count, and
+ * 2) stat.counts[i] == collector.NumberOfCompactions(i)
+ */
+void VerifyCompactionStats(ColumnFamilyData& cfd,
+ const CompactionStatsCollector& collector) {
+#ifndef NDEBUG
+ InternalStats* internal_stats_ptr = cfd.internal_stats();
+ ASSERT_TRUE(internal_stats_ptr != nullptr);
+ const std::vector<InternalStats::CompactionStats>& comp_stats =
+ internal_stats_ptr->TEST_GetCompactionStats();
+ const int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ std::vector<int> counts(num_of_reasons, 0);
+ // Count the number of compactions caused by each CompactionReason across
+ // all levels.
+ for (const auto& stat : comp_stats) {
+ int sum = 0;
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] += stat.counts[i];
+ sum += stat.counts[i];
+ }
+ ASSERT_EQ(sum, stat.count);
+ }
+ // Verify InternalStats bookkeeping matches that of CompactionStatsCollector,
+ // assuming that all compactions complete.
+ for (int i = 0; i < num_of_reasons; i++) {
+ ASSERT_EQ(collector.NumberOfCompactions(static_cast<CompactionReason>(i)), counts[i]);
+ }
+#endif /* NDEBUG */
+}
+
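+// Picks a random SST file from the column family metadata; if "level" is
+// non-null, it is set to the level of the chosen file.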
+const SstFileMetaData* PickFileRandomly(
+ const ColumnFamilyMetaData& cf_meta,
+ Random* rand,
+ int* level = nullptr) {
+ auto file_id = rand->Uniform(static_cast<int>(
+ cf_meta.file_count)) + 1;
+ for (auto& level_meta : cf_meta.levels) {
+ if (file_id <= level_meta.files.size()) {
+ if (level != nullptr) {
+ *level = level_meta.level;
+ }
+ auto result = rand->Uniform(file_id);
+ return &(level_meta.files[result]);
+ }
+ file_id -= static_cast<uint32_t>(level_meta.files.size());
+ }
+ assert(false);
+ return nullptr;
+}
+} // anonymous namespace
+
+#ifndef ROCKSDB_VALGRIND_RUN
+// All the TEST_P tests run once with sub_compactions disabled (i.e.
+// options.max_subcompactions = 1) and once with it enabled
+TEST_P(DBCompactionTestWithParam, CompactionDeletionTrigger) {
+ for (int tid = 0; tid < 3; ++tid) {
+ uint64_t db_size[2];
+ Options options = DeletionTriggerOptions(CurrentOptions());
+ options.max_subcompactions = max_subcompactions_;
+
+ if (tid == 1) {
+ // the following only disables stats update in DB::Open()
+ // and should not affect the result of this test.
+ options.skip_stats_update_on_db_open = true;
+ } else if (tid == 2) {
+ // third pass with universal compaction
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ }
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ const int kTestSize = kCDTKeysPerBuffer * 1024;
+ std::vector<std::string> values;
+ for (int k = 0; k < kTestSize; ++k) {
+ values.push_back(RandomString(&rnd, kCDTValueSize));
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ db_size[0] = Size(Key(0), Key(kTestSize - 1));
+
+ for (int k = 0; k < kTestSize; ++k) {
+ ASSERT_OK(Delete(Key(k)));
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ db_size[1] = Size(Key(0), Key(kTestSize - 1));
+
+ // must have much smaller db size.
+ ASSERT_GT(db_size[0] / 3, db_size[1]);
+ }
+}
+#endif // ROCKSDB_VALGRIND_RUN
+
+TEST_P(DBCompactionTestWithParam, CompactionsPreserveDeletes) {
+ // For each options type we test the following:
+ // - Enable preserve_deletes
+ // - Write a bunch of keys and deletes
+ // - Set start_seqnum to the beginning; compact; check that keys are present
+ // - Advance start_seqnum far forward; compact; check that keys are gone
+
+ for (int tid = 0; tid < 3; ++tid) {
+ Options options = DeletionTriggerOptions(CurrentOptions());
+ options.max_subcompactions = max_subcompactions_;
+ options.preserve_deletes=true;
+ options.num_levels = 2;
+
+ if (tid == 1) {
+ options.skip_stats_update_on_db_open = true;
+ } else if (tid == 2) {
+ // third pass with universal compaction
+ options.compaction_style = kCompactionStyleUniversal;
+ }
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+ // highlight the default; all deletes should be preserved
+ SetPreserveDeletesSequenceNumber(0);
+
+ const int kTestSize = kCDTKeysPerBuffer;
+ std::vector<std::string> values;
+ for (int k = 0; k < kTestSize; ++k) {
+ values.push_back(RandomString(&rnd, kCDTValueSize));
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+
+ for (int k = 0; k < kTestSize; ++k) {
+ ASSERT_OK(Delete(Key(k)));
+ }
+ // to ensure we tackle all tombstones
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ cro.bottommost_level_compaction =
+ BottommostLevelCompaction::kForceOptimized;
+
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->CompactRange(cro, nullptr, nullptr);
+
+ // check that normal user iterator doesn't see anything
+ Iterator* db_iter = dbfull()->NewIterator(ReadOptions());
+ int i = 0;
+ for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+ i++;
+ }
+ ASSERT_EQ(i, 0);
+ delete db_iter;
+
+ // check that iterator that sees internal keys sees tombstones
+ ReadOptions ro;
+ ro.iter_start_seqnum=1;
+ db_iter = dbfull()->NewIterator(ro);
+ i = 0;
+ for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+ i++;
+ }
+ ASSERT_EQ(i, 4);
+ delete db_iter;
+
+ // now all deletes should be gone
+ SetPreserveDeletesSequenceNumber(100000000);
+ dbfull()->CompactRange(cro, nullptr, nullptr);
+
+ db_iter = dbfull()->NewIterator(ro);
+ i = 0;
+ for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+ i++;
+ }
+ ASSERT_EQ(i, 0);
+ delete db_iter;
+ }
+}
+
+TEST_F(DBCompactionTest, SkipStatsUpdateTest) {
+ // This test verifies that UpdateAccumulatedStats is not called
+ // if options.skip_stats_update_on_db_open = true.
+ // The test will need to be updated if the internal behavior changes.
+
+ Options options = DeletionTriggerOptions(CurrentOptions());
+ options.disable_auto_compactions = true;
+ options.env = env_;
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ const int kTestSize = kCDTKeysPerBuffer * 512;
+ std::vector<std::string> values;
+ for (int k = 0; k < kTestSize; ++k) {
+ values.push_back(RandomString(&rnd, kCDTValueSize));
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+
+ ASSERT_OK(Flush());
+
+ Close();
+
+ int update_acc_stats_called = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionStorageInfo::UpdateAccumulatedStats",
+ [&](void* /* arg */) { ++update_acc_stats_called; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // Reopen the DB with stats-update disabled
+ options.skip_stats_update_on_db_open = true;
+ options.max_open_files = 20;
+ Reopen(options);
+
+ ASSERT_EQ(update_acc_stats_called, 0);
+
+ // Repeat the reopen process, but this time we enable
+ // stats-update.
+ options.skip_stats_update_on_db_open = false;
+ Reopen(options);
+
+ ASSERT_GT(update_acc_stats_called, 0);
+
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, TestTableReaderForCompaction) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.new_table_reader_for_compaction_inputs = true;
+ options.max_open_files = 20;
+ options.level0_file_num_compaction_trigger = 3;
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ int num_table_cache_lookup = 0;
+ int num_new_table_reader = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "TableCache::FindTable:0", [&](void* arg) {
+ assert(arg != nullptr);
+ bool no_io = *(reinterpret_cast<bool*>(arg));
+ if (!no_io) {
+ // filter out cases for table properties queries.
+ num_table_cache_lookup++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "TableCache::GetTableReader:0",
+ [&](void* /*arg*/) { num_new_table_reader++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int k = 0; k < options.level0_file_num_compaction_trigger; ++k) {
+ ASSERT_OK(Put(Key(k), Key(k)));
+ ASSERT_OK(Put(Key(10 - k), "bar"));
+ if (k < options.level0_file_num_compaction_trigger - 1) {
+ num_table_cache_lookup = 0;
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ // preloading the iterator issues one table cache lookup and creates
+ // a new table reader, if not preloaded.
+ int old_num_table_cache_lookup = num_table_cache_lookup;
+ ASSERT_GE(num_table_cache_lookup, 1);
+ ASSERT_EQ(num_new_table_reader, 1);
+
+ num_table_cache_lookup = 0;
+ num_new_table_reader = 0;
+ ASSERT_EQ(Key(k), Get(Key(k)));
+ // lookup iterator from table cache and no need to create a new one.
+ ASSERT_EQ(old_num_table_cache_lookup + num_table_cache_lookup, 2);
+ ASSERT_EQ(num_new_table_reader, 0);
+ }
+ }
+
+ num_table_cache_lookup = 0;
+ num_new_table_reader = 0;
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ // Preloading iterator issues one table cache lookup and creates
+ // a new table reader. One file is created for flush and one for compaction.
+ // Compaction inputs make no table cache look-up for data/range deletion
+ // iterators
+ // May preload table cache too.
+ ASSERT_GE(num_table_cache_lookup, 2);
+ int old_num_table_cache_lookup2 = num_table_cache_lookup;
+
+ // Create new iterator for:
+ // (1) 1 for verifying flush results
+ // (2) 1 for verifying compaction results.
+ // (3) New TableReaders will not be created for compaction inputs
+ ASSERT_EQ(num_new_table_reader, 2);
+
+ num_table_cache_lookup = 0;
+ num_new_table_reader = 0;
+ ASSERT_EQ(Key(1), Get(Key(1)));
+ ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 5);
+ ASSERT_EQ(num_new_table_reader, 0);
+
+ num_table_cache_lookup = 0;
+ num_new_table_reader = 0;
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+ db_->CompactRange(cro, nullptr, nullptr);
+ // Only verifying compaction outputs issues one table cache lookup
+ // (for both the data block and the range deletion block).
+ // May preload the table cache too.
+ ASSERT_GE(num_table_cache_lookup, 1);
+ old_num_table_cache_lookup2 = num_table_cache_lookup;
+ // One for verifying compaction results.
+ // No new iterator created for compaction.
+ ASSERT_EQ(num_new_table_reader, 1);
+
+ num_table_cache_lookup = 0;
+ num_new_table_reader = 0;
+ ASSERT_EQ(Key(1), Get(Key(1)));
+ ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 3);
+ ASSERT_EQ(num_new_table_reader, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(DBCompactionTestWithParam, CompactionDeletionTriggerReopen) {
+ for (int tid = 0; tid < 2; ++tid) {
+ uint64_t db_size[3];
+ Options options = DeletionTriggerOptions(CurrentOptions());
+ options.max_subcompactions = max_subcompactions_;
+
+ if (tid == 1) {
+ // second pass with universal compaction
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ }
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ // round 1 --- insert key/value pairs.
+ const int kTestSize = kCDTKeysPerBuffer * 512;
+ std::vector<std::string> values;
+ for (int k = 0; k < kTestSize; ++k) {
+ values.push_back(RandomString(&rnd, kCDTValueSize));
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ db_size[0] = Size(Key(0), Key(kTestSize - 1));
+ Close();
+
+ // round 2 --- disable auto-compactions and issue deletions.
+ options.create_if_missing = false;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+
+ for (int k = 0; k < kTestSize; ++k) {
+ ASSERT_OK(Delete(Key(k)));
+ }
+ db_size[1] = Size(Key(0), Key(kTestSize - 1));
+ Close();
+    // As auto-compaction is off, we shouldn't see much reduction
+    // in db size.
+ ASSERT_LT(db_size[0] / 3, db_size[1]);
+
+    // round 3 --- reopen db with auto_compaction on and see if
+    // deletion compensation still works.
+ options.disable_auto_compactions = false;
+ Reopen(options);
+    // insert a relatively small amount of data to trigger auto compaction.
+ for (int k = 0; k < kTestSize / 10; ++k) {
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ db_size[2] = Size(Key(0), Key(kTestSize - 1));
+ // this time we're expecting significant drop in size.
+ ASSERT_GT(db_size[0] / 3, db_size[2]);
+ }
+}
+
+TEST_F(DBCompactionTest, DisableStatsUpdateReopen) {
+ uint64_t db_size[3];
+ for (int test = 0; test < 2; ++test) {
+ Options options = DeletionTriggerOptions(CurrentOptions());
+ options.skip_stats_update_on_db_open = (test == 0);
+
+ env_->random_read_counter_.Reset();
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ // round 1 --- insert key/value pairs.
+ const int kTestSize = kCDTKeysPerBuffer * 512;
+ std::vector<std::string> values;
+ for (int k = 0; k < kTestSize; ++k) {
+ values.push_back(RandomString(&rnd, kCDTValueSize));
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ db_size[0] = Size(Key(0), Key(kTestSize - 1));
+ Close();
+
+ // round 2 --- disable auto-compactions and issue deletions.
+ options.create_if_missing = false;
+ options.disable_auto_compactions = true;
+
+ env_->random_read_counter_.Reset();
+ Reopen(options);
+
+ for (int k = 0; k < kTestSize; ++k) {
+ ASSERT_OK(Delete(Key(k)));
+ }
+ db_size[1] = Size(Key(0), Key(kTestSize - 1));
+ Close();
+    // As auto-compaction is off, we shouldn't see much reduction
+    // in db size.
+ ASSERT_LT(db_size[0] / 3, db_size[1]);
+
+    // round 3 --- reopen db with auto_compaction on and see if
+    // deletion compensation still works.
+ options.disable_auto_compactions = false;
+ Reopen(options);
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ db_size[2] = Size(Key(0), Key(kTestSize - 1));
+
+ if (options.skip_stats_update_on_db_open) {
+      // If updating stats on DB::Open is disabled, we don't expect
+      // the deletion entries to take effect.
+ ASSERT_LT(db_size[0] / 3, db_size[2]);
+ } else {
+ // Otherwise, we should see a significant drop in db size.
+ ASSERT_GT(db_size[0] / 3, db_size[2]);
+ }
+ }
+}
+
+
+TEST_P(DBCompactionTestWithParam, CompactionTrigger) {
+ const int kNumKeysPerFile = 100;
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.num_levels = 3;
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_subcompactions = max_subcompactions_;
+ options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+ num++) {
+ std::vector<std::string> values;
+ // Write 100KB (100 values, each 1K)
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ values.push_back(RandomString(&rnd, 990));
+ ASSERT_OK(Put(1, Key(i), values[i]));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(1, "", ""));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1);
+ }
+
+  // Generate one more file in level-0, which should trigger a level-0
+  // compaction.
+ std::vector<std::string> values;
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ values.push_back(RandomString(&rnd, 990));
+ ASSERT_OK(Put(1, Key(i), values[i]));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(1, "", ""));
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 1), 1);
+}
+
+TEST_F(DBCompactionTest, BGCompactionsAllowed) {
+  // Create several column families. Make compactions trigger in all of them
+  // and verify that the number of compactions scheduled stays within the
+  // allowed limit.
+ const int kNumKeysPerFile = 100;
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.num_levels = 3;
+ // Should speed up compaction when there are 4 files.
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 20;
+ options.soft_pending_compaction_bytes_limit = 1 << 30; // Infinitely large
+ options.max_background_compactions = 3;
+ options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile));
+
+ // Block all threads in thread pool.
+ const size_t kTotalTasks = 4;
+ env_->SetBackgroundThreads(4, Env::LOW);
+ test::SleepingBackgroundTask sleeping_tasks[kTotalTasks];
+ for (size_t i = 0; i < kTotalTasks; i++) {
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_tasks[i], Env::Priority::LOW);
+ sleeping_tasks[i].WaitUntilSleeping();
+ }
+
+ CreateAndReopenWithCF({"one", "two", "three"}, options);
+
+ Random rnd(301);
+ for (int cf = 0; cf < 4; cf++) {
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(cf, Key(i), ""));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(cf, "", ""));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[cf]);
+ ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1);
+ }
+ }
+
+  // Now all column families qualify for compaction, but only one should be
+  // scheduled because no column family hits the speed-up condition.
+ ASSERT_EQ(1u, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+
+  // Create two more files for one column family, which triggers the speed-up
+  // condition; three compactions will be scheduled.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(2, Key(i), ""));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(2, "", ""));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[2]);
+ ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1,
+ NumTableFilesAtLevel(0, 2));
+ }
+ ASSERT_EQ(3U, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+
+ // Unblock all threads to unblock all compactions.
+ for (size_t i = 0; i < kTotalTasks; i++) {
+ sleeping_tasks[i].WakeUp();
+ sleeping_tasks[i].WaitUntilDone();
+ }
+ dbfull()->TEST_WaitForCompact();
+
+  // Verify that the number of compactions allowed comes back to 1.
+
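+  // Block the thread pool again and re-create the compaction triggers in
+  // all column families.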
+ for (size_t i = 0; i < kTotalTasks; i++) {
+ sleeping_tasks[i].Reset();
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_tasks[i], Env::Priority::LOW);
+ sleeping_tasks[i].WaitUntilSleeping();
+ }
+ for (int cf = 0; cf < 4; cf++) {
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(cf, Key(i), ""));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(cf, "", ""));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[cf]);
+ ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1);
+ }
+ }
+
+  // Now all column families qualify for compaction, but only one should be
+  // scheduled because no column family hits the speed-up condition.
+ ASSERT_EQ(1U, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+
+ for (size_t i = 0; i < kTotalTasks; i++) {
+ sleeping_tasks[i].WakeUp();
+ sleeping_tasks[i].WaitUntilDone();
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, CompactionsGenerateMultipleFiles) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000; // Large write buffer
+ options.max_subcompactions = max_subcompactions_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+
+ // Write 8MB (80 values, each 100K)
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ std::vector<std::string> values;
+ for (int i = 0; i < 80; i++) {
+ values.push_back(RandomString(&rnd, 100000));
+ ASSERT_OK(Put(1, Key(i), values[i]));
+ }
+
+ // Reopening moves updates to level-0
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
+ true /* disallow trivial move */);
+
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_GT(NumTableFilesAtLevel(1, 1), 1);
+ for (int i = 0; i < 80; i++) {
+ ASSERT_EQ(Get(1, Key(i)), values[i]);
+ }
+}
+
+TEST_F(DBCompactionTest, MinorCompactionsHappen) {
+ do {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10000;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const int N = 500;
+
+ int starting_num_tables = TotalTableFiles(1);
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i) + std::string(1000, 'v')));
+ }
+ int ending_num_tables = TotalTableFiles(1);
+ ASSERT_GT(ending_num_tables, starting_num_tables);
+
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i)));
+ }
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i)));
+ }
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBCompactionTest, UserKeyCrossFile1) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.level0_file_num_compaction_trigger = 3;
+
+ DestroyAndReopen(options);
+
+ // create first file and flush to l0
+ Put("4", "A");
+ Put("3", "A");
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+
+ Put("2", "A");
+ Delete("3");
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+
+ // move both files down to l1
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+
+ for (int i = 0; i < 3; i++) {
+ Put("2", "B");
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+}
+
+TEST_F(DBCompactionTest, UserKeyCrossFile2) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.level0_file_num_compaction_trigger = 3;
+
+ DestroyAndReopen(options);
+
+ // create first file and flush to l0
+ Put("4", "A");
+ Put("3", "A");
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+
+ Put("2", "A");
+ SingleDelete("3");
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+
+ // move both files down to l1
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+
+ for (int i = 0; i < 3; i++) {
+ Put("2", "B");
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+}
+
+TEST_F(DBCompactionTest, ZeroSeqIdCompaction) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.level0_file_num_compaction_trigger = 3;
+
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ // compaction options
+ CompactionOptions compact_opt;
+ compact_opt.compression = kNoCompression;
+ compact_opt.output_file_size_limit = 4096;
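+  // Each value is roughly one fifth of the output file size limit, so a few
+  // entries are enough to fill an output file.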
+ const size_t key_len =
+ static_cast<size_t>(compact_opt.output_file_size_limit) / 5;
+
+ DestroyAndReopen(options);
+
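+  // Take a snapshot after every Put so sequence numbers stay pinned
+  // (i.e., cannot be zeroed out) until the snapshots are released below.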
+ std::vector<const Snapshot*> snaps;
+
+ // create first file and flush to l0
+ for (auto& key : {"1", "2", "3", "3", "3", "3"}) {
+ Put(key, std::string(key_len, 'A'));
+ snaps.push_back(dbfull()->GetSnapshot());
+ }
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+
+ // create second file and flush to l0
+ for (auto& key : {"3", "4", "5", "6", "7", "8"}) {
+ Put(key, std::string(key_len, 'A'));
+ snaps.push_back(dbfull()->GetSnapshot());
+ }
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+
+ // move both files down to l1
+ dbfull()->CompactFiles(compact_opt, collector->GetFlushedFiles(), 1);
+
+  // Release the snapshots so that the first instance of key "3" can have
+  // seqId=0.
+ for (auto snap : snaps) {
+ dbfull()->ReleaseSnapshot(snap);
+ }
+
+  // Create 3 files in L0 to trigger compaction.
+ for (int i = 0; i < options.level0_file_num_compaction_trigger; i++) {
+ Put("2", std::string(1, 'A'));
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+
+ dbfull()->TEST_WaitForCompact();
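+  // Sanity check: the DB should still accept writes after the compaction.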
+ ASSERT_OK(Put("", ""));
+}
+
+TEST_F(DBCompactionTest, ManualCompactionUnknownOutputSize) {
+ // github issue #2249
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.level0_file_num_compaction_trigger = 3;
+ DestroyAndReopen(options);
+
+ // create two files in l1 that we can compact
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < options.level0_file_num_compaction_trigger; j++) {
+ // make l0 files' ranges overlap to avoid trivial move
+ Put(std::to_string(2 * i), std::string(1, 'A'));
+ Put(std::to_string(2 * i + 1), std::string(1, 'A'));
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), i + 1);
+ }
+
+ ColumnFamilyMetaData cf_meta;
+ dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta);
+ ASSERT_EQ(2, cf_meta.levels[1].files.size());
+ std::vector<std::string> input_filenames;
+ for (const auto& sst_file : cf_meta.levels[1].files) {
+ input_filenames.push_back(sst_file.name);
+ }
+
+ // note CompactionOptions::output_file_size_limit is unset.
+ CompactionOptions compact_opt;
+ compact_opt.compression = kNoCompression;
+ dbfull()->CompactFiles(compact_opt, input_filenames, 1);
+}
+
+// Check that writes done during a memtable compaction are recovered
+// if the database is shutdown during the memtable compaction.
+TEST_F(DBCompactionTest, RecoverDuringMemtableCompaction) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Trigger a long memtable compaction and reopen the database during it
+ ASSERT_OK(Put(1, "foo", "v1")); // Goes to 1st log file
+ ASSERT_OK(Put(1, "big1", std::string(10000000, 'x'))); // Fills memtable
+ ASSERT_OK(Put(1, "big2", std::string(1000, 'y'))); // Triggers compaction
+ ASSERT_OK(Put(1, "bar", "v2")); // Goes to new log file
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v2", Get(1, "bar"));
+ ASSERT_EQ(std::string(10000000, 'x'), Get(1, "big1"));
+ ASSERT_EQ(std::string(1000, 'y'), Get(1, "big2"));
+ } while (ChangeOptions());
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveOneFile) {
+ int32_t trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000;
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ int32_t num_keys = 80;
+ int32_t value_size = 100 * 1024; // 100 KB
+
+ Random rnd(301);
+ std::vector<std::string> values;
+ for (int i = 0; i < num_keys; i++) {
+ values.push_back(RandomString(&rnd, value_size));
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+
+ // Reopening moves updates to L0
+ Reopen(options);
+ ASSERT_EQ(NumTableFilesAtLevel(0, 0), 1); // 1 file in L0
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); // 0 files in L1
+
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(metadata.size(), 1U);
+ LiveFileMetaData level0_file = metadata[0]; // L0 file meta
+
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+
+ // Compaction will initiate a trivial move from L0 to L1
+ dbfull()->CompactRange(cro, nullptr, nullptr);
+
+  // File moved from L0 to L1
+ ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); // 0 files in L0
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), 1); // 1 file in L1
+
+ metadata.clear();
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(metadata.size(), 1U);
+ ASSERT_EQ(metadata[0].name /* level1_file.name */, level0_file.name);
+ ASSERT_EQ(metadata[0].size /* level1_file.size */, level0_file.size);
+
+ for (int i = 0; i < num_keys; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+
+ ASSERT_EQ(trivial_move, 1);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveNonOverlappingFiles) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.max_subcompactions = max_subcompactions_;
+
+ DestroyAndReopen(options);
+  // Non-overlapping ranges.
+ std::vector<std::pair<int32_t, int32_t>> ranges = {
+ {100, 199},
+ {300, 399},
+ {0, 99},
+ {200, 299},
+ {600, 699},
+ {400, 499},
+ {500, 550},
+ {551, 599},
+ };
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+ for (size_t i = 0; i < ranges.size(); i++) {
+ for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+ values[j] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(j), values[j]));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ int32_t level0_files = NumTableFilesAtLevel(0, 0);
+ ASSERT_EQ(level0_files, ranges.size()); // Multiple files in L0
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); // No files in L1
+
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+
+ // Since data is non-overlapping we expect compaction to initiate
+ // a trivial move
+ db_->CompactRange(cro, nullptr, nullptr);
+ // We expect that all the files were trivially moved from L0 to L1
+ ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0) /* level1_files */, level0_files);
+
+ for (size_t i = 0; i < ranges.size(); i++) {
+ for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+ ASSERT_EQ(Get(Key(j)), values[j]);
+ }
+ }
+
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ trivial_move = 0;
+ non_trivial_move = 0;
+ values.clear();
+ DestroyAndReopen(options);
+ // Same ranges as above but overlapping
+ ranges = {
+ {100, 199},
+ {300, 399},
+ {0, 99},
+ {200, 299},
+ {600, 699},
+ {400, 499},
+      {500, 560},  // this range overlaps with the next one
+ {551, 599},
+ };
+ for (size_t i = 0; i < ranges.size(); i++) {
+ for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+ values[j] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(j), values[j]));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ db_->CompactRange(cro, nullptr, nullptr);
+
+ for (size_t i = 0; i < ranges.size(); i++) {
+ for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+ ASSERT_EQ(Get(Key(j)), values[j]);
+ }
+ }
+ ASSERT_EQ(trivial_move, 0);
+ ASSERT_EQ(non_trivial_move, 1);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveTargetLevel) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.num_levels = 7;
+ options.max_subcompactions = max_subcompactions_;
+
+ DestroyAndReopen(options);
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ // Add 2 non-overlapping files
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+
+ // file 1 [0 => 300]
+ for (int32_t i = 0; i <= 300; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // file 2 [600 => 700]
+ for (int32_t i = 600; i <= 700; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // 2 files in L0
+ ASSERT_EQ("2", FilesPerLevel(0));
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 6;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ // 2 files in L6
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel(0));
+
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ for (int32_t i = 0; i <= 300; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+ for (int32_t i = 600; i <= 700; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, ManualCompactionPartial) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ bool first = true;
+ // Purpose of dependencies:
+ // 4 -> 1: ensure the order of two non-trivial compactions
+ // 5 -> 2 and 5 -> 3: ensure we do a check before two non-trivial compactions
+ // are installed
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBCompaction::ManualPartial:4", "DBCompaction::ManualPartial:1"},
+ {"DBCompaction::ManualPartial:5", "DBCompaction::ManualPartial:2"},
+ {"DBCompaction::ManualPartial:5", "DBCompaction::ManualPartial:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (first) {
+ first = false;
+ TEST_SYNC_POINT("DBCompaction::ManualPartial:4");
+ TEST_SYNC_POINT("DBCompaction::ManualPartial:3");
+ } else { // second non-trivial compaction
+ TEST_SYNC_POINT("DBCompaction::ManualPartial:2");
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.num_levels = 7;
+ options.max_subcompactions = max_subcompactions_;
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_background_compactions = 3;
+ options.target_file_size_base = 1 << 23; // 8 MB
+
+ DestroyAndReopen(options);
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ // Add 2 non-overlapping files
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+
+ // file 1 [0 => 100]
+ for (int32_t i = 0; i < 100; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // file 2 [100 => 300]
+ for (int32_t i = 100; i < 300; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // 2 files in L0
+ ASSERT_EQ("2", FilesPerLevel(0));
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 6;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ // Trivial move the two non-overlapping files to level 6
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ // 2 files in L6
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel(0));
+
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ // file 3 [ 0 => 200]
+ for (int32_t i = 0; i < 200; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+  // 1 file in L0
+ ASSERT_EQ("1,0,0,0,0,0,2", FilesPerLevel(0));
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, false));
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, false));
+ ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr, nullptr, false));
+ ASSERT_OK(dbfull()->TEST_CompactRange(3, nullptr, nullptr, nullptr, false));
+ ASSERT_OK(dbfull()->TEST_CompactRange(4, nullptr, nullptr, nullptr, false));
+ // 2 files in L6, 1 file in L5
+ ASSERT_EQ("0,0,0,0,0,1,2", FilesPerLevel(0));
+
+ ASSERT_EQ(trivial_move, 6);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ ROCKSDB_NAMESPACE::port::Thread threads([&] {
+ compact_options.change_level = false;
+ compact_options.exclusive_manual_compaction = false;
+ std::string begin_string = Key(0);
+ std::string end_string = Key(199);
+ Slice begin(begin_string);
+ Slice end(end_string);
+ // First non-trivial compaction is triggered
+ ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
+ });
+
+ TEST_SYNC_POINT("DBCompaction::ManualPartial:1");
+  // file 4 [300 => 400]
+ for (int32_t i = 300; i <= 400; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+  // file 5 [400 => 500]
+ for (int32_t i = 400; i <= 500; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+  // file 6 [500 => 600]
+ for (int32_t i = 500; i <= 600; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ // Second non-trivial compaction is triggered
+ ASSERT_OK(Flush());
+
+ // Before two non-trivial compactions are installed, there are 3 files in L0
+ ASSERT_EQ("3,0,0,0,0,1,2", FilesPerLevel(0));
+ TEST_SYNC_POINT("DBCompaction::ManualPartial:5");
+
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ // After two non-trivial compactions are installed, there is 1 file in L6, and
+ // 1 file in L1
+ ASSERT_EQ("0,1,0,0,0,0,1", FilesPerLevel(0));
+ threads.join();
+
+ for (int32_t i = 0; i < 600; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+}
+
+// Disabled as the test is flaky.
+TEST_F(DBCompactionTest, DISABLED_ManualPartialFill) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ bool first = true;
+ bool second = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBCompaction::PartialFill:4", "DBCompaction::PartialFill:1"},
+ {"DBCompaction::PartialFill:2", "DBCompaction::PartialFill:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (first) {
+ TEST_SYNC_POINT("DBCompaction::PartialFill:4");
+ first = false;
+ TEST_SYNC_POINT("DBCompaction::PartialFill:3");
+ } else if (second) {
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.max_bytes_for_level_multiplier = 2;
+ options.num_levels = 4;
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_background_compactions = 3;
+
+ DestroyAndReopen(options);
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ // Add 2 non-overlapping files
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+
+ // file 1 [0 => 100]
+ for (int32_t i = 0; i < 100; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // file 2 [100 => 300]
+ for (int32_t i = 100; i < 300; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // 2 files in L0
+ ASSERT_EQ("2", FilesPerLevel(0));
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ // 2 files in L2
+ ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ // file 3 [ 0 => 200]
+ for (int32_t i = 0; i < 200; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // 2 files in L2, 1 in L0
+ ASSERT_EQ("1,0,2", FilesPerLevel(0));
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, false));
+ // 2 files in L2, 1 in L1
+ ASSERT_EQ("0,1,2", FilesPerLevel(0));
+
+ ASSERT_EQ(trivial_move, 2);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ ROCKSDB_NAMESPACE::port::Thread threads([&] {
+ compact_options.change_level = false;
+ compact_options.exclusive_manual_compaction = false;
+ std::string begin_string = Key(0);
+ std::string end_string = Key(199);
+ Slice begin(begin_string);
+ Slice end(end_string);
+ ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
+ });
+
+ TEST_SYNC_POINT("DBCompaction::PartialFill:1");
+  // Many files: keys [300 => 4300)
+ for (int32_t i = 0; i <= 5; i++) {
+ for (int32_t j = 300; j < 4300; j++) {
+ if (j == 2300) {
+ ASSERT_OK(Flush());
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ values[j] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(j), values[j]));
+ }
+ }
+
+ // Verify level sizes
+ uint64_t target_size = 4 * options.max_bytes_for_level_base;
+ for (int32_t i = 1; i < options.num_levels; i++) {
+ ASSERT_LE(SizeAtLevel(i), target_size);
+ target_size = static_cast<uint64_t>(target_size *
+ options.max_bytes_for_level_multiplier);
+ }
+
+ TEST_SYNC_POINT("DBCompaction::PartialFill:2");
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ threads.join();
+
+ for (int32_t i = 0; i < 4300; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+}
+
+TEST_F(DBCompactionTest, ManualCompactionWithUnorderedWrite) {
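+  // Make the manual compaction overlap an in-flight unordered write: the
+  // compaction starts only after the write has appended to the WAL, and the
+  // write applies to the memtable only after the compaction begins waiting
+  // for pending writes.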
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL",
+ "DBCompactionTest::ManualCompactionWithUnorderedWrite:WaitWriteWAL"},
+ {"DBImpl::WaitForPendingWrites:BeforeBlock",
+ "DBImpl::WriteImpl:BeforeUnorderedWriteMemtable"}});
+
+ Options options = CurrentOptions();
+ options.unordered_write = true;
+ DestroyAndReopen(options);
+ Put("foo", "v1");
+ ASSERT_OK(Flush());
+
+ Put("bar", "v1");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ port::Thread writer([&]() { Put("foo", "v2"); });
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::ManualCompactionWithUnorderedWrite:WaitWriteWAL");
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ writer.join();
+ ASSERT_EQ(Get("foo"), "v2");
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ Reopen(options);
+ ASSERT_EQ(Get("foo"), "v2");
+}
+
+TEST_F(DBCompactionTest, DeleteFileRange) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.max_bytes_for_level_multiplier = 2;
+ options.num_levels = 4;
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_background_compactions = 3;
+
+ DestroyAndReopen(options);
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ // Add 2 non-overlapping files
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+
+ // file 1 [0 => 100]
+ for (int32_t i = 0; i < 100; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // file 2 [100 => 300]
+ for (int32_t i = 100; i < 300; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // 2 files in L0
+ ASSERT_EQ("2", FilesPerLevel(0));
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ // 2 files in L2
+ ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+ // file 3 [ 0 => 200]
+ for (int32_t i = 0; i < 200; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+  // Many files: keys [300 => 4300)
+ for (int32_t i = 0; i <= 5; i++) {
+ for (int32_t j = 300; j < 4300; j++) {
+ if (j == 2300) {
+ ASSERT_OK(Flush());
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ values[j] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(j), values[j]));
+ }
+ }
+ ASSERT_OK(Flush());
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+
+ // Verify level sizes
+ uint64_t target_size = 4 * options.max_bytes_for_level_base;
+ for (int32_t i = 1; i < options.num_levels; i++) {
+ ASSERT_LE(SizeAtLevel(i), target_size);
+ target_size = static_cast<uint64_t>(target_size *
+ options.max_bytes_for_level_multiplier);
+ }
+
+ size_t old_num_files = CountFiles();
+ std::string begin_string = Key(1000);
+ std::string end_string = Key(2000);
+ Slice begin(begin_string);
+ Slice end(end_string);
+ ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
+
+ int32_t deleted_count = 0;
+ for (int32_t i = 0; i < 4300; i++) {
+ if (i < 1000 || i > 2000) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ } else {
+ ReadOptions roptions;
+ std::string result;
+ Status s = db_->Get(roptions, Key(i), &result);
+ ASSERT_TRUE(s.IsNotFound() || s.ok());
+ if (s.IsNotFound()) {
+ deleted_count++;
+ }
+ }
+ }
+ ASSERT_GT(deleted_count, 0);
+ begin_string = Key(5000);
+ end_string = Key(6000);
+ Slice begin1(begin_string);
+ Slice end1(end_string);
+  // Try deleting files in a range that contains no keys.
+ ASSERT_OK(
+ DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin1, &end1));
+
+ // Push data from level 0 to level 1 to force all data to be deleted
+ // Note that we don't delete level 0 files
+ compact_options.change_level = true;
+ compact_options.target_level = 1;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_OK(
+ DeleteFilesInRange(db_, db_->DefaultColumnFamily(), nullptr, nullptr));
+
+ int32_t deleted_count2 = 0;
+ for (int32_t i = 0; i < 4300; i++) {
+ ReadOptions roptions;
+ std::string result;
+ Status s = db_->Get(roptions, Key(i), &result);
+ ASSERT_TRUE(s.IsNotFound());
+ deleted_count2++;
+ }
+ ASSERT_GT(deleted_count2, deleted_count);
+ size_t new_num_files = CountFiles();
+ ASSERT_GT(old_num_files, new_num_files);
+}
+
+TEST_F(DBCompactionTest, DeleteFilesInRanges) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.max_bytes_for_level_multiplier = 2;
+ options.num_levels = 4;
+ options.max_background_compactions = 3;
+ options.disable_auto_compactions = true;
+
+ DestroyAndReopen(options);
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+
+ // file [0 => 100), [100 => 200), ... [900, 1000)
+ for (auto i = 0; i < 10; i++) {
+ for (auto j = 0; j < 100; j++) {
+ auto k = i * 100 + j;
+ values[k] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ("10", FilesPerLevel(0));
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,0,10", FilesPerLevel(0));
+
+ // file [0 => 100), [200 => 300), ... [800, 900)
+  for (auto i = 0; i < 10; i += 2) {
+ for (auto j = 0; j < 100; j++) {
+ auto k = i * 100 + j;
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ("5,0,10", FilesPerLevel(0));
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+ ASSERT_EQ("0,5,10", FilesPerLevel(0));
+
+ // Delete files in range [0, 299] (inclusive)
+ {
+ auto begin_str1 = Key(0), end_str1 = Key(100);
+ auto begin_str2 = Key(100), end_str2 = Key(200);
+ auto begin_str3 = Key(200), end_str3 = Key(299);
+ Slice begin1(begin_str1), end1(end_str1);
+ Slice begin2(begin_str2), end2(end_str2);
+ Slice begin3(begin_str3), end3(end_str3);
+ std::vector<RangePtr> ranges;
+ ranges.push_back(RangePtr(&begin1, &end1));
+ ranges.push_back(RangePtr(&begin2, &end2));
+ ranges.push_back(RangePtr(&begin3, &end3));
+ ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(),
+ ranges.data(), ranges.size()));
+ ASSERT_EQ("0,3,7", FilesPerLevel(0));
+
+ // Keys [0, 300) should not exist.
+ for (auto i = 0; i < 300; i++) {
+ ReadOptions ropts;
+ std::string result;
+ auto s = db_->Get(ropts, Key(i), &result);
+ ASSERT_TRUE(s.IsNotFound());
+ }
+ for (auto i = 300; i < 1000; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+ }
+
+ // Delete files in range [600, 999) (exclusive)
+ {
+ auto begin_str1 = Key(600), end_str1 = Key(800);
+ auto begin_str2 = Key(700), end_str2 = Key(900);
+ auto begin_str3 = Key(800), end_str3 = Key(999);
+ Slice begin1(begin_str1), end1(end_str1);
+ Slice begin2(begin_str2), end2(end_str2);
+ Slice begin3(begin_str3), end3(end_str3);
+ std::vector<RangePtr> ranges;
+ ranges.push_back(RangePtr(&begin1, &end1));
+ ranges.push_back(RangePtr(&begin2, &end2));
+ ranges.push_back(RangePtr(&begin3, &end3));
+ ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(),
+ ranges.data(), ranges.size(), false));
+ ASSERT_EQ("0,1,4", FilesPerLevel(0));
+
+ // Keys [600, 900) should not exist.
+ for (auto i = 600; i < 900; i++) {
+ ReadOptions ropts;
+ std::string result;
+ auto s = db_->Get(ropts, Key(i), &result);
+ ASSERT_TRUE(s.IsNotFound());
+ }
+ for (auto i = 300; i < 600; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+ for (auto i = 900; i < 1000; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+ }
+
+ // Delete all files.
+ {
+ RangePtr range;
+ ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(), &range, 1));
+ ASSERT_EQ("", FilesPerLevel(0));
+
+ for (auto i = 0; i < 1000; i++) {
+ ReadOptions ropts;
+ std::string result;
+ auto s = db_->Get(ropts, Key(i), &result);
+ ASSERT_TRUE(s.IsNotFound());
+ }
+ }
+}
+
+TEST_F(DBCompactionTest, DeleteFileRangeFileEndpointsOverlapBug) {
+  // Regression test for #2833: groups of files whose user-keys overlap at the
+  // endpoints could be split by `DeleteFilesInRange`. This caused old data to
+  // reappear, either because a new version of the key was removed, or a range
+  // deletion was partially dropped. It could also cause the non-overlapping
+  // invariant to be violated if the files dropped by DeleteFilesInRange were
+  // a subset of the files that a range deletion spans.
+ const int kNumL0Files = 2;
+ const int kValSize = 8 << 10; // 8KB
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.target_file_size_base = 1 << 10; // 1KB
+ DestroyAndReopen(options);
+
+ // The snapshot prevents key 1 from having its old version dropped. The low
+ // `target_file_size_base` ensures two keys will be in each output file.
+ const Snapshot* snapshot = nullptr;
+ Random rnd(301);
+ // The value indicates which flush the key belonged to, which is enough
+ // for us to determine the keys' relative ages. After L0 flushes finish,
+ // files look like:
+ //
+ // File 0: 0 -> vals[0], 1 -> vals[0]
+ // File 1: 1 -> vals[1], 2 -> vals[1]
+ //
+ // Then L0->L1 compaction happens, which outputs keys as follows:
+ //
+ // File 0: 0 -> vals[0], 1 -> vals[1]
+ // File 1: 1 -> vals[0], 2 -> vals[1]
+ //
+ // DeleteFilesInRange shouldn't be allowed to drop just file 0, as that
+ // would cause `1 -> vals[0]` (an older key) to reappear.
+ std::string vals[kNumL0Files];
+ for (int i = 0; i < kNumL0Files; ++i) {
+ vals[i] = RandomString(&rnd, kValSize);
+ Put(Key(i), vals[i]);
+ Put(Key(i + 1), vals[i]);
+ Flush();
+ if (i == 0) {
+ snapshot = db_->GetSnapshot();
+ }
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ // Verify `DeleteFilesInRange` can't drop only file 0 which would cause
+ // "1 -> vals[0]" to reappear.
+ std::string begin_str = Key(0), end_str = Key(1);
+ Slice begin = begin_str, end = end_str;
+ ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
+ ASSERT_EQ(vals[1], Get(Key(1)));
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveToLastLevelWithFiles) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000;
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ Random rnd(301);
+ std::vector<std::string> values;
+ // File with keys [ 0 => 99 ]
+ for (int i = 0; i < 100; i++) {
+ values.push_back(RandomString(&rnd, value_size));
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("1", FilesPerLevel(0));
+ // Compaction will do L0=>L1 (trivial move) then move L1 files to L3
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 3;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ // File with keys [ 100 => 199 ]
+ for (int i = 100; i < 200; i++) {
+ values.push_back(RandomString(&rnd, value_size));
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("1,0,0,1", FilesPerLevel(0));
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+ // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves)
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,2", FilesPerLevel(0));
+ ASSERT_EQ(trivial_move, 4);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ for (int i = 0; i < 200; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, LevelCompactionThirdPath) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 500 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024);
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 4;
+ options.max_bytes_for_level_base = 400 * 1024;
+ options.max_subcompactions = max_subcompactions_;
+ // options = CurrentOptions(options);
+
+ std::vector<std::string> filenames;
+ env_->GetChildren(options.db_paths[1].path, &filenames);
+ // Delete archival files.
+ for (size_t i = 0; i < filenames.size(); ++i) {
+ env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]);
+ }
+ env_->DeleteDir(options.db_paths[1].path);
+ Reopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+  // The first three 110KB files do not go to the second path.
+  // After that, (100K, 200K)
+ for (int num = 0; num < 3; num++) {
+ GenerateNewFile(&rnd, &key_idx);
+ }
+
+  // Another 110KB triggers a compaction to a 400K file to fill up the
+  // first path.
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(3, GetSstFileCount(options.db_paths[1].path));
+
+ // (1, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4", FilesPerLevel(0));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 1)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,1", FilesPerLevel(0));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 2)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,2", FilesPerLevel(0));
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 3)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,3", FilesPerLevel(0));
+ ASSERT_EQ(3, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,4", FilesPerLevel(0));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 5)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,5", FilesPerLevel(0));
+ ASSERT_EQ(5, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 6)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,6", FilesPerLevel(0));
+ ASSERT_EQ(6, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 7)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,7", FilesPerLevel(0));
+ ASSERT_EQ(7, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,8", FilesPerLevel(0));
+ ASSERT_EQ(8, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Reopen(options);
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Destroy(options);
+}
+
+TEST_P(DBCompactionTestWithParam, LevelCompactionPathUse) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 500 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024);
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 4;
+ options.max_bytes_for_level_base = 400 * 1024;
+ options.max_subcompactions = max_subcompactions_;
+ // options = CurrentOptions(options);
+
+ std::vector<std::string> filenames;
+ env_->GetChildren(options.db_paths[1].path, &filenames);
+ // Delete archival files.
+ for (size_t i = 0; i < filenames.size(); ++i) {
+ env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]);
+ }
+ env_->DeleteDir(options.db_paths[1].path);
+ Reopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+  // Always gets compacted into 1 level-1 file,
+  // plus 0 or 1 level-0 files.
+ for (int num = 0; num < 3; num++) {
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ }
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,1", FilesPerLevel(0));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Reopen(options);
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Destroy(options);
+}
+
+TEST_P(DBCompactionTestWithParam, LevelCompactionCFPathUse) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 500 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024);
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 4;
+ options.max_bytes_for_level_base = 400 * 1024;
+ options.max_subcompactions = max_subcompactions_;
+
+ std::vector<Options> option_vector;
+ option_vector.emplace_back(options);
+ ColumnFamilyOptions cf_opt1(options), cf_opt2(options);
+ // Configure CF1 specific paths.
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1", 500 * 1024);
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_2", 4 * 1024 * 1024);
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_3", 1024 * 1024 * 1024);
+ option_vector.emplace_back(DBOptions(options), cf_opt1);
+  CreateColumnFamilies({"one"}, option_vector[1]);
+
+  // Configure CF2 specific paths.
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2", 500 * 1024);
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_2", 4 * 1024 * 1024);
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_3", 1024 * 1024 * 1024);
+ option_vector.emplace_back(DBOptions(options), cf_opt2);
+  CreateColumnFamilies({"two"}, option_vector[2]);
+
+ ReopenWithColumnFamilies({"default", "one", "two"}, option_vector);
+
+ Random rnd(301);
+ int key_idx = 0;
+ int key_idx1 = 0;
+ int key_idx2 = 0;
+
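+  // Write one new SST file in each of the three column families.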
+ auto generate_file = [&]() {
+ GenerateNewFile(0, &rnd, &key_idx);
+ GenerateNewFile(1, &rnd, &key_idx1);
+ GenerateNewFile(2, &rnd, &key_idx2);
+ };
+
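+  // The DB paths and both column families' cf_paths should hold the same
+  // number of SST files at the given path index.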
+ auto check_sstfilecount = [&](int path_id, int expected) {
+ ASSERT_EQ(expected, GetSstFileCount(options.db_paths[path_id].path));
+ ASSERT_EQ(expected, GetSstFileCount(cf_opt1.cf_paths[path_id].path));
+ ASSERT_EQ(expected, GetSstFileCount(cf_opt2.cf_paths[path_id].path));
+ };
+
+ auto check_filesperlevel = [&](const std::string& expected) {
+ ASSERT_EQ(expected, FilesPerLevel(0));
+ ASSERT_EQ(expected, FilesPerLevel(1));
+ ASSERT_EQ(expected, FilesPerLevel(2));
+ };
+
+ auto check_getvalues = [&]() {
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(0, Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ for (int i = 0; i < key_idx1; i++) {
+ auto v = Get(1, Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ for (int i = 0; i < key_idx2; i++) {
+ auto v = Get(2, Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+ };
+
+  // Check that the default column family uses db_paths,
+  // and column family "one" uses cf_paths.
+
+  // The first three 110KB files do not go to the second path.
+  // After that, (100K, 200K)
+ for (int num = 0; num < 3; num++) {
+ generate_file();
+ }
+
+  // Another 110KB triggers a compaction to a 400K file to fill up the
+  // first path.
+ generate_file();
+ check_sstfilecount(1, 3);
+
+ // (1, 4)
+ generate_file();
+ check_filesperlevel("1,4");
+ check_sstfilecount(1, 4);
+ check_sstfilecount(0, 1);
+
+ // (1, 4, 1)
+ generate_file();
+ check_filesperlevel("1,4,1");
+ check_sstfilecount(2, 1);
+ check_sstfilecount(1, 4);
+ check_sstfilecount(0, 1);
+
+ // (1, 4, 2)
+ generate_file();
+ check_filesperlevel("1,4,2");
+ check_sstfilecount(2, 2);
+ check_sstfilecount(1, 4);
+ check_sstfilecount(0, 1);
+
+ check_getvalues();
+
+ ReopenWithColumnFamilies({"default", "one", "two"}, option_vector);
+
+ check_getvalues();
+
+ Destroy(options, true);
+}
+
+TEST_P(DBCompactionTestWithParam, ConvertCompactionStyle) {
+ Random rnd(301);
+ int max_key_level_insert = 200;
+ int max_key_universal_insert = 600;
+
+ // Stage 1: generate a db with level compaction
+ Options options = CurrentOptions();
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.num_levels = 4;
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_bytes_for_level_base = 500 << 10; // 500KB
+ options.max_bytes_for_level_multiplier = 1;
+ options.target_file_size_base = 200 << 10; // 200KB
+ options.target_file_size_multiplier = 1;
+ options.max_subcompactions = max_subcompactions_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ for (int i = 0; i <= max_key_level_insert; i++) {
+ // each value is 10K
+ ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
+ }
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_GT(TotalTableFiles(1, 4), 1);
+ int non_level0_num_files = 0;
+ for (int i = 1; i < options.num_levels; i++) {
+ non_level0_num_files += NumTableFilesAtLevel(i, 1);
+ }
+ ASSERT_GT(non_level0_num_files, 0);
+
+ // Stage 2: reopen with universal compaction - should fail
+ options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ options = CurrentOptions(options);
+ Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_TRUE(s.IsInvalidArgument());
+
+ // Stage 3: compact into a single file and move the file to level 0
+ options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = INT_MAX;
+ options.target_file_size_multiplier = 1;
+ options.max_bytes_for_level_base = INT_MAX;
+ options.max_bytes_for_level_multiplier = 1;
+ options.num_levels = 4;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 0;
+  // Cannot use kForceOptimized here because this compaction is expected
+  // to generate one output file.
+ compact_options.bottommost_level_compaction =
+ BottommostLevelCompaction::kForce;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr);
+
+ // Only 1 file in L0
+ ASSERT_EQ("1", FilesPerLevel(1));
+
+ // Stage 4: re-open in universal compaction style and do some db operations
+ options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 4;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 3;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ options.num_levels = 1;
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ for (int i = max_key_level_insert / 2; i <= max_key_universal_insert; i++) {
+ ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
+ }
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+
+ for (int i = 1; i < options.num_levels; i++) {
+ ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0);
+ }
+
+ // verify keys inserted in both level compaction style and universal
+ // compaction style
+ std::string keys_in_db;
+ Iterator* iter = dbfull()->NewIterator(ReadOptions(), handles_[1]);
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ keys_in_db.append(iter->key().ToString());
+ keys_in_db.push_back(',');
+ }
+ delete iter;
+
+ std::string expected_keys;
+ for (int i = 0; i <= max_key_universal_insert; i++) {
+ expected_keys.append(Key(i));
+ expected_keys.push_back(',');
+ }
+
+ ASSERT_EQ(keys_in_db, expected_keys);
+}
+
+TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_a) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "b", "v"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Delete(1, "b"));
+ ASSERT_OK(Delete(1, "a"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Delete(1, "a"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "a", "v"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("(a->v)", Contents(1));
+ env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
+ ASSERT_EQ("(a->v)", Contents(1));
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_b) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ Put(1, "", "");
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ Delete(1, "e");
+ Put(1, "", "");
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ Put(1, "c", "cv");
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ Put(1, "", "");
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ Put(1, "", "");
+ env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ Put(1, "d", "dv");
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ Put(1, "", "");
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ Delete(1, "d");
+ Delete(1, "b");
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("(->)(c->cv)", Contents(1));
+ env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
+ ASSERT_EQ("(->)(c->cv)", Contents(1));
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBCompactionTest, ManualAutoRace) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BGWorkCompaction", "DBCompactionTest::ManualAutoRace:1"},
+ {"DBImpl::RunManualCompaction:WaitScheduled",
+ "BackgroundCallCompaction:0"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Put(1, "foo", "");
+ Put(1, "bar", "");
+ Flush(1);
+ Put(1, "foo", "");
+ Put(1, "bar", "");
+ // Generate four files in CF 0, which should trigger an auto compaction
+ Put("foo", "");
+ Put("bar", "");
+ Flush();
+ Put("foo", "");
+ Put("bar", "");
+ Flush();
+ Put("foo", "");
+ Put("bar", "");
+ Flush();
+ Put("foo", "");
+ Put("bar", "");
+ Flush();
+
+ // The auto compaction is scheduled but has been waiting until this point.
+ TEST_SYNC_POINT("DBCompactionTest::ManualAutoRace:1");
+ // The auto compaction will wait until the manual compaction is registered
+ // before processing, so that it will be cancelled.
+ dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr);
+ ASSERT_EQ("0,1", FilesPerLevel(1));
+
+ // Eventually the cancelled compaction will be rescheduled and executed.
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, ManualCompaction) {
+ Options options = CurrentOptions();
+ options.max_subcompactions = max_subcompactions_;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // iter - 0 with 7 levels
+ // iter - 1 with 3 levels
+ for (int iter = 0; iter < 2; ++iter) {
+ MakeTables(3, "p", "q", 1);
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+ // Compaction range falls before files
+ Compact(1, "", "c");
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+ // Compaction range falls after files
+ Compact(1, "r", "z");
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+ // Compaction range overlaps files
+ Compact(1, "p1", "p9");
+ ASSERT_EQ("0,0,1", FilesPerLevel(1));
+
+ // Populate a different range
+ MakeTables(3, "c", "e", 1);
+ ASSERT_EQ("1,1,2", FilesPerLevel(1));
+
+ // Compact just the new range
+ Compact(1, "b", "f");
+ ASSERT_EQ("0,0,2", FilesPerLevel(1));
+
+ // Compact all
+ MakeTables(1, "a", "z", 1);
+ ASSERT_EQ("1,0,2", FilesPerLevel(1));
+
+ uint64_t prev_block_cache_add =
+ options.statistics->getTickerCount(BLOCK_CACHE_ADD);
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+ db_->CompactRange(cro, handles_[1], nullptr, nullptr);
+ // Verify manual compaction doesn't fill block cache
+ ASSERT_EQ(prev_block_cache_add,
+ options.statistics->getTickerCount(BLOCK_CACHE_ADD));
+
+ ASSERT_EQ("0,0,1", FilesPerLevel(1));
+
+ if (iter == 0) {
+ options = CurrentOptions();
+ options.num_levels = 3;
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ }
+ }
+}
+
+
+TEST_P(DBCompactionTestWithParam, ManualLevelCompactionOutputPathId) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760);
+ options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760);
+ options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760);
+ options.max_subcompactions = max_subcompactions_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // iter - 0 with 7 levels
+ // iter - 1 with 3 levels
+ for (int iter = 0; iter < 2; ++iter) {
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put(1, "p", "begin"));
+ ASSERT_OK(Put(1, "q", "end"));
+ ASSERT_OK(Flush(1));
+ }
+ ASSERT_EQ("3", FilesPerLevel(1));
+ ASSERT_EQ(3, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // Compaction range falls before files
+ Compact(1, "", "c");
+ ASSERT_EQ("3", FilesPerLevel(1));
+
+ // Compaction range falls after files
+ Compact(1, "r", "z");
+ ASSERT_EQ("3", FilesPerLevel(1));
+
+ // Compaction range overlaps files
+ Compact(1, "p1", "p9", 1);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,1", FilesPerLevel(1));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // Populate a different range
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put(1, "c", "begin"));
+ ASSERT_OK(Put(1, "e", "end"));
+ ASSERT_OK(Flush(1));
+ }
+ ASSERT_EQ("3,1", FilesPerLevel(1));
+
+ // Compact just the new range
+ Compact(1, "b", "f", 1);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,2", FilesPerLevel(1));
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // Compact all
+ ASSERT_OK(Put(1, "a", "begin"));
+ ASSERT_OK(Put(1, "z", "end"));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ("1,2", FilesPerLevel(1));
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+ CompactRangeOptions compact_options;
+ compact_options.target_path_id = 1;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ("0,1", FilesPerLevel(1));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ if (iter == 0) {
+ DestroyAndReopen(options);
+ options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760);
+ options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760);
+ options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760);
+ options.max_background_flushes = 1;
+ options.num_levels = 3;
+ options.create_if_missing = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ }
+ }
+}
+
+TEST_F(DBCompactionTest, FilesDeletedAfterCompaction) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v2"));
+ Compact(1, "a", "z");
+ const size_t num_files = CountLiveFiles();
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(1, "foo", "v2"));
+ Compact(1, "a", "z");
+ }
+ ASSERT_EQ(CountLiveFiles(), num_files);
+ } while (ChangeCompactOptions());
+}
+
+ // Check level compaction with CompactFiles()
+TEST_P(DBCompactionTestWithParam, DISABLED_CompactFilesOnLevelCompaction) {
+ const int kTestKeySize = 16;
+ const int kTestValueSize = 984;
+ const int kEntrySize = kTestKeySize + kTestValueSize;
+ const int kEntriesPerBuffer = 100;
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+ options.compaction_style = kCompactionStyleLevel;
+ options.target_file_size_base = options.write_buffer_size;
+ options.max_bytes_for_level_base = options.target_file_size_base * 2;
+ options.level0_stop_writes_trigger = 2;
+ options.max_bytes_for_level_multiplier = 2;
+ options.compression = kNoCompression;
+ options.max_subcompactions = max_subcompactions_;
+ options = CurrentOptions(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+ for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) {
+ ASSERT_OK(Put(1, ToString(key), RandomString(&rnd, kTestValueSize)));
+ }
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ dbfull()->TEST_WaitForCompact();
+
+ ColumnFamilyMetaData cf_meta;
+ dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+ int output_level = static_cast<int>(cf_meta.levels.size()) - 1;
+ for (int file_picked = 5; file_picked > 0; --file_picked) {
+ std::set<std::string> overlapping_file_names;
+ std::vector<std::string> compaction_input_file_names;
+ for (int f = 0; f < file_picked; ++f) {
+ int level = 0;
+ auto file_meta = PickFileRandomly(cf_meta, &rnd, &level);
+ compaction_input_file_names.push_back(file_meta->name);
+ GetOverlappingFileNumbersForLevelCompaction(
+ cf_meta, options.comparator, level, output_level,
+ file_meta, &overlapping_file_names);
+ }
+
+ ASSERT_OK(dbfull()->CompactFiles(
+ CompactionOptions(), handles_[1],
+ compaction_input_file_names,
+ output_level));
+
+ // Make sure all overlapping files do not exist after compaction
+ dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+ VerifyCompactionResult(cf_meta, overlapping_file_names);
+ }
+
+ // make sure all key-values are still there.
+ for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) {
+ ASSERT_NE(Get(1, ToString(key)), "NOT_FOUND");
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, PartialCompactionFailure) {
+ Options options;
+ const int kKeySize = 16;
+ const int kKvSize = 1000;
+ const int kKeysPerBuffer = 100;
+ const int kNumL1Files = 5;
+ options.create_if_missing = true;
+ options.write_buffer_size = kKeysPerBuffer * kKvSize;
+ options.max_write_buffer_number = 2;
+ options.target_file_size_base =
+ options.write_buffer_size *
+ (options.max_write_buffer_number - 1);
+ options.level0_file_num_compaction_trigger = kNumL1Files;
+ options.max_bytes_for_level_base =
+ options.level0_file_num_compaction_trigger *
+ options.target_file_size_base;
+ options.max_bytes_for_level_multiplier = 2;
+ options.compression = kNoCompression;
+ options.max_subcompactions = max_subcompactions_;
+
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ env_->SetBackgroundThreads(1, Env::LOW);
+ // stop the compaction thread until we simulate the file creation failure.
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ options.env = env_;
+
+ DestroyAndReopen(options);
+
+ const int kNumInsertedKeys =
+ options.level0_file_num_compaction_trigger *
+ (options.max_write_buffer_number - 1) *
+ kKeysPerBuffer;
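+ // = 5 * 1 * 100 = 500 keys, i.e. roughly level0_file_num_compaction_trigger
+ // memtables' worth of data, enough to reach the L0 compaction trigger.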
+
+ Random rnd(301);
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+ for (int k = 0; k < kNumInsertedKeys; ++k) {
+ keys.emplace_back(RandomString(&rnd, kKeySize));
+ values.emplace_back(RandomString(&rnd, kKvSize - kKeySize));
+ ASSERT_OK(Put(Slice(keys[k]), Slice(values[k])));
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+
+ dbfull()->TEST_FlushMemTable(true);
+ // Make sure the number of L0 files can trigger compaction.
+ ASSERT_GE(NumTableFilesAtLevel(0),
+ options.level0_file_num_compaction_trigger);
+
+ auto previous_num_level0_files = NumTableFilesAtLevel(0);
+
+ // Fail the first file creation.
+ env_->non_writable_count_ = 1;
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+
+ // Expect compaction to fail here as one file will fail its
+ // creation.
+ ASSERT_TRUE(!dbfull()->TEST_WaitForCompact().ok());
+
+ // Verify L0 -> L1 compaction does fail.
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+
+ // Verify all L0 files are still there.
+ ASSERT_EQ(NumTableFilesAtLevel(0), previous_num_level0_files);
+
+ // All key-values must exist after compaction fails.
+ for (int k = 0; k < kNumInsertedKeys; ++k) {
+ ASSERT_EQ(values[k], Get(keys[k]));
+ }
+
+ env_->non_writable_count_ = 0;
+
+ // Make sure RocksDB will not get into corrupted state.
+ Reopen(options);
+
+ // Verify again after reopen.
+ for (int k = 0; k < kNumInsertedKeys; ++k) {
+ ASSERT_EQ(values[k], Get(keys[k]));
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, DeleteMovedFileAfterCompaction) {
+ // iter 1 -- delete_obsolete_files_period_micros == 0
+ for (int iter = 0; iter < 2; ++iter) {
+ // This test triggers move compaction and verifies that the file is not
+ // deleted when it's part of move compaction
+ Options options = CurrentOptions();
+ options.env = env_;
+ if (iter == 1) {
+ options.delete_obsolete_files_period_micros = 0;
+ }
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger =
+ 2; // trigger compaction when we have 2 files
+ OnFileDeletionListener* listener = new OnFileDeletionListener();
+ options.listeners.emplace_back(listener);
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ // Create two 1MB sst files
+ for (int i = 0; i < 2; ++i) {
+ // Create 1MB sst file
+ for (int j = 0; j < 100; ++j) {
+ ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ // this should execute L0->L1
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+
+ // block compactions
+ test::SleepingBackgroundTask sleeping_task;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+ Env::Priority::LOW);
+
+ options.max_bytes_for_level_base = 1024 * 1024; // 1 MB
+ Reopen(options);
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ // let compactions go
+ sleeping_task.WakeUp();
+ sleeping_task.WaitUntilDone();
+
+ // this should execute L1->L2 (move)
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(metadata.size(), 1U);
+ auto moved_file_name = metadata[0].name;
+
+ // Create two more 1MB sst files
+ for (int i = 0; i < 2; ++i) {
+ // Create 1MB sst file
+ for (int j = 0; j < 100; ++j) {
+ ASSERT_OK(Put(Key(i * 50 + j + 100), RandomString(&rnd, 10 * 1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ // this should execute both L0->L1 and L1->L2 (merge with previous file)
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+ // iterator is holding the file
+ ASSERT_OK(env_->FileExists(dbname_ + moved_file_name));
+
+ listener->SetExpectedFileName(dbname_ + moved_file_name);
+ iterator.reset();
+
+ // this file should have been compacted away
+ ASSERT_NOK(env_->FileExists(dbname_ + moved_file_name));
+ listener->VerifyMatchedCount(1);
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, CompressLevelCompaction) {
+ if (!Zlib_Supported()) {
+ return;
+ }
+ Options options = CurrentOptions();
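+ // SpecialSkipListFactory makes the memtable signal a flush after this many
+ // entries, so each GenerateNewFile() call below is expected to produce one
+ // new file.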
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 4;
+ options.max_bytes_for_level_base = 400 * 1024;
+ options.max_subcompactions = max_subcompactions_;
+ // The first two levels have no compression, so a trivial move between
+ // them is allowed. Level 2 uses Zlib compression, so a trivial move into
+ // the compressed levels is not allowed.
+ options.compression_per_level = {kNoCompression, kNoCompression,
+ kZlibCompression};
+ int matches = 0, didnt_match = 0, trivial_move = 0, non_trivial = 0;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Compaction::InputCompressionMatchesOutput:Matches",
+ [&](void* /*arg*/) { matches++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Compaction::InputCompressionMatchesOutput:DidntMatch",
+ [&](void* /*arg*/) { didnt_match++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Reopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // First three 110KB files are going to level 0
+ // After that, (100K, 200K)
+ for (int num = 0; num < 3; num++) {
+ GenerateNewFile(&rnd, &key_idx);
+ }
+
+ // Another 110KB triggers a compaction to 400K file to fill up level 0
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(4, GetSstFileCount(dbname_));
+
+ // (1, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4", FilesPerLevel(0));
+
+ // (1, 4, 1)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,1", FilesPerLevel(0));
+
+ // (1, 4, 2)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,2", FilesPerLevel(0));
+
+ // (1, 4, 3)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,3", FilesPerLevel(0));
+
+ // (1, 4, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,4", FilesPerLevel(0));
+
+ // (1, 4, 5)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,5", FilesPerLevel(0));
+
+ // (1, 4, 6)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,6", FilesPerLevel(0));
+
+ // (1, 4, 7)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,7", FilesPerLevel(0));
+
+ // (1, 4, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,8", FilesPerLevel(0));
+
+ ASSERT_EQ(matches, 12);
+ // Currently, the test relies on the number of calls to
+ // InputCompressionMatchesOutput() per compaction.
+ const int kCallsToInputCompressionMatch = 2;
+ ASSERT_EQ(didnt_match, 8 * kCallsToInputCompressionMatch);
+ ASSERT_EQ(trivial_move, 12);
+ ASSERT_EQ(non_trivial, 8);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Reopen(options);
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Destroy(options);
+}
+
+TEST_F(DBCompactionTest, SanitizeCompactionOptionsTest) {
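+ // A soft pending-compaction bytes limit of 0, or one larger than the hard
+ // limit, is expected to be sanitized to the hard limit, as asserted below.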
+ Options options = CurrentOptions();
+ options.max_background_compactions = 5;
+ options.soft_pending_compaction_bytes_limit = 0;
+ options.hard_pending_compaction_bytes_limit = 100;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ ASSERT_EQ(100, db_->GetOptions().soft_pending_compaction_bytes_limit);
+
+ options.max_background_compactions = 3;
+ options.soft_pending_compaction_bytes_limit = 200;
+ options.hard_pending_compaction_bytes_limit = 150;
+ DestroyAndReopen(options);
+ ASSERT_EQ(150, db_->GetOptions().soft_pending_compaction_bytes_limit);
+}
+
+ // This tests for a bug that could cause two level-0 compactions to run
+ // concurrently.
+// TODO(aekmekji): Make sure that the reason this fails when run with
+// max_subcompactions > 1 is not a correctness issue but just inherent to
+// running parallel L0-L1 compactions
+TEST_F(DBCompactionTest, SuggestCompactRangeNoTwoLevel0Compactions) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 110 << 10;
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 4;
+ options.num_levels = 4;
+ options.compression = kNoCompression;
+ options.max_bytes_for_level_base = 450 << 10;
+ options.target_file_size_base = 98 << 10;
+ options.max_write_buffer_number = 2;
+ options.max_background_compactions = 2;
+
+ DestroyAndReopen(options);
+
+ // fill up the DB
+ Random rnd(301);
+ for (int num = 0; num < 10; num++) {
+ GenerateNewRandomFile(&rnd);
+ }
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"CompactionJob::Run():Start",
+ "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:1"},
+ {"DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:2",
+ "CompactionJob::Run():End"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // trigger L0 compaction
+ for (int num = 0; num < options.level0_file_num_compaction_trigger + 1;
+ num++) {
+ GenerateNewRandomFile(&rnd, /* nowait */ true);
+ ASSERT_OK(Flush());
+ }
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:1");
+
+ GenerateNewRandomFile(&rnd, /* nowait */ true);
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr));
+ for (int num = 0; num < options.level0_file_num_compaction_trigger + 1;
+ num++) {
+ GenerateNewRandomFile(&rnd, /* nowait */ true);
+ ASSERT_OK(Flush());
+ }
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:2");
+ dbfull()->TEST_WaitForCompact();
+}
+
+static std::string ShortKey(int i) {
+ assert(i < 10000);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "key%04d", i);
+ return std::string(buf);
+}
+
+TEST_P(DBCompactionTestWithParam, ForceBottommostLevelCompaction) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // The key size is guaranteed to be <= 8
+ class ShortKeyComparator : public Comparator {
+ int Compare(const ROCKSDB_NAMESPACE::Slice& a,
+ const ROCKSDB_NAMESPACE::Slice& b) const override {
+ assert(a.size() <= 8);
+ assert(b.size() <= 8);
+ return BytewiseComparator()->Compare(a, b);
+ }
+ const char* Name() const override { return "ShortKeyComparator"; }
+ void FindShortestSeparator(
+ std::string* start,
+ const ROCKSDB_NAMESPACE::Slice& limit) const override {
+ return BytewiseComparator()->FindShortestSeparator(start, limit);
+ }
+ void FindShortSuccessor(std::string* key) const override {
+ return BytewiseComparator()->FindShortSuccessor(key);
+ }
+ } short_key_cmp;
+ Options options = CurrentOptions();
+ options.target_file_size_base = 100000000;
+ options.write_buffer_size = 100000000;
+ options.max_subcompactions = max_subcompactions_;
+ options.comparator = &short_key_cmp;
+ DestroyAndReopen(options);
+
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ Random rnd(301);
+ std::vector<std::string> values;
+ // File with keys [ 0 => 99 ]
+ for (int i = 0; i < 100; i++) {
+ values.push_back(RandomString(&rnd, value_size));
+ ASSERT_OK(Put(ShortKey(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("1", FilesPerLevel(0));
+ // Compaction will do L0=>L1 (trivial move) then move L1 files to L3
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 3;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ // File with keys [ 100 => 199 ]
+ for (int i = 100; i < 200; i++) {
+ values.push_back(RandomString(&rnd, value_size));
+ ASSERT_OK(Put(ShortKey(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("1,0,0,1", FilesPerLevel(0));
+ // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves),
+ // then compact the bottommost level L3=>L3 (non-trivial move).
+ compact_options = CompactRangeOptions();
+ compact_options.bottommost_level_compaction =
+ BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+ ASSERT_EQ(trivial_move, 4);
+ ASSERT_EQ(non_trivial_move, 1);
+
+ // File with keys [ 200 => 299 ]
+ for (int i = 200; i < 300; i++) {
+ values.push_back(RandomString(&rnd, value_size));
+ ASSERT_OK(Put(ShortKey(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("1,0,0,1", FilesPerLevel(0));
+ trivial_move = 0;
+ non_trivial_move = 0;
+ compact_options = CompactRangeOptions();
+ compact_options.bottommost_level_compaction =
+ BottommostLevelCompaction::kSkip;
+ // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves)
+ // and will skip bottommost level compaction
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,2", FilesPerLevel(0));
+ ASSERT_EQ(trivial_move, 3);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ for (int i = 0; i < 300; i++) {
+ ASSERT_EQ(Get(ShortKey(i)), values[i]);
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, IntraL0Compaction) {
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 5;
+ options.max_background_compactions = 2;
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ const size_t kValueSize = 1 << 20;
+ Random rnd(301);
+ std::string value(RandomString(&rnd, kValueSize));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"LevelCompactionPicker::PickCompactionBySize:0",
+ "CompactionJob::Run():Start"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // index: 0 1 2 3 4 5 6 7 8 9
+ // size: 1MB 1MB 1MB 1MB 1MB 2MB 1MB 1MB 1MB 1MB
+ // score: 1.5 1.3 1.5 2.0 inf
+ //
+ // Files 0-4 will be included in an L0->L1 compaction.
+ //
+ // L0->L0 will be triggered since the sync points guarantee compaction to base
+ // level is still blocked when files 5-9 trigger another compaction.
+ //
+ // Files 6-9 are the longest span of available files for which
+ // work-per-deleted-file decreases (see "score" row above).
+ for (int i = 0; i < 10; ++i) {
+ ASSERT_OK(Put(Key(0), "")); // prevents trivial move
+ if (i == 5) {
+ ASSERT_OK(Put(Key(i + 1), value + value));
+ } else {
+ ASSERT_OK(Put(Key(i + 1), value));
+ }
+ ASSERT_OK(Flush());
+ }
+ dbfull()->TEST_WaitForCompact();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ std::vector<std::vector<FileMetaData>> level_to_files;
+ dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+ &level_to_files);
+ ASSERT_GE(level_to_files.size(), 2); // at least L0 and L1
+ // L0 has the 2MB file (not compacted) and 4MB file (output of L0->L0)
+ ASSERT_EQ(2, level_to_files[0].size());
+ ASSERT_GT(level_to_files[1].size(), 0);
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_GE(level_to_files[0][i].fd.file_size, 1 << 21);
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, IntraL0CompactionDoesNotObsoleteDeletions) {
+ // regression test for issue #2722: L0->L0 compaction can resurrect deleted
+ // keys from older L0 files if L1+ files' key-ranges do not include the key.
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 5;
+ options.max_background_compactions = 2;
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ const size_t kValueSize = 1 << 20;
+ Random rnd(301);
+ std::string value(RandomString(&rnd, kValueSize));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"LevelCompactionPicker::PickCompactionBySize:0",
+ "CompactionJob::Run():Start"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // index: 0 1 2 3 4 5 6 7 8 9
+ // size: 1MB 1MB 1MB 1MB 1MB 1MB 1MB 1MB 1MB 1MB
+ // score: 1.25 1.33 1.5 2.0 inf
+ //
+ // Files 0-4 will be included in an L0->L1 compaction.
+ //
+ // L0->L0 will be triggered since the sync points guarantee compaction to base
+ // level is still blocked when files 5-9 trigger another compaction. All files
+ // 5-9 are included in the L0->L0 due to work-per-deleted file decreasing.
+ //
+ // Put a key-value in files 0-4. Delete that key in files 5-9. Verify the
+ // L0->L0 preserves the deletion such that the key remains deleted.
+ for (int i = 0; i < 10; ++i) {
+ // key 0 serves both to prevent trivial move and as the key we want to
+ // verify is not resurrected by L0->L0 compaction.
+ if (i < 5) {
+ ASSERT_OK(Put(Key(0), ""));
+ } else {
+ ASSERT_OK(Delete(Key(0)));
+ }
+ ASSERT_OK(Put(Key(i + 1), value));
+ ASSERT_OK(Flush());
+ }
+ dbfull()->TEST_WaitForCompact();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ std::vector<std::vector<FileMetaData>> level_to_files;
+ dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+ &level_to_files);
+ ASSERT_GE(level_to_files.size(), 2); // at least L0 and L1
+ // L0 has a single output file from L0->L0
+ ASSERT_EQ(1, level_to_files[0].size());
+ ASSERT_GT(level_to_files[1].size(), 0);
+ ASSERT_GE(level_to_files[0][0].fd.file_size, 1 << 22);
+
+ ReadOptions roptions;
+ std::string result;
+ ASSERT_TRUE(db_->Get(roptions, Key(0), &result).IsNotFound());
+}
+
+TEST_P(DBCompactionTestWithParam, FullCompactionInBottomPriThreadPool) {
+ const int kNumFilesTrigger = 3;
+ Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM);
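+ // Give the bottom-priority pool one thread so that bottommost/full
+ // compactions can run there and be counted via BGWorkBottomCompaction.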
+ for (bool use_universal_compaction : {false, true}) {
+ Options options = CurrentOptions();
+ if (use_universal_compaction) {
+ options.compaction_style = kCompactionStyleUniversal;
+ } else {
+ options.compaction_style = kCompactionStyleLevel;
+ options.level_compaction_dynamic_level_bytes = true;
+ }
+ options.num_levels = 4;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = kNumFilesTrigger;
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ DestroyAndReopen(options);
+
+ int num_bottom_pri_compactions = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BGWorkBottomCompaction",
+ [&](void* /*arg*/) { ++num_bottom_pri_compactions; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int num = 0; num < kNumFilesTrigger; num++) {
+ ASSERT_EQ(NumSortedRuns(), num);
+ int key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ(1, num_bottom_pri_compactions);
+
+ // Verify that size amplification did occur
+ ASSERT_EQ(NumSortedRuns(), 1);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+ Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM);
+}
+
+TEST_F(DBCompactionTest, OptimizedDeletionObsoleting) {
+ // Deletions can be dropped when compacted to non-last level if they fall
+ // outside the lower-level files' key-ranges.
+ const int kNumL0Files = 4;
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ // Put keys 1 and 3 in separate L1 and L2 files, so keys 0, 2, and 4+
+ // fall outside these levels' key-ranges.
+ for (int level = 2; level >= 1; --level) {
+ for (int i = 0; i < 2; ++i) {
+ Put(Key(2 * i + 1), "val");
+ Flush();
+ }
+ MoveFilesToLevel(level);
+ ASSERT_EQ(2, NumTableFilesAtLevel(level));
+ }
+
+ // Delete keys in range [1, 4]. These L0 files will be compacted with L1:
+ // - Tombstones for keys 2 and 4 can be dropped early.
+ // - Tombstones for keys 1 and 3 must be kept due to L2 files' key-ranges.
+ for (int i = 0; i < kNumL0Files; ++i) {
+ Put(Key(0), "val"); // sentinel to prevent trivial move
+ Delete(Key(i + 1));
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ for (int i = 0; i < kNumL0Files; ++i) {
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), Key(i + 1), &value).IsNotFound());
+ }
+ ASSERT_EQ(2, options.statistics->getTickerCount(
+ COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE));
+ ASSERT_EQ(2,
+ options.statistics->getTickerCount(COMPACTION_KEY_DROP_OBSOLETE));
+}
+
+TEST_F(DBCompactionTest, CompactFilesPendingL0Bug) {
+ // https://www.facebook.com/groups/rocksdb.dev/permalink/1389452781153232/
+ // CompactFiles() had a bug where it failed to pick a compaction when an L0
+ // compaction existed, but marked it as scheduled anyway. It'd never be
+ // unmarked as scheduled, so future compactions or DB close could hang.
+ const int kNumL0Files = 5;
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files - 1;
+ options.max_background_compactions = 2;
+ DestroyAndReopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"LevelCompactionPicker::PickCompaction:Return",
+ "DBCompactionTest::CompactFilesPendingL0Bug:Picked"},
+ {"DBCompactionTest::CompactFilesPendingL0Bug:ManualCompacted",
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
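+ // Holding a compaction pressure token raises the number of compactions
+ // allowed to run concurrently, presumably so the CompactFiles() call below
+ // can be scheduled while the automatic L0 compaction is still in progress.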
+ auto schedule_multi_compaction_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ // Files 0-3 will be included in an L0->L1 compaction.
+ //
+ // File 4 will be included in a call to CompactFiles() while the first
+ // compaction is running.
+ for (int i = 0; i < kNumL0Files - 1; ++i) {
+ ASSERT_OK(Put(Key(0), "val")); // sentinel to prevent trivial move
+ ASSERT_OK(Put(Key(i + 1), "val"));
+ ASSERT_OK(Flush());
+ }
+ TEST_SYNC_POINT("DBCompactionTest::CompactFilesPendingL0Bug:Picked");
+ // file 4 flushed after 0-3 picked
+ ASSERT_OK(Put(Key(kNumL0Files), "val"));
+ ASSERT_OK(Flush());
+
+ // Previously, DB close would hang forever, as this situation caused the
+ // scheduled compaction count to never decrement to zero.
+ ColumnFamilyMetaData cf_meta;
+ dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta);
+ ASSERT_EQ(kNumL0Files, cf_meta.levels[0].files.size());
+ std::vector<std::string> input_filenames;
+ input_filenames.push_back(cf_meta.levels[0].files.front().name);
+ ASSERT_OK(dbfull()
+ ->CompactFiles(CompactionOptions(), input_filenames,
+ 0 /* output_level */));
+ TEST_SYNC_POINT("DBCompactionTest::CompactFilesPendingL0Bug:ManualCompacted");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, CompactFilesOverlapInL0Bug) {
+ // Regression test for a bug where L0 files overlapping the user-specified
+ // input files in time- and key-ranges were not pulled in.
+ Put(Key(0), "old_val");
+ Flush();
+ Put(Key(0), "new_val");
+ Flush();
+
+ ColumnFamilyMetaData cf_meta;
+ dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta);
+ ASSERT_GE(cf_meta.levels.size(), 2);
+ ASSERT_EQ(2, cf_meta.levels[0].files.size());
+
+ // Compacting {new L0 file, L1 file} should pull in the old L0 file since it
+ // overlaps in key-range and time-range.
+ std::vector<std::string> input_filenames;
+ input_filenames.push_back(cf_meta.levels[0].files.front().name);
+ ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), input_filenames,
+ 1 /* output_level */));
+ ASSERT_EQ("new_val", Get(Key(0)));
+}
+
+TEST_F(DBCompactionTest, CompactBottomLevelFilesWithDeletions) {
+ // bottom-level files may contain deletions due to snapshots protecting the
+ // deleted keys. Once the snapshot is released, we should see files with many
+ // such deletions undergo single-file compactions.
+ const int kNumKeysPerFile = 1024;
+ const int kNumLevelFiles = 4;
+ const int kValueSize = 128;
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = kNumLevelFiles;
+ // inflate it a bit to account for key/metadata overhead
+ options.target_file_size_base = 120 * kNumKeysPerFile * kValueSize / 100;
+ CreateAndReopenWithCF({"one"}, options);
+
+ Random rnd(301);
+ const Snapshot* snapshot = nullptr;
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize)));
+ }
+ if (i == kNumLevelFiles - 1) {
+ snapshot = db_->GetSnapshot();
+ // delete every other key after grabbing a snapshot, so these deletions
+ // and the keys they cover can't be dropped until after the snapshot is
+ // released.
+ for (int j = 0; j < kNumLevelFiles * kNumKeysPerFile; j += 2) {
+ ASSERT_OK(Delete(Key(j)));
+ }
+ }
+ Flush();
+ if (i < kNumLevelFiles - 1) {
+ ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+ }
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(kNumLevelFiles, NumTableFilesAtLevel(1));
+
+ std::vector<LiveFileMetaData> pre_release_metadata, post_release_metadata;
+ db_->GetLiveFilesMetaData(&pre_release_metadata);
+ // just need to bump seqnum so ReleaseSnapshot knows the newest key in the SST
+ // files does not need to be preserved in case of a future snapshot.
+ ASSERT_OK(Put(Key(0), "val"));
+ ASSERT_NE(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_);
+ // release snapshot and wait for compactions to finish. Single-file
+ // compactions should be triggered, which reduce the size of each bottom-level
+ // file without changing file count.
+ db_->ReleaseSnapshot(snapshot);
+ ASSERT_EQ(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ ASSERT_TRUE(compaction->compaction_reason() ==
+ CompactionReason::kBottommostFiles);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ dbfull()->TEST_WaitForCompact();
+ db_->GetLiveFilesMetaData(&post_release_metadata);
+ ASSERT_EQ(pre_release_metadata.size(), post_release_metadata.size());
+
+ for (size_t i = 0; i < pre_release_metadata.size(); ++i) {
+ const auto& pre_file = pre_release_metadata[i];
+ const auto& post_file = post_release_metadata[i];
+ ASSERT_EQ(1, pre_file.level);
+ ASSERT_EQ(1, post_file.level);
+ // each file is smaller than it was before as it was rewritten without
+ // deletion markers/deleted keys.
+ ASSERT_LT(post_file.size, pre_file.size);
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) {
+ const int kNumKeysPerFile = 32;
+ const int kNumLevelFiles = 2;
+ const int kValueSize = 1024;
+
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.ttl = 24 * 60 * 60; // 24 hours
+ options.max_open_files = -1;
+ env_->time_elapse_only_sleep_ = false;
+ options.env = env_;
+
+ env_->addon_time_.store(0);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize)));
+ }
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+ MoveFilesToLevel(3);
+ ASSERT_EQ("0,0,0,2", FilesPerLevel());
+
+ // Delete previously written keys.
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j)));
+ }
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("2,0,0,2", FilesPerLevel());
+ MoveFilesToLevel(1);
+ ASSERT_EQ("0,2,0,2", FilesPerLevel());
+
+ env_->addon_time_.fetch_add(36 * 60 * 60); // 36 hours
+ ASSERT_EQ("0,2,0,2", FilesPerLevel());
+
+ // Just do a simple write + flush so that the TTL-expired files get
+ // compacted.
+ ASSERT_OK(Put("a", "1"));
+ Flush();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kTtl);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ dbfull()->TEST_WaitForCompact();
+ // All non-L0 files are deleted, as they contained only deleted data.
+ ASSERT_EQ("1", FilesPerLevel());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ // Test dynamically changing ttl.
+
+ env_->addon_time_.store(0);
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize)));
+ }
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+ MoveFilesToLevel(3);
+ ASSERT_EQ("0,0,0,2", FilesPerLevel());
+
+ // Delete previously written keys.
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j)));
+ }
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("2,0,0,2", FilesPerLevel());
+ MoveFilesToLevel(1);
+ ASSERT_EQ("0,2,0,2", FilesPerLevel());
+
+ // Move time forward by 12 hours, and make sure that compaction still doesn't
+ // trigger as ttl is set to 24 hours.
+ env_->addon_time_.fetch_add(12 * 60 * 60);
+ ASSERT_OK(Put("a", "1"));
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("1,2,0,2", FilesPerLevel());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kTtl);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Dynamically change ttl to 10 hours.
+ // This should trigger a ttl compaction, as 12 hours have already passed.
+ ASSERT_OK(dbfull()->SetOptions({{"ttl", "36000"}}));
+ dbfull()->TEST_WaitForCompact();
+ // All non-L0 files are deleted, as they contained only deleted data.
+ ASSERT_EQ("1", FilesPerLevel());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) {
+ const int kValueSize = 100;
+
+ for (bool if_restart : {false, true}) {
+ for (bool if_open_all_files : {false, true}) {
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.ttl = 24 * 60 * 60; // 24 hours
+ if (if_open_all_files) {
+ options.max_open_files = -1;
+ } else {
+ options.max_open_files = 20;
+ }
+ // RocksDB sanitizes max_open_files to at least 20. Modify it back.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = static_cast<int*>(arg);
+ *max_open_files = 2;
+ });
+ // In the case where all files are opened and a DB restart is done,
+ // force the oldest ancestor time in the manifest file to 0 to
+ // simulate reading from an old version.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionEdit::EncodeTo:VarintOldestAncesterTime", [&](void* arg) {
+ if (if_restart && if_open_all_files) {
+ std::string* encoded_fieled = static_cast<std::string*>(arg);
+ *encoded_fieled = "";
+ PutVarint64(encoded_fieled, 0);
+ }
+ });
+
+ env_->time_elapse_only_sleep_ = false;
+ options.env = env_;
+
+ env_->addon_time_.store(0);
+ DestroyAndReopen(options);
+
+ int ttl_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ auto compaction_reason = compaction->compaction_reason();
+ if (compaction_reason == CompactionReason::kTtl) {
+ ttl_compactions++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Add two L6 files with key ranges: [1 .. 100], [101 .. 200].
+ Random rnd(301);
+ for (int i = 1; i <= 100; ++i) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize)));
+ }
+ Flush();
+ // Get the first file's creation time. This will be the oldest file in the
+ // DB. Compactions involving this file's descendants should keep getting
+ // this time.
+ std::vector<std::vector<FileMetaData>> level_to_files;
+ dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+ &level_to_files);
+ uint64_t oldest_time = level_to_files[0][0].oldest_ancester_time;
+ // Add 1 hour and do another flush.
+ env_->addon_time_.fetch_add(1 * 60 * 60);
+ for (int i = 101; i <= 200; ++i) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize)));
+ }
+ Flush();
+ MoveFilesToLevel(6);
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+
+ env_->addon_time_.fetch_add(1 * 60 * 60);
+ // Add two L4 files with key ranges: [1 .. 50], [51 .. 150].
+ for (int i = 1; i <= 50; ++i) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize)));
+ }
+ Flush();
+ env_->addon_time_.fetch_add(1 * 60 * 60);
+ for (int i = 51; i <= 150; ++i) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize)));
+ }
+ Flush();
+ MoveFilesToLevel(4);
+ ASSERT_EQ("0,0,0,0,2,0,2", FilesPerLevel());
+
+ env_->addon_time_.fetch_add(1 * 60 * 60);
+ // Add one L1 file with key range: [26, 75].
+ for (int i = 26; i <= 75; ++i) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize)));
+ }
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ MoveFilesToLevel(1);
+ ASSERT_EQ("0,1,0,0,2,0,2", FilesPerLevel());
+
+ // LSM tree:
+ // L1: [26 .. 75]
+ // L4: [1 .. 50][51 ..... 150]
+ // L6: [1 ........ 100][101 .... 200]
+ //
+ // On TTL expiry, a TTL compaction should be initiated on the L1 file, and
+ // compactions should keep going until the key range hits the bottom level.
+ // In other words, the compaction on this data range "cascades" until
+ // reaching the bottom level.
+ //
+ // Order of events on TTL expiry:
+ // 1. The L1 file falls to L3 via 2 trivial moves which are initiated by
+ //    the ttl compaction.
+ // 2. A TTL compaction happens between L3 and L4 files. Output file in L4.
+ // 3. The new output file from L4 falls to L5 via 1 trivial move initiated
+ //    by the ttl compaction.
+ // 4. A TTL compaction happens between L5 and L6 files. Output in L6.
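+ // In total that is 2 + 1 + 1 + 1 = 5 compaction picks with reason kTtl,
+ // which is what the ttl_compactions assertion below expects.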
+
+ // Add 25 hours and do a write
+ env_->addon_time_.fetch_add(25 * 60 * 60);
+
+ ASSERT_OK(Put(Key(1), "1"));
+ if (if_restart) {
+ Reopen(options);
+ } else {
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(5, ttl_compactions);
+
+ dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+ &level_to_files);
+ ASSERT_EQ(oldest_time, level_to_files[6][0].oldest_ancester_time);
+
+ env_->addon_time_.fetch_add(25 * 60 * 60);
+ ASSERT_OK(Put(Key(2), "1"));
+ if (if_restart) {
+ Reopen(options);
+ } else {
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_GE(ttl_compactions, 6);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+ }
+}
+
+TEST_F(DBCompactionTest, LevelPeriodicCompaction) {
+ const int kNumKeysPerFile = 32;
+ const int kNumLevelFiles = 2;
+ const int kValueSize = 100;
+
+ for (bool if_restart : {false, true}) {
+ for (bool if_open_all_files : {false, true}) {
+ Options options = CurrentOptions();
+ options.periodic_compaction_seconds = 48 * 60 * 60; // 2 days
+ if (if_open_all_files) {
+ options.max_open_files = -1; // needed for ttl compaction
+ } else {
+ options.max_open_files = 20;
+ }
+ // RocksDB sanitizes max_open_files to at least 20. Modify it back.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = static_cast<int*>(arg);
+ *max_open_files = 0;
+ });
+ // In the case where all files are opened and a DB restart is done,
+ // force the file creation time in the manifest file to 0 to
+ // simulate reading from an old version.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionEdit::EncodeTo:VarintFileCreationTime", [&](void* arg) {
+ if (if_restart && if_open_all_files) {
+ std::string* encoded_fieled = static_cast<std::string*>(arg);
+ *encoded_fieled = "";
+ PutVarint64(encoded_fieled, 0);
+ }
+ });
+
+ env_->time_elapse_only_sleep_ = false;
+ options.env = env_;
+
+ env_->addon_time_.store(0);
+ DestroyAndReopen(options);
+
+ int periodic_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ auto compaction_reason = compaction->compaction_reason();
+ if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+ periodic_compactions++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(Put(Key(i * kNumKeysPerFile + j),
+ RandomString(&rnd, kValueSize)));
+ }
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ("2", FilesPerLevel());
+ ASSERT_EQ(0, periodic_compactions);
+
+ // Add 50 hours and do a write
+ env_->addon_time_.fetch_add(50 * 60 * 60);
+ ASSERT_OK(Put("a", "1"));
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ // Assert that the files stay in the same level
+ ASSERT_EQ("3", FilesPerLevel());
+ // The two old files go through the periodic compaction process
+ ASSERT_EQ(2, periodic_compactions);
+
+ MoveFilesToLevel(1);
+ ASSERT_EQ("0,3", FilesPerLevel());
+
+ // Add another 50 hours and do another write
+ env_->addon_time_.fetch_add(50 * 60 * 60);
+ ASSERT_OK(Put("b", "2"));
+ if (if_restart) {
+ Reopen(options);
+ } else {
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("1,3", FilesPerLevel());
+ // The three old files now go through the periodic compaction process,
+ // bringing the total to 2 + 3 = 5.
+ ASSERT_EQ(5, periodic_compactions);
+
+ // Add another 50 hours and do another write
+ env_->addon_time_.fetch_add(50 * 60 * 60);
+ ASSERT_OK(Put("c", "3"));
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("2,3", FilesPerLevel());
+ // The four old files now go through the periodic compaction process,
+ // bringing the total to 5 + 4 = 9.
+ ASSERT_EQ(9, periodic_compactions);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+ }
+}
+
+TEST_F(DBCompactionTest, LevelPeriodicCompactionWithOldDB) {
+ // This test makes sure that periodic compactions work with a DB where the
+ // file_creation_time of some files is 0. After compactions, the new files
+ // are created with a valid file_creation_time.
+
+ const int kNumKeysPerFile = 32;
+ const int kNumFiles = 4;
+ const int kValueSize = 100;
+
+ Options options = CurrentOptions();
+ env_->time_elapse_only_sleep_ = false;
+ options.env = env_;
+
+ env_->addon_time_.store(0);
+ DestroyAndReopen(options);
+
+ int periodic_compactions = 0;
+ bool set_file_creation_time_to_zero = true;
+ bool set_creation_time_to_zero = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ auto compaction_reason = compaction->compaction_reason();
+ if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+ periodic_compactions++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "PropertyBlockBuilder::AddTableProperty:Start", [&](void* arg) {
+ TableProperties* props = reinterpret_cast<TableProperties*>(arg);
+ if (set_file_creation_time_to_zero) {
+ props->file_creation_time = 0;
+ }
+ if (set_creation_time_to_zero) {
+ props->creation_time = 0;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int i = 0; i < kNumFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize)));
+ }
+ Flush();
+ // Move the first two files to L2.
+ if (i == 1) {
+ MoveFilesToLevel(2);
+ set_creation_time_to_zero = false;
+ }
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ("2,0,2", FilesPerLevel());
+ ASSERT_EQ(0, periodic_compactions);
+
+ Close();
+
+ set_file_creation_time_to_zero = false;
+ // Forward the clock by 2 days.
+ env_->addon_time_.fetch_add(2 * 24 * 60 * 60);
+ options.periodic_compaction_seconds = 1 * 24 * 60 * 60; // 1 day
+
+ Reopen(options);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("2,0,2", FilesPerLevel());
+ // Make sure that all files go through periodic compaction.
+ ASSERT_EQ(kNumFiles, periodic_compactions);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, LevelPeriodicAndTtlCompaction) {
+ const int kNumKeysPerFile = 32;
+ const int kNumLevelFiles = 2;
+ const int kValueSize = 100;
+
+ Options options = CurrentOptions();
+ options.ttl = 10 * 60 * 60; // 10 hours
+ options.periodic_compaction_seconds = 48 * 60 * 60; // 2 days
+ options.max_open_files = -1; // needed for both periodic and ttl compactions
+ env_->time_elapse_only_sleep_ = false;
+ options.env = env_;
+
+ env_->addon_time_.store(0);
+ DestroyAndReopen(options);
+
+ int periodic_compactions = 0;
+ int ttl_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ auto compaction_reason = compaction->compaction_reason();
+ if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+ periodic_compactions++;
+ } else if (compaction_reason == CompactionReason::kTtl) {
+ ttl_compactions++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize)));
+ }
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ MoveFilesToLevel(3);
+
+ ASSERT_EQ("0,0,0,2", FilesPerLevel());
+ ASSERT_EQ(0, periodic_compactions);
+ ASSERT_EQ(0, ttl_compactions);
+
+ // Add some time greater than periodic_compaction_time.
+ env_->addon_time_.fetch_add(50 * 60 * 60);
+ ASSERT_OK(Put("a", "1"));
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ // Files in the bottom level go through periodic compactions.
+ ASSERT_EQ("1,0,0,2", FilesPerLevel());
+ ASSERT_EQ(2, periodic_compactions);
+ ASSERT_EQ(0, ttl_compactions);
+
+ // Add a little more time than ttl
+ env_->addon_time_.fetch_add(11 * 60 * 60);
+ ASSERT_OK(Put("b", "1"));
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ // Notice that the previous file in level 1 falls down to the bottom level
+ // due to ttl compactions, one level at a time.
+ // And bottom level files don't get picked up for ttl compactions.
+ ASSERT_EQ("1,0,0,3", FilesPerLevel());
+ ASSERT_EQ(2, periodic_compactions);
+ ASSERT_EQ(3, ttl_compactions);
+
+ // Add some time greater than periodic_compaction_time.
+ env_->addon_time_.fetch_add(50 * 60 * 60);
+ ASSERT_OK(Put("c", "1"));
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ // Previous L0 file falls one level at a time to bottom level due to ttl.
+ // And all 4 bottom files go through periodic compactions.
+ ASSERT_EQ("1,0,0,4", FilesPerLevel());
+ ASSERT_EQ(6, periodic_compactions);
+ ASSERT_EQ(6, ttl_compactions);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, LevelPeriodicCompactionWithCompactionFilters) {
+ class TestCompactionFilter : public CompactionFilter {
+ const char* Name() const override { return "TestCompactionFilter"; }
+ };
+ class TestCompactionFilterFactory : public CompactionFilterFactory {
+ const char* Name() const override { return "TestCompactionFilterFactory"; }
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& /*context*/) override {
+ return std::unique_ptr<CompactionFilter>(new TestCompactionFilter());
+ }
+ };
+
+ const int kNumKeysPerFile = 32;
+ const int kNumLevelFiles = 2;
+ const int kValueSize = 100;
+
+ Random rnd(301);
+
+ Options options = CurrentOptions();
+ TestCompactionFilter test_compaction_filter;
+ env_->time_elapse_only_sleep_ = false;
+ options.env = env_;
+ env_->addon_time_.store(0);
+
+ enum CompactionFilterType {
+ kUseCompactionFilter,
+ kUseCompactionFilterFactory
+ };
+
+ for (CompactionFilterType comp_filter_type :
+ {kUseCompactionFilter, kUseCompactionFilterFactory}) {
+ // Assert that periodic compactions are not enabled.
+ ASSERT_EQ(port::kMaxUint64 - 1, options.periodic_compaction_seconds);
+
+ if (comp_filter_type == kUseCompactionFilter) {
+ options.compaction_filter = &test_compaction_filter;
+ options.compaction_filter_factory.reset();
+ } else if (comp_filter_type == kUseCompactionFilterFactory) {
+ options.compaction_filter = nullptr;
+ options.compaction_filter_factory.reset(
+ new TestCompactionFilterFactory());
+ }
+ DestroyAndReopen(options);
+
+ // periodic_compaction_seconds should be set to the sanitized value when
+ // a compaction filter or a compaction filter factory is used.
+ ASSERT_EQ(30 * 24 * 60 * 60,
+ dbfull()->GetOptions().periodic_compaction_seconds);
+
+ int periodic_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ auto compaction_reason = compaction->compaction_reason();
+ if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+ periodic_compactions++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize)));
+ }
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ("2", FilesPerLevel());
+ ASSERT_EQ(0, periodic_compactions);
+
+ // Add 31 days and do a write
+ env_->addon_time_.fetch_add(31 * 24 * 60 * 60);
+ ASSERT_OK(Put("a", "1"));
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ // Assert that the files stay in the same level
+ ASSERT_EQ("3", FilesPerLevel());
+ // The two old files go through the periodic compaction process
+ ASSERT_EQ(2, periodic_compactions);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DBCompactionTest, CompactRangeDelayedByL0FileCount) {
+ // Verify that, when `CompactRangeOptions::allow_write_stall == false`, manual
+ // compaction only triggers flush after it's sure stall won't be triggered for
+ // L0 file count going too high.
+ const int kNumL0FilesTrigger = 4;
+ const int kNumL0FilesLimit = 8;
+ // i == 0: verifies normal case where stall is avoided by delay
+ // i == 1: verifies no delay in edge case where stall trigger is same as
+ // compaction trigger, so stall can't be avoided
+ for (int i = 0; i < 2; ++i) {
+ Options options = CurrentOptions();
+ options.level0_slowdown_writes_trigger = kNumL0FilesLimit;
+ if (i == 0) {
+ options.level0_file_num_compaction_trigger = kNumL0FilesTrigger;
+ } else {
+ options.level0_file_num_compaction_trigger = kNumL0FilesLimit;
+ }
+ Reopen(options);
+
+ if (i == 0) {
+ // ensure the auto compaction doesn't finish until manual compaction has
+ // had a chance to be delayed.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
+ "CompactionJob::Run():End"}});
+ } else {
+ // ensure the auto-compaction doesn't finish until manual compaction has
+ // continued without delay.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:StallWaitDone",
+ "CompactionJob::Run():End"}});
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int j = 0; j < kNumL0FilesLimit - 1; ++j) {
+ for (int k = 0; k < 2; ++k) {
+ ASSERT_OK(Put(Key(k), RandomString(&rnd, 1024)));
+ }
+ Flush();
+ }
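+    // With kNumL0FilesLimit - 1 L0 files already present, the flush issued by
+    // CompactRange could push L0 to the slowdown trigger, so with
+    // allow_write_stall == false it may have to wait for the pending
+    // auto-compaction first.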
+ auto manual_compaction_thread = port::Thread([this]() {
+ CompactRangeOptions cro;
+ cro.allow_write_stall = false;
+ db_->CompactRange(cro, nullptr, nullptr);
+ });
+
+ manual_compaction_thread.join();
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(1), 0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DBCompactionTest, CompactRangeDelayedByImmMemTableCount) {
+ // Verify that, when `CompactRangeOptions::allow_write_stall == false`, manual
+ // compaction only triggers flush after it's sure stall won't be triggered for
+ // immutable memtable count going too high.
+ const int kNumImmMemTableLimit = 8;
+ // i == 0: verifies normal case where stall is avoided by delay
+ // i == 1: verifies no delay in edge case where stall trigger is same as flush
+ // trigger, so stall can't be avoided
+ for (int i = 0; i < 2; ++i) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+    // The delay limit is one less than the stop limit. This test focuses on
+    // avoiding the delay limit, but this option sets the stop limit, so add
+    // one.
+ options.max_write_buffer_number = kNumImmMemTableLimit + 1;
+ if (i == 1) {
+ options.min_write_buffer_number_to_merge = kNumImmMemTableLimit;
+ }
+ Reopen(options);
+
+ if (i == 0) {
+ // ensure the flush doesn't finish until manual compaction has had a
+ // chance to be delayed.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
+ "FlushJob::WriteLevel0Table"}});
+ } else {
+ // ensure the flush doesn't finish until manual compaction has continued
+ // without delay.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:StallWaitDone",
+ "FlushJob::WriteLevel0Table"}});
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int j = 0; j < kNumImmMemTableLimit - 1; ++j) {
+ ASSERT_OK(Put(Key(0), RandomString(&rnd, 1024)));
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ flush_opts.allow_write_stall = true;
+ dbfull()->Flush(flush_opts);
+ }
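+    // At this point kNumImmMemTableLimit - 1 immutable memtables are pending
+    // flush, one short of the delay trigger.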
+
+ auto manual_compaction_thread = port::Thread([this]() {
+ CompactRangeOptions cro;
+ cro.allow_write_stall = false;
+ db_->CompactRange(cro, nullptr, nullptr);
+ });
+
+ manual_compaction_thread.join();
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(1), 0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DBCompactionTest, CompactRangeShutdownWhileDelayed) {
+ // Verify that, when `CompactRangeOptions::allow_write_stall == false`, delay
+ // does not hang if CF is dropped or DB is closed
+ const int kNumL0FilesTrigger = 4;
+ const int kNumL0FilesLimit = 8;
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0FilesTrigger;
+ options.level0_slowdown_writes_trigger = kNumL0FilesLimit;
+ // i == 0: DB::DropColumnFamily() on CompactRange's target CF unblocks it
+ // i == 1: DB::CancelAllBackgroundWork() unblocks CompactRange. This is to
+ // simulate what happens during Close as we can't call Close (it
+ // blocks on the auto-compaction, making a cycle).
+ for (int i = 0; i < 2; ++i) {
+ CreateAndReopenWithCF({"one"}, options);
+ // The calls to close CF/DB wait until the manual compaction stalls.
+ // The auto-compaction waits until the manual compaction finishes to ensure
+ // the signal comes from closing CF/DB, not from compaction making progress.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
+ "DBCompactionTest::CompactRangeShutdownWhileDelayed:PreShutdown"},
+ {"DBCompactionTest::CompactRangeShutdownWhileDelayed:PostManual",
+ "CompactionJob::Run():End"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int j = 0; j < kNumL0FilesLimit - 1; ++j) {
+ for (int k = 0; k < 2; ++k) {
+ ASSERT_OK(Put(1, Key(k), RandomString(&rnd, 1024)));
+ }
+ Flush(1);
+ }
+ auto manual_compaction_thread = port::Thread([this, i]() {
+ CompactRangeOptions cro;
+ cro.allow_write_stall = false;
+      if (i == 0) {
+        ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr)
+                        .IsColumnFamilyDropped());
+      } else {
+        ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr)
+                        .IsShutdownInProgress());
+      }
+ });
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::CompactRangeShutdownWhileDelayed:PreShutdown");
+ if (i == 0) {
+ ASSERT_OK(db_->DropColumnFamily(handles_[1]));
+ } else {
+ dbfull()->CancelAllBackgroundWork(false /* wait */);
+ }
+ manual_compaction_thread.join();
+ TEST_SYNC_POINT(
+ "DBCompactionTest::CompactRangeShutdownWhileDelayed:PostManual");
+ dbfull()->TEST_WaitForCompact();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DBCompactionTest, CompactRangeSkipFlushAfterDelay) {
+ // Verify that, when `CompactRangeOptions::allow_write_stall == false`,
+ // CompactRange skips its flush if the delay is long enough that the memtables
+ // existing at the beginning of the call have already been flushed.
+ const int kNumL0FilesTrigger = 4;
+ const int kNumL0FilesLimit = 8;
+ Options options = CurrentOptions();
+ options.level0_slowdown_writes_trigger = kNumL0FilesLimit;
+ options.level0_file_num_compaction_trigger = kNumL0FilesTrigger;
+ Reopen(options);
+
+ Random rnd(301);
+ // The manual flush includes the memtable that was active when CompactRange
+ // began. So it unblocks CompactRange and precludes its flush. Throughout the
+ // test, stall conditions are upheld via high L0 file count.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
+ "DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush"},
+ {"DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush",
+ "DBImpl::FlushMemTable:StallWaitDone"},
+ {"DBImpl::FlushMemTable:StallWaitDone", "CompactionJob::Run():End"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Used for the delayable flushes.
+ FlushOptions flush_opts;
+ flush_opts.allow_write_stall = true;
+ for (int i = 0; i < kNumL0FilesLimit - 1; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024)));
+ }
+ dbfull()->Flush(flush_opts);
+ }
+ auto manual_compaction_thread = port::Thread([this]() {
+ CompactRangeOptions cro;
+ cro.allow_write_stall = false;
+ db_->CompactRange(cro, nullptr, nullptr);
+ });
+
+ TEST_SYNC_POINT("DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush");
+ Put(ToString(0), RandomString(&rnd, 1024));
+ dbfull()->Flush(flush_opts);
+ Put(ToString(0), RandomString(&rnd, 1024));
+ TEST_SYNC_POINT("DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush");
+ manual_compaction_thread.join();
+
+ // If CompactRange's flush was skipped, the final Put above will still be
+ // in the active memtable.
+ std::string num_keys_in_memtable;
+  db_->GetProperty(DB::Properties::kNumEntriesActiveMemTable,
+                   &num_keys_in_memtable);
+ ASSERT_EQ(ToString(1), num_keys_in_memtable);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, CompactRangeFlushOverlappingMemtable) {
+ // Verify memtable only gets flushed if it contains data overlapping the range
+ // provided to `CompactRange`. Tests all kinds of overlap/non-overlap.
+ const int kNumEndpointKeys = 5;
+ std::string keys[kNumEndpointKeys] = {"a", "b", "c", "d", "e"};
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ Reopen(options);
+
+ // One extra iteration for nullptr, which means left side of interval is
+ // unbounded.
+ for (int i = 0; i <= kNumEndpointKeys; ++i) {
+ Slice begin;
+ Slice* begin_ptr;
+ if (i == 0) {
+ begin_ptr = nullptr;
+ } else {
+ begin = keys[i - 1];
+ begin_ptr = &begin;
+ }
+ // Start at `i` so right endpoint comes after left endpoint. One extra
+ // iteration for nullptr, which means right side of interval is unbounded.
+ for (int j = std::max(0, i - 1); j <= kNumEndpointKeys; ++j) {
+ Slice end;
+ Slice* end_ptr;
+ if (j == kNumEndpointKeys) {
+ end_ptr = nullptr;
+ } else {
+ end = keys[j];
+ end_ptr = &end;
+ }
+ ASSERT_OK(Put("b", "val"));
+ ASSERT_OK(Put("d", "val"));
+ CompactRangeOptions compact_range_opts;
+ ASSERT_OK(db_->CompactRange(compact_range_opts, begin_ptr, end_ptr));
+
+ uint64_t get_prop_tmp, num_memtable_entries = 0;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesImmMemTables,
+ &get_prop_tmp));
+ num_memtable_entries += get_prop_tmp;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &get_prop_tmp));
+ num_memtable_entries += get_prop_tmp;
+ if (begin_ptr == nullptr || end_ptr == nullptr ||
+ (i <= 4 && j >= 1 && (begin != "c" || end != "c"))) {
+ // In this case `CompactRange`'s range overlapped in some way with the
+ // memtable's range, so flush should've happened. Then "b" and "d" won't
+ // be in the memtable.
+ ASSERT_EQ(0, num_memtable_entries);
+ } else {
+ ASSERT_EQ(2, num_memtable_entries);
+        // Flush anyway to prepare for the next iteration.
+ db_->Flush(FlushOptions());
+ }
+ }
+ }
+}
+
+TEST_F(DBCompactionTest, CompactionStatsTest) {
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 2;
+ CompactionStatsCollector* collector = new CompactionStatsCollector();
+ options.listeners.emplace_back(collector);
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < 32; i++) {
+ for (int j = 0; j < 5000; j++) {
+ Put(std::to_string(j), std::string(1, 'A'));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ dbfull()->TEST_WaitForCompact();
+ ColumnFamilyHandleImpl* cfh =
+ static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
+ ColumnFamilyData* cfd = cfh->cfd();
+
+ VerifyCompactionStats(*cfd, *collector);
+}
+
+TEST_F(DBCompactionTest, CompactFilesOutputRangeConflict) {
+ // LSM setup:
+ // L1: [ba bz]
+ // L2: [a b] [c d]
+ // L3: [a b] [c d]
+ //
+ // Thread 1: Thread 2:
+ // Begin compacting all L2->L3
+ // Compact [ba bz] L1->L3
+ // End compacting all L2->L3
+ //
+ // The compaction operation in thread 2 should be disallowed because the range
+ // overlaps with the compaction in thread 1, which also covers that range in
+ // L3.
+ Options options = CurrentOptions();
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+ Reopen(options);
+
+ for (int level = 3; level >= 2; --level) {
+ ASSERT_OK(Put("a", "val"));
+ ASSERT_OK(Put("b", "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("c", "val"));
+ ASSERT_OK(Put("d", "val"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(level);
+ }
+ ASSERT_OK(Put("ba", "val"));
+ ASSERT_OK(Put("bz", "val"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(1);
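+  // The LSM now matches the diagram above: [ba bz] in L1, and [a b] [c d] in
+  // both L2 and L3.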
+
+ SyncPoint::GetInstance()->LoadDependency({
+ {"CompactFilesImpl:0",
+ "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2Begin"},
+ {"DBCompactionTest::CompactFilesOutputRangeConflict:Thread2End",
+ "CompactFilesImpl:1"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ auto bg_thread = port::Thread([&]() {
+ // Thread 1
+ std::vector<std::string> filenames = collector->GetFlushedFiles();
+ filenames.pop_back();
+ ASSERT_OK(db_->CompactFiles(CompactionOptions(), filenames,
+ 3 /* output_level */));
+ });
+
+ // Thread 2
+ TEST_SYNC_POINT(
+ "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2Begin");
+ std::string filename = collector->GetFlushedFiles().back();
+ ASSERT_FALSE(
+ db_->CompactFiles(CompactionOptions(), {filename}, 3 /* output_level */)
+ .ok());
+ TEST_SYNC_POINT(
+ "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2End");
+
+ bg_thread.join();
+}
+
+TEST_F(DBCompactionTest, CompactionHasEmptyOutput) {
+ Options options = CurrentOptions();
+ SstStatsCollector* collector = new SstStatsCollector();
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.emplace_back(collector);
+ Reopen(options);
+
+ // Make sure the L0 files overlap to prevent trivial move.
+ ASSERT_OK(Put("a", "val"));
+ ASSERT_OK(Put("b", "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Delete("a"));
+ ASSERT_OK(Delete("b"));
+ ASSERT_OK(Flush());
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+
+ // Expect one file creation to start for each flush, and zero for compaction
+ // since no keys are written.
+ ASSERT_EQ(2, collector->num_ssts_creation_started());
+}
+
+TEST_F(DBCompactionTest, CompactionLimiter) {
+ const int kNumKeysPerFile = 10;
+ const int kMaxBackgroundThreads = 64;
+
+ struct CompactionLimiter {
+ std::string name;
+ int limit_tasks;
+ int max_tasks;
+ int tasks;
+ std::shared_ptr<ConcurrentTaskLimiter> limiter;
+ };
+
+ std::vector<CompactionLimiter> limiter_settings;
+ limiter_settings.push_back({"limiter_1", 1, 0, 0, nullptr});
+ limiter_settings.push_back({"limiter_2", 2, 0, 0, nullptr});
+ limiter_settings.push_back({"limiter_3", 3, 0, 0, nullptr});
+
+ for (auto& ls : limiter_settings) {
+ ls.limiter.reset(NewConcurrentTaskLimiter(ls.name, ls.limit_tasks));
+ }
+
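+  // A limit of -1 means unlimited; this limiter is used below to let one CF
+  // bypass compaction throttling.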
+ std::shared_ptr<ConcurrentTaskLimiter> unique_limiter(
+ NewConcurrentTaskLimiter("unique_limiter", -1));
+
+ const char* cf_names[] = {"default", "0", "1", "2", "3", "4", "5",
+ "6", "7", "8", "9", "a", "b", "c", "d", "e", "f" };
+ const unsigned int cf_count = sizeof cf_names / sizeof cf_names[0];
+
+ std::unordered_map<std::string, CompactionLimiter*> cf_to_limiter;
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 110 * 1024; // 110KB
+ options.arena_block_size = 4096;
+ options.num_levels = 3;
+ options.level0_file_num_compaction_trigger = 4;
+ options.level0_slowdown_writes_trigger = 64;
+ options.level0_stop_writes_trigger = 64;
+ options.max_background_jobs = kMaxBackgroundThreads; // Enough threads
+ options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile));
+ options.max_write_buffer_number = 10; // Enough memtables
+ DestroyAndReopen(options);
+
+ std::vector<Options> option_vector;
+ option_vector.reserve(cf_count);
+
+ for (unsigned int cf = 0; cf < cf_count; cf++) {
+ ColumnFamilyOptions cf_opt(options);
+ if (cf == 0) {
+ // "Default" CF does't use compaction limiter
+ cf_opt.compaction_thread_limiter = nullptr;
+ } else if (cf == 1) {
+ // "1" CF uses bypass compaction limiter
+ unique_limiter->SetMaxOutstandingTask(-1);
+ cf_opt.compaction_thread_limiter = unique_limiter;
+ } else {
+ // Assign limiter by mod
+ auto& ls = limiter_settings[cf % 3];
+ cf_opt.compaction_thread_limiter = ls.limiter;
+ cf_to_limiter[cf_names[cf]] = &ls;
+ }
+ option_vector.emplace_back(DBOptions(options), cf_opt);
+ }
+
+ for (unsigned int cf = 1; cf < cf_count; cf++) {
+ CreateColumnFamilies({cf_names[cf]}, option_vector[cf]);
+ }
+
+ ReopenWithColumnFamilies(std::vector<std::string>(cf_names,
+ cf_names + cf_count),
+ option_vector);
+
+ port::Mutex mutex;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:BeforeCompaction", [&](void* arg) {
+ const auto& cf_name = static_cast<ColumnFamilyData*>(arg)->GetName();
+ auto iter = cf_to_limiter.find(cf_name);
+ if (iter != cf_to_limiter.end()) {
+ MutexLock l(&mutex);
+ ASSERT_GE(iter->second->limit_tasks, ++iter->second->tasks);
+ iter->second->max_tasks =
+ std::max(iter->second->max_tasks, iter->second->limit_tasks);
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:AfterCompaction", [&](void* arg) {
+ const auto& cf_name = static_cast<ColumnFamilyData*>(arg)->GetName();
+ auto iter = cf_to_limiter.find(cf_name);
+ if (iter != cf_to_limiter.end()) {
+ MutexLock l(&mutex);
+ ASSERT_GE(--iter->second->tasks, 0);
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Split the background threads between the flush (HIGH) and compaction
+  // (LOW) pools.
+ const size_t kTotalFlushTasks = kMaxBackgroundThreads / 4;
+ const size_t kTotalCompactTasks = kMaxBackgroundThreads - kTotalFlushTasks;
+ env_->SetBackgroundThreads((int)kTotalFlushTasks, Env::HIGH);
+ env_->SetBackgroundThreads((int)kTotalCompactTasks, Env::LOW);
+
+ test::SleepingBackgroundTask sleeping_compact_tasks[kTotalCompactTasks];
+
+ // Block all compaction threads in thread pool.
+ for (size_t i = 0; i < kTotalCompactTasks; i++) {
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_compact_tasks[i], Env::LOW);
+ sleeping_compact_tasks[i].WaitUntilSleeping();
+ }
+
+ int keyIndex = 0;
+
+ for (int n = 0; n < options.level0_file_num_compaction_trigger; n++) {
+ for (unsigned int cf = 0; cf < cf_count; cf++) {
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(cf, Key(keyIndex++), ""));
+ }
+      // Put an extra key to trigger flush.
+ ASSERT_OK(Put(cf, "", ""));
+ }
+
+ for (unsigned int cf = 0; cf < cf_count; cf++) {
+ dbfull()->TEST_WaitForFlushMemTable(handles_[cf]);
+ }
+ }
+
+ // Enough L0 files to trigger compaction
+ for (unsigned int cf = 0; cf < cf_count; cf++) {
+ ASSERT_EQ(NumTableFilesAtLevel(0, cf),
+ options.level0_file_num_compaction_trigger);
+ }
+
+  // Create more files for one column family, which triggers the speed-up
+  // condition; all compactions will then be scheduled.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(0, Key(i), ""));
+ }
+    // Put an extra key to trigger flush.
+ ASSERT_OK(Put(0, "", ""));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[0]);
+ ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1,
+ NumTableFilesAtLevel(0, 0));
+ }
+
+ // All CFs are pending compaction
+ ASSERT_EQ(cf_count, env_->GetThreadPoolQueueLen(Env::LOW));
+
+ // Unblock all compaction threads
+ for (size_t i = 0; i < kTotalCompactTasks; i++) {
+ sleeping_compact_tasks[i].WakeUp();
+ sleeping_compact_tasks[i].WaitUntilDone();
+ }
+
+ for (unsigned int cf = 0; cf < cf_count; cf++) {
+ dbfull()->TEST_WaitForFlushMemTable(handles_[cf]);
+ }
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Max outstanding compaction tasks reached the limit.
+ for (auto& ls : limiter_settings) {
+ ASSERT_EQ(ls.limit_tasks, ls.max_tasks);
+ ASSERT_EQ(0, ls.limiter->GetOutstandingTask());
+ }
+
+ // test manual compaction under a fully throttled limiter
+ int cf_test = 1;
+ unique_limiter->SetMaxOutstandingTask(0);
+
+ // flush one more file to cf 1
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(cf_test, Key(keyIndex++), ""));
+ }
+  // Put an extra key to trigger flush.
+ ASSERT_OK(Put(cf_test, "", ""));
+
+ dbfull()->TEST_WaitForFlushMemTable(handles_[cf_test]);
+ ASSERT_EQ(1, NumTableFilesAtLevel(0, cf_test));
+
+ Compact(cf_test, Key(0), Key(keyIndex));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+}
+
+INSTANTIATE_TEST_CASE_P(DBCompactionTestWithParam, DBCompactionTestWithParam,
+ ::testing::Values(std::make_tuple(1, true),
+ std::make_tuple(1, false),
+ std::make_tuple(4, true),
+ std::make_tuple(4, false)));
+
+TEST_P(DBCompactionDirectIOTest, DirectIO) {
+ Options options = CurrentOptions();
+ Destroy(options);
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.use_direct_io_for_flush_and_compaction = GetParam();
+ options.env = new MockEnv(Env::Default());
+ Reopen(options);
+ bool readahead = false;
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::OpenCompactionOutputFile", [&](void* arg) {
+ bool* use_direct_writes = static_cast<bool*>(arg);
+ ASSERT_EQ(*use_direct_writes,
+ options.use_direct_io_for_flush_and_compaction);
+ });
+ if (options.use_direct_io_for_flush_and_compaction) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions:direct_io", [&](void* /*arg*/) {
+ readahead = true;
+ });
+ }
+ SyncPoint::GetInstance()->EnableProcessing();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ MakeTables(3, "p", "q", 1);
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+ Compact(1, "p1", "p9");
+ ASSERT_EQ(readahead, options.use_direct_reads);
+ ASSERT_EQ("0,0,1", FilesPerLevel(1));
+ Destroy(options);
+ delete options.env;
+}
+
+INSTANTIATE_TEST_CASE_P(DBCompactionDirectIOTest, DBCompactionDirectIOTest,
+ testing::Bool());
+
+class CompactionPriTest : public DBTestBase,
+ public testing::WithParamInterface<uint32_t> {
+ public:
+ CompactionPriTest() : DBTestBase("/compaction_pri_test") {
+ compaction_pri_ = GetParam();
+ }
+
+ // Required if inheriting from testing::WithParamInterface<>
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ uint32_t compaction_pri_;
+};
+
+TEST_P(CompactionPriTest, Test) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 16 * 1024;
+ options.compaction_pri = static_cast<CompactionPri>(compaction_pri_);
+ options.hard_pending_compaction_bytes_limit = 256 * 1024;
+ options.max_bytes_for_level_base = 64 * 1024;
+ options.max_bytes_for_level_multiplier = 4;
+ options.compression = kNoCompression;
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ const int kNKeys = 5000;
+ int keys[kNKeys];
+ for (int i = 0; i < kNKeys; i++) {
+ keys[i] = i;
+ }
+ std::random_shuffle(std::begin(keys), std::end(keys));
+
+ for (int i = 0; i < kNKeys; i++) {
+ ASSERT_OK(Put(Key(keys[i]), RandomString(&rnd, 102)));
+ }
+
+ dbfull()->TEST_WaitForCompact();
+ for (int i = 0; i < kNKeys; i++) {
+ ASSERT_NE("NOT_FOUND", Get(Key(i)));
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(
+ CompactionPriTest, CompactionPriTest,
+ ::testing::Values(CompactionPri::kByCompensatedSize,
+ CompactionPri::kOldestLargestSeqFirst,
+ CompactionPri::kOldestSmallestSeqFirst,
+ CompactionPri::kMinOverlappingRatio));
+
+class NoopMergeOperator : public MergeOperator {
+ public:
+ NoopMergeOperator() {}
+
+ bool FullMergeV2(const MergeOperationInput& /*merge_in*/,
+ MergeOperationOutput* merge_out) const override {
+ std::string val("bar");
+ merge_out->new_value = val;
+ return true;
+ }
+
+ const char* Name() const override { return "Noop"; }
+};
+
+TEST_F(DBCompactionTest, PartialManualCompaction) {
+ Options opts = CurrentOptions();
+ opts.num_levels = 3;
+ opts.level0_file_num_compaction_trigger = 10;
+ opts.compression = kNoCompression;
+ opts.merge_operator.reset(new NoopMergeOperator());
+ opts.target_file_size_base = 10240;
+ DestroyAndReopen(opts);
+
+ Random rnd(301);
+ for (auto i = 0; i < 8; ++i) {
+ for (auto j = 0; j < 10; ++j) {
+ Merge("foo", RandomString(&rnd, 1024));
+ }
+ Flush();
+ }
+
+ MoveFilesToLevel(2);
+
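+  // Cap max_compaction_bytes at half of the live SST size so the bottommost
+  // manual compaction cannot cover all files in a single job and must run as
+  // multiple partial compactions.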
+ std::string prop;
+ EXPECT_TRUE(dbfull()->GetProperty(DB::Properties::kLiveSstFilesSize, &prop));
+ uint64_t max_compaction_bytes = atoi(prop.c_str()) / 2;
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"max_compaction_bytes", std::to_string(max_compaction_bytes)}}));
+
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+ dbfull()->CompactRange(cro, nullptr, nullptr);
+}
+
+TEST_F(DBCompactionTest, ManualCompactionFailsInReadOnlyMode) {
+ // Regression test for bug where manual compaction hangs forever when the DB
+ // is in read-only mode. Verify it now at least returns, despite failing.
+ const int kNumL0Files = 4;
+ std::unique_ptr<FaultInjectionTestEnv> mock_env(
+ new FaultInjectionTestEnv(Env::Default()));
+ Options opts = CurrentOptions();
+ opts.disable_auto_compactions = true;
+ opts.env = mock_env.get();
+ DestroyAndReopen(opts);
+
+ Random rnd(301);
+ for (int i = 0; i < kNumL0Files; ++i) {
+ // Make sure files are overlapping in key-range to prevent trivial move.
+ Put("key1", RandomString(&rnd, 1024));
+ Put("key2", RandomString(&rnd, 1024));
+ Flush();
+ }
+ ASSERT_EQ(kNumL0Files, NumTableFilesAtLevel(0));
+
+ // Enter read-only mode by failing a write.
+ mock_env->SetFilesystemActive(false);
+ // Make sure this is outside `CompactRange`'s range so that it doesn't fail
+ // early trying to flush memtable.
+ ASSERT_NOK(Put("key3", RandomString(&rnd, 1024)));
+
+ // In the bug scenario, the first manual compaction would fail and forget to
+ // unregister itself, causing the second one to hang forever due to conflict
+ // with a non-running compaction.
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = false;
+ Slice begin_key("key1");
+ Slice end_key("key2");
+ ASSERT_NOK(dbfull()->CompactRange(cro, &begin_key, &end_key));
+ ASSERT_NOK(dbfull()->CompactRange(cro, &begin_key, &end_key));
+
+ // Close before mock_env destruct.
+ Close();
+}
+
+// ManualCompactionBottomLevelOptimized tests the bottom-level manual
+// compaction optimization that skips recompacting files created by an
+// Ln-1 to Ln compaction.
+TEST_F(DBCompactionTest, ManualCompactionBottomLevelOptimized) {
+ Options opts = CurrentOptions();
+ opts.num_levels = 3;
+ opts.level0_file_num_compaction_trigger = 5;
+ opts.compression = kNoCompression;
+ opts.merge_operator.reset(new NoopMergeOperator());
+ opts.target_file_size_base = 1024;
+ opts.max_bytes_for_level_multiplier = 2;
+ opts.disable_auto_compactions = true;
+ DestroyAndReopen(opts);
+ ColumnFamilyHandleImpl* cfh =
+ static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
+ ColumnFamilyData* cfd = cfh->cfd();
+ InternalStats* internal_stats_ptr = cfd->internal_stats();
+ ASSERT_NE(internal_stats_ptr, nullptr);
+
+ Random rnd(301);
+ for (auto i = 0; i < 8; ++i) {
+ for (auto j = 0; j < 10; ++j) {
+ ASSERT_OK(
+ Put("foo" + std::to_string(i * 10 + j), RandomString(&rnd, 1024)));
+ }
+ Flush();
+ }
+
+ MoveFilesToLevel(2);
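+  // The "foo" files now form the bottommost level (L2); the "bar" flushes
+  // below stay in L0 since auto compactions are disabled.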
+
+ for (auto i = 0; i < 8; ++i) {
+ for (auto j = 0; j < 10; ++j) {
+ ASSERT_OK(
+ Put("bar" + std::to_string(i * 10 + j), RandomString(&rnd, 1024)));
+ }
+ Flush();
+ }
+ const std::vector<InternalStats::CompactionStats>& comp_stats =
+ internal_stats_ptr->TEST_GetCompactionStats();
+ int num = comp_stats[2].num_input_files_in_output_level;
+ ASSERT_EQ(num, 0);
+
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+ dbfull()->CompactRange(cro, nullptr, nullptr);
+
+ const std::vector<InternalStats::CompactionStats>& comp_stats2 =
+ internal_stats_ptr->TEST_GetCompactionStats();
+ num = comp_stats2[2].num_input_files_in_output_level;
+ ASSERT_EQ(num, 0);
+}
+
+TEST_F(DBCompactionTest, CompactionDuringShutdown) {
+ Options opts = CurrentOptions();
+ opts.level0_file_num_compaction_trigger = 2;
+ opts.disable_auto_compactions = true;
+ DestroyAndReopen(opts);
+ ColumnFamilyHandleImpl* cfh =
+ static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
+ ColumnFamilyData* cfd = cfh->cfd();
+ InternalStats* internal_stats_ptr = cfd->internal_stats();
+ ASSERT_NE(internal_stats_ptr, nullptr);
+
+ Random rnd(301);
+ for (auto i = 0; i < 2; ++i) {
+ for (auto j = 0; j < 10; ++j) {
+ ASSERT_OK(
+ Put("foo" + std::to_string(i * 10 + j), RandomString(&rnd, 1024)));
+ }
+ Flush();
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun",
+ [&](void* /*arg*/) { dbfull()->shutting_down_.store(true); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_OK(dbfull()->error_handler_.GetBGError());
+}
+
+// FixFileIngestionCompactionDeadlock tests and verifies that compaction and
+// file ingestion do not cause deadlock in the event of write stall triggered
+// by number of L0 files reaching level0_stop_writes_trigger.
+TEST_P(DBCompactionTestWithParam, FixFileIngestionCompactionDeadlock) {
+ const int kNumKeysPerFile = 100;
+ // Generate SST files.
+ Options options = CurrentOptions();
+
+ // Generate an external SST file containing a single key, i.e. 99
+ std::string sst_files_dir = dbname_ + "/sst_files/";
+ test::DestroyDir(env_, sst_files_dir);
+ ASSERT_OK(env_->CreateDir(sst_files_dir));
+ SstFileWriter sst_writer(EnvOptions(), options);
+ const std::string sst_file_path = sst_files_dir + "test.sst";
+ ASSERT_OK(sst_writer.Open(sst_file_path));
+ ASSERT_OK(sst_writer.Put(Key(kNumKeysPerFile - 1), "value"));
+ ASSERT_OK(sst_writer.Finish());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::IngestExternalFile:AfterIncIngestFileCounter",
+ "BackgroundCallCompaction:0"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ options.write_buffer_size = 110 << 10; // 110KB
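+  // Make the auto-compaction trigger coincide with the write-stop trigger so
+  // that filling L0 both stalls writes and schedules a compaction.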
+ options.level0_file_num_compaction_trigger =
+ options.level0_stop_writes_trigger;
+ options.max_subcompactions = max_subcompactions_;
+ options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ // Generate level0_stop_writes_trigger L0 files to trigger write stop
+ for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) {
+ for (int j = 0; j != kNumKeysPerFile; ++j) {
+ ASSERT_OK(Put(Key(j), RandomString(&rnd, 990)));
+ }
+ if (0 == i) {
+      // When we reach here, the memtable has kNumKeysPerFile keys. Note that
+      // flush is not yet triggered. We need to write an extra key so that the
+      // write path calls PreprocessWrite and the previous key-value pairs get
+      // flushed. After that, the newest key is in the memtable, along with a
+      // bunch of L0 files. Since there is already one key in the memtable,
+      // for i = 1, 2, ... we do not have to write this extra key to trigger
+      // flush.
+ ASSERT_OK(Put("", ""));
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(NumTableFilesAtLevel(0 /*level*/, 0 /*cf*/), i + 1);
+ }
+ // When we reach this point, there will be level0_stop_writes_trigger L0
+ // files and one extra key (99) in memory, which overlaps with the external
+ // SST file. Write stall triggers, and can be cleared only after compaction
+ // reduces the number of L0 files.
+
+ // Compaction will also be triggered since we have reached the threshold for
+ // auto compaction. Note that compaction may begin after the following file
+ // ingestion thread and waits for ingestion to finish.
+
+ // Thread to ingest file with overlapping key range with the current
+ // memtable. Consequently ingestion will trigger a flush. The flush MUST
+ // proceed without waiting for the write stall condition to clear, otherwise
+ // deadlock can happen.
+ port::Thread ingestion_thr([&]() {
+ IngestExternalFileOptions ifo;
+ Status s = db_->IngestExternalFile({sst_file_path}, ifo);
+ ASSERT_OK(s);
+ });
+
+  // Wait for ingestion to finish, then for compaction to clear the write
+  // stall.
+ ingestion_thr.join();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ Close();
+}
+
+TEST_F(DBCompactionTest, ConsistencyFailTest) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionBuilder::CheckConsistency", [&](void* arg) {
+ auto p =
+ reinterpret_cast<std::pair<FileMetaData**, FileMetaData**>*>(arg);
+          // Just swap the two FileMetaData entries so that we hit an error
+          // in the CheckConsistency function.
+ FileMetaData* temp = *(p->first);
+ *(p->first) = *(p->second);
+ *(p->second) = temp;
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int k = 0; k < 2; ++k) {
+ ASSERT_OK(Put("foo", "bar"));
+ Flush();
+ }
+
+ ASSERT_NOK(Put("foo", "bar"));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+void IngestOneKeyValue(DBImpl* db, const std::string& key,
+ const std::string& value, const Options& options) {
+ ExternalSstFileInfo info;
+ std::string f = test::PerThreadDBPath("sst_file" + key);
+ EnvOptions env;
+ ROCKSDB_NAMESPACE::SstFileWriter writer(env, options);
+ auto s = writer.Open(f);
+ ASSERT_OK(s);
+ ASSERT_OK(writer.Put(key, value));
+
+ ASSERT_OK(writer.Finish(&info));
+ IngestExternalFileOptions ingest_opt;
+
+ ASSERT_OK(db->IngestExternalFile({info.file_path}, ingest_opt));
+}
+
+TEST_P(DBCompactionTestWithParam,
+ FlushAfterIntraL0CompactionCheckConsistencyFail) {
+ Options options = CurrentOptions();
+ options.force_consistency_checks = true;
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 5;
+ options.max_background_compactions = 2;
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ const size_t kValueSize = 1 << 20;
+ Random rnd(301);
+ std::atomic<int> pick_intra_l0_count(0);
+ std::string value(RandomString(&rnd, kValueSize));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBCompactionTestWithParam::FlushAfterIntraL0:1",
+ "CompactionJob::Run():Start"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "FindIntraL0Compaction",
+ [&](void* /*arg*/) { pick_intra_l0_count.fetch_add(1); });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // prevents trivial move
+ for (int i = 0; i < 10; ++i) {
+ ASSERT_OK(Put(Key(i), "")); // prevents trivial move
+ }
+ ASSERT_OK(Flush());
+ Compact("", Key(99));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+  // Flush 5 L0 SSTs.
+ for (int i = 0; i < 5; ++i) {
+ ASSERT_OK(Put(Key(i + 1), value));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ(5, NumTableFilesAtLevel(0));
+
+  // Put one key so that the smallest log sequence number in this memtable is
+  // less than that of the SSTs ingested in the next step.
+ ASSERT_OK(Put(Key(0), "a"));
+
+ ASSERT_EQ(5, NumTableFilesAtLevel(0));
+
+  // Ingest 5 L0 SSTs. These files will trigger PickIntraL0Compaction.
+ for (int i = 5; i < 10; i++) {
+ IngestOneKeyValue(dbfull(), Key(i), value, options);
+ ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+ }
+
+ TEST_SYNC_POINT("DBCompactionTestWithParam::FlushAfterIntraL0:1");
+  // Put one key so that the biggest log sequence number in this memtable is
+  // bigger than that of the ingested SSTs.
+ ASSERT_OK(Put(Key(2), "b"));
+ ASSERT_EQ(10, NumTableFilesAtLevel(0));
+ dbfull()->TEST_WaitForCompact();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ std::vector<std::vector<FileMetaData>> level_to_files;
+ dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+ &level_to_files);
+ ASSERT_GT(level_to_files[0].size(), 0);
+ ASSERT_GT(pick_intra_l0_count.load(), 0);
+
+ ASSERT_OK(Flush());
+}
+
+TEST_P(DBCompactionTestWithParam,
+ IntraL0CompactionAfterFlushCheckConsistencyFail) {
+ Options options = CurrentOptions();
+ options.force_consistency_checks = true;
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 5;
+ options.max_background_compactions = 2;
+ options.max_subcompactions = max_subcompactions_;
+ options.write_buffer_size = 2 << 20;
+ options.max_write_buffer_number = 6;
+ DestroyAndReopen(options);
+
+ const size_t kValueSize = 1 << 20;
+ Random rnd(301);
+ std::string value(RandomString(&rnd, kValueSize));
+ std::string value2(RandomString(&rnd, kValueSize));
+ std::string bigvalue = value + value;
+
+ // prevents trivial move
+ for (int i = 0; i < 10; ++i) {
+ ASSERT_OK(Put(Key(i), "")); // prevents trivial move
+ }
+ ASSERT_OK(Flush());
+ Compact("", Key(99));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+ std::atomic<int> pick_intra_l0_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBCompactionTestWithParam::IntraL0CompactionAfterFlush:1",
+ "CompactionJob::Run():Start"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "FindIntraL0Compaction",
+ [&](void* /*arg*/) { pick_intra_l0_count.fetch_add(1); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  // Make 6 L0 SSTs.
+ for (int i = 0; i < 6; ++i) {
+ if (i % 2 == 0) {
+ IngestOneKeyValue(dbfull(), Key(i), value, options);
+ } else {
+ ASSERT_OK(Put(Key(i), value));
+ ASSERT_OK(Flush());
+ }
+ }
+
+ ASSERT_EQ(6, NumTableFilesAtLevel(0));
+
+  // Block the flush thread pool so the flush job cannot run.
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ test::SleepingBackgroundTask sleeping_tasks;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_tasks,
+ Env::Priority::HIGH);
+ sleeping_tasks.WaitUntilSleeping();
+
+  // Put large values so the memtables fill up and request flushes.
+ for (int i = 0; i < 6; ++i) {
+ ASSERT_OK(Put(Key(i), bigvalue));
+ }
+
+ ASSERT_EQ(6, NumTableFilesAtLevel(0));
+  // Ingest files to trigger IntraL0Compaction.
+ for (int i = 6; i < 10; ++i) {
+ ASSERT_EQ(i, NumTableFilesAtLevel(0));
+ IngestOneKeyValue(dbfull(), Key(i), value2, options);
+ }
+ ASSERT_EQ(10, NumTableFilesAtLevel(0));
+
+ // Wake up flush job
+ sleeping_tasks.WakeUp();
+ sleeping_tasks.WaitUntilDone();
+ TEST_SYNC_POINT("DBCompactionTestWithParam::IntraL0CompactionAfterFlush:1");
+ dbfull()->TEST_WaitForCompact();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ uint64_t error_count = 0;
+ db_->GetIntProperty("rocksdb.background-errors", &error_count);
+ ASSERT_EQ(error_count, 0);
+ ASSERT_GT(pick_intra_l0_count.load(), 0);
+ for (int i = 0; i < 6; ++i) {
+ ASSERT_EQ(bigvalue, Get(Key(i)));
+ }
+ for (int i = 6; i < 10; ++i) {
+ ASSERT_EQ(value2, Get(Key(i)));
+ }
+}
+
+#endif // !defined(ROCKSDB_LITE)
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+#else
+ (void) argc;
+ (void) argv;
+ return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_dynamic_level_test.cc b/src/rocksdb/db/db_dynamic_level_test.cc
new file mode 100644
index 000000000..c26657701
--- /dev/null
+++ b/src/rocksdb/db/db_dynamic_level_test.cc
@@ -0,0 +1,505 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// Introduction of SyncPoint effectively disabled building and running this
+// test in Release build, which is a pity, as it is a good test.
+#if !defined(ROCKSDB_LITE)
+
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+class DBTestDynamicLevel : public DBTestBase {
+ public:
+ DBTestDynamicLevel() : DBTestBase("/db_dynamic_level_test") {}
+};
+
+TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase) {
+ if (!Snappy_Supported() || !LZ4_Supported()) {
+ return;
+ }
+ // Use InMemoryEnv, or it would be too slow.
+ std::unique_ptr<Env> env(new MockEnv(env_));
+
+ const int kNKeys = 1000;
+ int keys[kNKeys];
+
+ auto verify_func = [&]() {
+ for (int i = 0; i < kNKeys; i++) {
+ ASSERT_NE("NOT_FOUND", Get(Key(i)));
+ ASSERT_NE("NOT_FOUND", Get(Key(kNKeys * 2 + i)));
+ if (i < kNKeys / 10) {
+ ASSERT_EQ("NOT_FOUND", Get(Key(kNKeys + keys[i])));
+ } else {
+ ASSERT_NE("NOT_FOUND", Get(Key(kNKeys + keys[i])));
+ }
+ }
+ };
+
+ Random rnd(301);
+ for (int ordered_insert = 0; ordered_insert <= 1; ordered_insert++) {
+ for (int i = 0; i < kNKeys; i++) {
+ keys[i] = i;
+ }
+ if (ordered_insert == 0) {
+ std::random_shuffle(std::begin(keys), std::end(keys));
+ }
+ for (int max_background_compactions = 1; max_background_compactions < 4;
+ max_background_compactions += 2) {
+ Options options;
+ options.env = env.get();
+ options.create_if_missing = true;
+ options.write_buffer_size = 2048;
+ options.max_write_buffer_number = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 2;
+ options.target_file_size_base = 2048;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.max_bytes_for_level_base = 10240;
+ options.max_bytes_for_level_multiplier = 4;
+ options.soft_rate_limit = 1.1;
+ options.max_background_compactions = max_background_compactions;
+ options.num_levels = 5;
+
+ options.compression_per_level.resize(3);
+ options.compression_per_level[0] = kNoCompression;
+ options.compression_per_level[1] = kLZ4Compression;
+ options.compression_per_level[2] = kSnappyCompression;
+ options.env = env_;
+
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < kNKeys; i++) {
+ int key = keys[i];
+ ASSERT_OK(Put(Key(kNKeys + key), RandomString(&rnd, 102)));
+ ASSERT_OK(Put(Key(key), RandomString(&rnd, 102)));
+ ASSERT_OK(Put(Key(kNKeys * 2 + key), RandomString(&rnd, 102)));
+ ASSERT_OK(Delete(Key(kNKeys + keys[i / 10])));
+ env_->SleepForMicroseconds(5000);
+ }
+
+ uint64_t int_prop;
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.background-errors", &int_prop));
+ ASSERT_EQ(0U, int_prop);
+
+ // Verify DB
+ for (int j = 0; j < 2; j++) {
+ verify_func();
+ if (j == 0) {
+ Reopen(options);
+ }
+ }
+
+ // Test compact range works
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ // All data should be in the last level.
+ ColumnFamilyMetaData cf_meta;
+ db_->GetColumnFamilyMetaData(&cf_meta);
+ ASSERT_EQ(5U, cf_meta.levels.size());
+ for (int i = 0; i < 4; i++) {
+ ASSERT_EQ(0U, cf_meta.levels[i].files.size());
+ }
+ ASSERT_GT(cf_meta.levels[4U].files.size(), 0U);
+ verify_func();
+
+ Close();
+ }
+ }
+
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+}
+
+// Test specific cases in dynamic max bytes
+TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase2) {
+ Random rnd(301);
+ int kMaxKey = 1000000;
+
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options.write_buffer_size = 20480;
+ options.max_write_buffer_number = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 9999;
+ options.level0_stop_writes_trigger = 9999;
+ options.target_file_size_base = 9102;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.max_bytes_for_level_base = 40960;
+ options.max_bytes_for_level_multiplier = 4;
+ options.max_background_compactions = 2;
+ options.num_levels = 5;
+ options.max_compaction_bytes = 0; // Force not expanding in compactions
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 1024;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "true"},
+ }));
+
+ uint64_t int_prop;
+ std::string str_prop;
+
+ // Initial base level is the last level
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(4U, int_prop);
+
+ // Put about 28K to L0
+ for (int i = 0; i < 70; i++) {
+ ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+ RandomString(&rnd, 380)));
+ }
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "false"},
+ }));
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(4U, int_prop);
+
+  // Insert about another 28K to L0. After it is compacted to L4, the base
+  // level should change to L3.
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "true"},
+ }));
+ for (int i = 0; i < 70; i++) {
+ ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+ RandomString(&rnd, 380)));
+ }
+
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "false"},
+ }));
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(3U, int_prop);
+ ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level1", &str_prop));
+ ASSERT_EQ("0", str_prop);
+ ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level2", &str_prop));
+ ASSERT_EQ("0", str_prop);
+
+ // Write even more data while leaving the base level at L3.
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "true"},
+ }));
+ // Write about 40K more
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+ RandomString(&rnd, 380)));
+ }
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "false"},
+ }));
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(3U, int_prop);
+
+ // Fill up L0, and then run an (auto) L0->Lmax compaction to raise the base
+ // level to 2.
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "true"},
+ }));
+ // Write about 650K more.
+ // Each file is about 11KB, with 9KB of data.
+ for (int i = 0; i < 1300; i++) {
+ ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+ RandomString(&rnd, 380)));
+ }
+
+ // Make sure that the compaction starts before the last bit of data is
+ // flushed, so that the base level isn't raised to L1.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"CompactionJob::Run():Start", "DynamicLevelMaxBytesBase2:0"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "false"},
+ }));
+
+ TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:0");
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(2U, int_prop);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ // Write more data until the base level changes to L1. There will be
+ // a manual compaction going on at the same time.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"CompactionJob::Run():Start", "DynamicLevelMaxBytesBase2:1"},
+ {"DynamicLevelMaxBytesBase2:2", "CompactionJob::Run():End"},
+ {"DynamicLevelMaxBytesBase2:compact_range_finish",
+ "FlushJob::WriteLevel0Table"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread thread([this] {
+ TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:compact_range_start");
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:compact_range_finish");
+ });
+
+ TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:1");
+ for (int i = 0; i < 2; i++) {
+ ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+ RandomString(&rnd, 380)));
+ }
+ TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:2");
+
+ Flush();
+
+ thread.join();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(1U, int_prop);
+}
+
+// Test specific cases in dynamic max bytes
+TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesCompactRange) {
+ Random rnd(301);
+ int kMaxKey = 1000000;
+
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.write_buffer_size = 2048;
+ options.max_write_buffer_number = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 9999;
+ options.level0_stop_writes_trigger = 9999;
+ options.target_file_size_base = 2;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.max_bytes_for_level_base = 10240;
+ options.max_bytes_for_level_multiplier = 4;
+ options.max_background_compactions = 1;
+ const int kNumLevels = 5;
+ options.num_levels = kNumLevels;
+ options.max_compaction_bytes = 1; // Force not expanding in compactions
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 1024;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+
+ // Compact against empty DB
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ uint64_t int_prop;
+ std::string str_prop;
+
+ // Initial base level is the last level
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(4U, int_prop);
+
+ // Put about 7K to L0
+ for (int i = 0; i < 140; i++) {
+ ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+ RandomString(&rnd, 80)));
+ }
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ if (NumTableFilesAtLevel(0) == 0) {
+ // Make sure level 0 is not empty
+ ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+ RandomString(&rnd, 80)));
+ Flush();
+ }
+
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(3U, int_prop);
+ ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level1", &str_prop));
+ ASSERT_EQ("0", str_prop);
+ ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level2", &str_prop));
+ ASSERT_EQ("0", str_prop);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ std::set<int> output_levels;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionPicker::CompactRange:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ output_levels.insert(compaction->output_level());
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(output_levels.size(), 2);
+ ASSERT_TRUE(output_levels.find(3) != output_levels.end());
+ ASSERT_TRUE(output_levels.find(4) != output_levels.end());
+ ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level0", &str_prop));
+ ASSERT_EQ("0", str_prop);
+ ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level3", &str_prop));
+ ASSERT_EQ("0", str_prop);
+ // Base level is still level 3.
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(3U, int_prop);
+}
+
+TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBaseInc) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.write_buffer_size = 2048;
+ options.max_write_buffer_number = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 2;
+ options.target_file_size_base = 2048;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.max_bytes_for_level_base = 10240;
+ options.max_bytes_for_level_multiplier = 4;
+ options.soft_rate_limit = 1.1;
+ options.max_background_compactions = 2;
+ options.num_levels = 5;
+ options.max_compaction_bytes = 100000000;
+
+ DestroyAndReopen(options);
+
+ int non_trivial = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ const int total_keys = 3000;
+ const int random_part_size = 100;
+ for (int i = 0; i < total_keys; i++) {
+ std::string value = RandomString(&rnd, random_part_size);
+ PutFixed32(&value, static_cast<uint32_t>(i));
+ ASSERT_OK(Put(Key(i), value));
+ }
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ ASSERT_EQ(non_trivial, 0);
+
+ for (int i = 0; i < total_keys; i++) {
+ std::string value = Get(Key(i));
+ ASSERT_EQ(DecodeFixed32(value.c_str() + random_part_size),
+ static_cast<uint32_t>(i));
+ }
+
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+}
+
+TEST_F(DBTestDynamicLevel, DISABLED_MigrateToDynamicLevelMaxBytesBase) {
+ Random rnd(301);
+ const int kMaxKey = 2000;
+
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = 2048;
+ options.max_write_buffer_number = 8;
+ options.level0_file_num_compaction_trigger = 4;
+ options.level0_slowdown_writes_trigger = 4;
+ options.level0_stop_writes_trigger = 8;
+ options.target_file_size_base = 2048;
+ options.level_compaction_dynamic_level_bytes = false;
+ options.max_bytes_for_level_base = 10240;
+ options.max_bytes_for_level_multiplier = 4;
+ options.soft_rate_limit = 1.1;
+ options.num_levels = 8;
+
+ DestroyAndReopen(options);
+
+ auto verify_func = [&](int num_keys, bool if_sleep) {
+ for (int i = 0; i < num_keys; i++) {
+ ASSERT_NE("NOT_FOUND", Get(Key(kMaxKey + i)));
+ if (i < num_keys / 10) {
+ ASSERT_EQ("NOT_FOUND", Get(Key(i)));
+ } else {
+ ASSERT_NE("NOT_FOUND", Get(Key(i)));
+ }
+ if (if_sleep && i % 1000 == 0) {
+ // Without it, valgrind may choose not to give another
+ // thread a chance to run before finishing the function,
+ // causing the test to be extremely slow.
+ env_->SleepForMicroseconds(1);
+ }
+ }
+ };
+
+ int total_keys = 1000;
+ for (int i = 0; i < total_keys; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 102)));
+ ASSERT_OK(Put(Key(kMaxKey + i), RandomString(&rnd, 102)));
+ ASSERT_OK(Delete(Key(i / 10)));
+ }
+ verify_func(total_keys, false);
+ dbfull()->TEST_WaitForCompact();
+
+ options.level_compaction_dynamic_level_bytes = true;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+ verify_func(total_keys, false);
+
+ std::atomic_bool compaction_finished;
+ compaction_finished = false;
+ // Issue manual compaction in one thread and still verify DB state
+ // in main thread.
+ ROCKSDB_NAMESPACE::port::Thread t([&]() {
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = options.num_levels - 1;
+ dbfull()->CompactRange(compact_options, nullptr, nullptr);
+ compaction_finished.store(true);
+ });
+ do {
+ verify_func(total_keys, true);
+ } while (!compaction_finished.load());
+ t.join();
+
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "false"},
+ }));
+
+ int total_keys2 = 2000;
+ for (int i = total_keys; i < total_keys2; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 102)));
+ ASSERT_OK(Put(Key(kMaxKey + i), RandomString(&rnd, 102)));
+ ASSERT_OK(Delete(Key(i / 10)));
+ }
+
+ verify_func(total_keys2, false);
+ dbfull()->TEST_WaitForCompact();
+ verify_func(total_keys2, false);
+
+ // Base level is not level 1
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+}
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !defined(ROCKSDB_LITE)
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+#else
+ (void) argc;
+ (void) argv;
+ return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_encryption_test.cc b/src/rocksdb/db/db_encryption_test.cc
new file mode 100644
index 000000000..b1f3ce23f
--- /dev/null
+++ b/src/rocksdb/db/db_encryption_test.cc
@@ -0,0 +1,122 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/perf_context.h"
+#if !defined(ROCKSDB_LITE)
+#include "test_util/sync_point.h"
+#endif
+#include <iostream>
+#include <string>
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBEncryptionTest : public DBTestBase {
+ public:
+ DBEncryptionTest() : DBTestBase("/db_encryption_test") {}
+};
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBEncryptionTest, CheckEncrypted) {
+ ASSERT_OK(Put("foo567", "v1.fetdq"));
+ ASSERT_OK(Put("bar123", "v2.dfgkjdfghsd"));
+ Close();
+
+ // Open all files and look for the values we've put in there.
+  // They should not be found if the env is encrypted; otherwise
+ // they should be found.
+ std::vector<std::string> fileNames;
+ auto status = env_->GetChildren(dbname_, &fileNames);
+ ASSERT_OK(status);
+
+ auto defaultEnv = Env::Default();
+ int hits = 0;
+ for (auto it = fileNames.begin() ; it != fileNames.end(); ++it) {
+ if ((*it == "..") || (*it == ".")) {
+ continue;
+ }
+ auto filePath = dbname_ + "/" + *it;
+ std::unique_ptr<SequentialFile> seqFile;
+ auto envOptions = EnvOptions(CurrentOptions());
+ status = defaultEnv->NewSequentialFile(filePath, &seqFile, envOptions);
+ ASSERT_OK(status);
+
+ uint64_t fileSize;
+ status = defaultEnv->GetFileSize(filePath, &fileSize);
+ ASSERT_OK(status);
+
+ std::string scratch;
+ scratch.reserve(fileSize);
+ Slice data;
+ status = seqFile->Read(fileSize, &data, (char*)scratch.data());
+ ASSERT_OK(status);
+
+ if (data.ToString().find("foo567") != std::string::npos) {
+ hits++;
+ //std::cout << "Hit in " << filePath << "\n";
+ }
+ if (data.ToString().find("v1.fetdq") != std::string::npos) {
+ hits++;
+ //std::cout << "Hit in " << filePath << "\n";
+ }
+ if (data.ToString().find("bar123") != std::string::npos) {
+ hits++;
+ //std::cout << "Hit in " << filePath << "\n";
+ }
+ if (data.ToString().find("v2.dfgkjdfghsd") != std::string::npos) {
+ hits++;
+ //std::cout << "Hit in " << filePath << "\n";
+ }
+ if (data.ToString().find("dfgk") != std::string::npos) {
+ hits++;
+ //std::cout << "Hit in " << filePath << "\n";
+ }
+ }
+ if (encrypted_env_) {
+ ASSERT_EQ(hits, 0);
+ } else {
+ ASSERT_GE(hits, 4);
+ }
+}
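+
+// A minimal sketch (not exercised by this test) of how a standalone program
+// could construct the encrypted Env that the check above is sensitive to.
+// DBTestBase sets encrypted_env_ from the test environment; the helper names
+// below (ROT13BlockCipher, CTREncryptionProvider, NewEncryptedEnv) follow the
+// rocksdb/env_encryption.h API of this version and may differ elsewhere:
+//
+//   #include "rocksdb/env_encryption.h"
+//
+//   ROT13BlockCipher cipher(/*block_size=*/32);
+//   CTREncryptionProvider provider(cipher);
+//   std::unique_ptr<Env> encrypted_env(
+//       NewEncryptedEnv(Env::Default(), &provider));
+//   Options options;                    // hypothetical Options of a real app
+//   options.env = encrypted_env.get();
+//
+// With such an env, the raw-file scan above should report zero plaintext hits.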
+
+TEST_F(DBEncryptionTest, ReadEmptyFile) {
+ auto defaultEnv = Env::Default();
+
+ // create empty file for reading it back in later
+ auto envOptions = EnvOptions(CurrentOptions());
+ auto filePath = dbname_ + "/empty.empty";
+
+ Status status;
+ {
+ std::unique_ptr<WritableFile> writableFile;
+ status = defaultEnv->NewWritableFile(filePath, &writableFile, envOptions);
+ ASSERT_OK(status);
+ }
+
+ std::unique_ptr<SequentialFile> seqFile;
+ status = defaultEnv->NewSequentialFile(filePath, &seqFile, envOptions);
+ ASSERT_OK(status);
+
+ std::string scratch;
+ Slice data;
+ // reading back 16 bytes from the empty file shouldn't trigger an assertion.
+ // it should just work and return an empty string
+ status = seqFile->Read(16, &data, (char*)scratch.data());
+ ASSERT_OK(status);
+
+ ASSERT_TRUE(data.empty());
+}
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_filesnapshot.cc b/src/rocksdb/db/db_filesnapshot.cc
new file mode 100644
index 000000000..f0f22cb47
--- /dev/null
+++ b/src/rocksdb/db/db_filesnapshot.cc
@@ -0,0 +1,177 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#ifndef ROCKSDB_LITE
+
+#include <stdint.h>
+#include <algorithm>
+#include <cinttypes>
+#include <string>
+#include "db/db_impl/db_impl.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status DBImpl::DisableFileDeletions() {
+ InstrumentedMutexLock l(&mutex_);
+ ++disable_delete_obsolete_files_;
+ if (disable_delete_obsolete_files_ == 1) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Disabled");
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "File Deletions Disabled, but already disabled. Counter: %d",
+ disable_delete_obsolete_files_);
+ }
+ return Status::OK();
+}
+
+Status DBImpl::EnableFileDeletions(bool force) {
+ // Job id == 0 means that this is not our background process, but rather
+  // a user thread
+ JobContext job_context(0);
+ bool file_deletion_enabled = false;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ if (force) {
+ // if force, we need to enable file deletions right away
+ disable_delete_obsolete_files_ = 0;
+ } else if (disable_delete_obsolete_files_ > 0) {
+ --disable_delete_obsolete_files_;
+ }
+ if (disable_delete_obsolete_files_ == 0) {
+ file_deletion_enabled = true;
+ FindObsoleteFiles(&job_context, true);
+ bg_cv_.SignalAll();
+ }
+ }
+ if (file_deletion_enabled) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Enabled");
+ if (job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(job_context);
+ }
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "File Deletions Enable requested, but still disabled. Counter: %d",
+ disable_delete_obsolete_files_);
+ }
+ job_context.Clean();
+ LogFlush(immutable_db_options_.info_log);
+ return Status::OK();
+}
+
+int DBImpl::IsFileDeletionsEnabled() const {
+ return !disable_delete_obsolete_files_;
+}
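+
+// Illustrative usage of the reference-counted enable/disable pair above,
+// assuming a hypothetical, already-open DB* db (not code used by DBImpl):
+//
+//   db->DisableFileDeletions();      // counter == 1, deletions off
+//   db->DisableFileDeletions();      // counter == 2, still off
+//   db->EnableFileDeletions(false);  // counter == 1, still off
+//   db->EnableFileDeletions(false);  // counter == 0, deletions resume
+//   db->EnableFileDeletions(true);   // force: resets the counter to 0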
+
+Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
+ uint64_t* manifest_file_size,
+ bool flush_memtable) {
+ *manifest_file_size = 0;
+
+ mutex_.Lock();
+
+ if (flush_memtable) {
+ // flush all dirty data to disk.
+ Status status;
+ if (immutable_db_options_.atomic_flush) {
+ autovector<ColumnFamilyData*> cfds;
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ mutex_.Unlock();
+ status = AtomicFlushMemTables(cfds, FlushOptions(),
+ FlushReason::kGetLiveFiles);
+ mutex_.Lock();
+ } else {
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ cfd->Ref();
+ mutex_.Unlock();
+ status = FlushMemTable(cfd, FlushOptions(), FlushReason::kGetLiveFiles);
+ TEST_SYNC_POINT("DBImpl::GetLiveFiles:1");
+ TEST_SYNC_POINT("DBImpl::GetLiveFiles:2");
+ mutex_.Lock();
+ cfd->UnrefAndTryDelete();
+ if (!status.ok()) {
+ break;
+ }
+ }
+ }
+ versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
+
+ if (!status.ok()) {
+ mutex_.Unlock();
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Cannot Flush data %s\n",
+ status.ToString().c_str());
+ return status;
+ }
+ }
+
+ // Make a set of all of the live *.sst files
+ std::vector<FileDescriptor> live;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ cfd->current()->AddLiveFiles(&live);
+ }
+
+ ret.clear();
+ ret.reserve(live.size() + 3); // *.sst + CURRENT + MANIFEST + OPTIONS
+
+  // Create names of the live files. The names are not absolute
+  // paths; instead they are relative to dbname_.
+ for (const auto& live_file : live) {
+ ret.push_back(MakeTableFileName("", live_file.GetNumber()));
+ }
+
+ ret.push_back(CurrentFileName(""));
+ ret.push_back(DescriptorFileName("", versions_->manifest_file_number()));
+ ret.push_back(OptionsFileName("", versions_->options_file_number()));
+
+ // find length of manifest file while holding the mutex lock
+ *manifest_file_size = versions_->manifest_file_size();
+
+ mutex_.Unlock();
+ return Status::OK();
+}
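+
+// A sketch of the typical backup flow built on GetLiveFiles(), assuming a
+// hypothetical DB* db, backup_dir string, and CopyFile() helper (none of
+// which are defined here). The returned names are relative to the DB
+// directory and start with '/':
+//
+//   std::vector<std::string> files;
+//   uint64_t manifest_size = 0;
+//   db->DisableFileDeletions();
+//   db->GetLiveFiles(files, &manifest_size, /*flush_memtable=*/true);
+//   for (const auto& f : files) {
+//     // Copy only manifest_size bytes of the MANIFEST; other files in full.
+//     CopyFile(db->GetName() + f, backup_dir + f);
+//   }
+//   db->EnableFileDeletions(false /*force*/);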
+
+Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
+ {
+ // If caller disabled deletions, this function should return files that are
+ // guaranteed not to be deleted until deletions are re-enabled. We need to
+ // wait for pending purges to finish since WalManager doesn't know which
+ // files are going to be purged. Additional purges won't be scheduled as
+ // long as deletions are disabled (so the below loop must terminate).
+ InstrumentedMutexLock l(&mutex_);
+ while (disable_delete_obsolete_files_ > 0 &&
+ pending_purge_obsolete_files_ > 0) {
+ bg_cv_.Wait();
+ }
+ }
+ return wal_manager_.GetSortedWalFiles(files);
+}
+
+Status DBImpl::GetCurrentWalFile(std::unique_ptr<LogFile>* current_log_file) {
+ uint64_t current_logfile_number;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ current_logfile_number = logfile_number_;
+ }
+
+ return wal_manager_.GetLiveWalFile(current_logfile_number, current_log_file);
+}
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_flush_test.cc b/src/rocksdb/db/db_flush_test.cc
new file mode 100644
index 000000000..bab206d3d
--- /dev/null
+++ b/src/rocksdb/db/db_flush_test.cc
@@ -0,0 +1,784 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <atomic>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "test_util/fault_injection_test_env.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBFlushTest : public DBTestBase {
+ public:
+ DBFlushTest() : DBTestBase("/db_flush_test") {}
+};
+
+class DBFlushDirectIOTest : public DBFlushTest,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ DBFlushDirectIOTest() : DBFlushTest() {}
+};
+
+class DBAtomicFlushTest : public DBFlushTest,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ DBAtomicFlushTest() : DBFlushTest() {}
+};
+
+// We had an issue where two background threads flushing at the same time
+// caused only one of them to get committed. This test verifies it is fixed.
+TEST_F(DBFlushTest, FlushWhileWritingManifest) {
+ Options options;
+ options.disable_auto_compactions = true;
+ options.max_background_flushes = 2;
+ options.env = env_;
+ Reopen(options);
+ FlushOptions no_wait;
+ no_wait.wait = false;
+  no_wait.allow_write_stall = true;
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"VersionSet::LogAndApply:WriteManifest",
+ "DBFlushTest::FlushWhileWritingManifest:1"},
+ {"MemTableList::TryInstallMemtableFlushResults:InProgress",
+ "VersionSet::LogAndApply:WriteManifestDone"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put("foo", "v"));
+ ASSERT_OK(dbfull()->Flush(no_wait));
+ TEST_SYNC_POINT("DBFlushTest::FlushWhileWritingManifest:1");
+ ASSERT_OK(Put("bar", "v"));
+ ASSERT_OK(dbfull()->Flush(no_wait));
+ // If the issue is hit we will wait here forever.
+ dbfull()->TEST_WaitForFlushMemTable();
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ(2, TotalTableFiles());
+#endif // ROCKSDB_LITE
+}
+
+// Disable this test temporarily on Travis as it fails intermittently.
+// Github issue: #4151
+TEST_F(DBFlushTest, SyncFail) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ Options options;
+ options.disable_auto_compactions = true;
+ options.env = fault_injection_env.get();
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBFlushTest::SyncFail:GetVersionRefCount:1",
+ "DBImpl::FlushMemTableToOutputFile:BeforePickMemtables"},
+ {"DBImpl::FlushMemTableToOutputFile:AfterPickMemtables",
+ "DBFlushTest::SyncFail:GetVersionRefCount:2"},
+ {"DBFlushTest::SyncFail:1", "DBImpl::SyncClosedLogs:Start"},
+ {"DBImpl::SyncClosedLogs:Failed", "DBFlushTest::SyncFail:2"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Put("key", "value");
+ auto* cfd =
+ reinterpret_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())
+ ->cfd();
+ FlushOptions flush_options;
+ flush_options.wait = false;
+ ASSERT_OK(dbfull()->Flush(flush_options));
+ // Flush installs a new super-version. Get the ref count after that.
+ auto current_before = cfd->current();
+ int refs_before = cfd->current()->TEST_refs();
+ TEST_SYNC_POINT("DBFlushTest::SyncFail:GetVersionRefCount:1");
+ TEST_SYNC_POINT("DBFlushTest::SyncFail:GetVersionRefCount:2");
+ int refs_after_picking_memtables = cfd->current()->TEST_refs();
+ ASSERT_EQ(refs_before + 1, refs_after_picking_memtables);
+ fault_injection_env->SetFilesystemActive(false);
+ TEST_SYNC_POINT("DBFlushTest::SyncFail:1");
+ TEST_SYNC_POINT("DBFlushTest::SyncFail:2");
+ fault_injection_env->SetFilesystemActive(true);
+ // Now the background job will do the flush; wait for it.
+ dbfull()->TEST_WaitForFlushMemTable();
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("", FilesPerLevel()); // flush failed.
+#endif // ROCKSDB_LITE
+  // Background flush job should release its ref count on the current version.
+ ASSERT_EQ(current_before, cfd->current());
+ ASSERT_EQ(refs_before, cfd->current()->TEST_refs());
+ Destroy(options);
+}
+
+TEST_F(DBFlushTest, SyncSkip) {
+ Options options = CurrentOptions();
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBFlushTest::SyncSkip:1", "DBImpl::SyncClosedLogs:Skip"},
+ {"DBImpl::SyncClosedLogs:Skip", "DBFlushTest::SyncSkip:2"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Reopen(options);
+ Put("key", "value");
+
+ FlushOptions flush_options;
+ flush_options.wait = false;
+ ASSERT_OK(dbfull()->Flush(flush_options));
+
+ TEST_SYNC_POINT("DBFlushTest::SyncSkip:1");
+ TEST_SYNC_POINT("DBFlushTest::SyncSkip:2");
+
+ // Now the background job will do the flush; wait for it.
+ dbfull()->TEST_WaitForFlushMemTable();
+
+ Destroy(options);
+}
+
+TEST_F(DBFlushTest, FlushInLowPriThreadPool) {
+ // Verify setting an empty high-pri (flush) thread pool causes flushes to be
+ // scheduled in the low-pri (compaction) thread pool.
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 4;
+ options.memtable_factory.reset(new SpecialSkipListFactory(1));
+ Reopen(options);
+ env_->SetBackgroundThreads(0, Env::HIGH);
+
+ std::thread::id tid;
+ int num_flushes = 0, num_compactions = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BGWorkFlush", [&](void* /*arg*/) {
+ if (tid == std::thread::id()) {
+ tid = std::this_thread::get_id();
+ } else {
+ ASSERT_EQ(tid, std::this_thread::get_id());
+ }
+ ++num_flushes;
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BGWorkCompaction", [&](void* /*arg*/) {
+ ASSERT_EQ(tid, std::this_thread::get_id());
+ ++num_compactions;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put("key", "val"));
+ for (int i = 0; i < 4; ++i) {
+ ASSERT_OK(Put("key", "val"));
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(4, num_flushes);
+ ASSERT_EQ(1, num_compactions);
+}
+
+TEST_F(DBFlushTest, ManualFlushWithMinWriteBufferNumberToMerge) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 3;
+ Reopen(options);
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BGWorkFlush",
+ "DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:1"},
+ {"DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:2",
+ "FlushJob::WriteLevel0Table"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put("key1", "value1"));
+
+ port::Thread t([&]() {
+    // The call waits for flush to finish, i.e. with flush_options.wait = true.
+ ASSERT_OK(Flush());
+ });
+
+ // Wait for flush start.
+ TEST_SYNC_POINT("DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:1");
+  // Insert a second memtable before the manual flush finishes.
+  // At the end of the manual flush job, it will check whether a further flush
+  // is needed, but it will not trigger a flush of the second memtable because
+  // min_write_buffer_number_to_merge has not been reached.
+ ASSERT_OK(Put("key2", "value2"));
+ ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+ TEST_SYNC_POINT("DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:2");
+
+ // Manual flush should return, without waiting for flush indefinitely.
+ t.join();
+}
+
+TEST_F(DBFlushTest, ScheduleOnlyOneBgThread) {
+ Options options = CurrentOptions();
+ Reopen(options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ int called = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::MaybeScheduleFlushOrCompaction:AfterSchedule:0", [&](void* arg) {
+ ASSERT_NE(nullptr, arg);
+ auto unscheduled_flushes = *reinterpret_cast<int*>(arg);
+ ASSERT_EQ(0, unscheduled_flushes);
+ ++called;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put("a", "foo"));
+ FlushOptions flush_opts;
+ ASSERT_OK(dbfull()->Flush(flush_opts));
+ ASSERT_EQ(1, called);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(DBFlushDirectIOTest, DirectIO) {
+ Options options;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.max_background_flushes = 2;
+ options.use_direct_io_for_flush_and_compaction = GetParam();
+ options.env = new MockEnv(Env::Default());
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:create_file", [&](void* arg) {
+ bool* use_direct_writes = static_cast<bool*>(arg);
+ ASSERT_EQ(*use_direct_writes,
+ options.use_direct_io_for_flush_and_compaction);
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+ Reopen(options);
+ ASSERT_OK(Put("foo", "v"));
+ FlushOptions flush_options;
+ flush_options.wait = true;
+ ASSERT_OK(dbfull()->Flush(flush_options));
+ Destroy(options);
+ delete options.env;
+}
+
+TEST_F(DBFlushTest, FlushError) {
+ Options options;
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ options.write_buffer_size = 100;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 3;
+ options.disable_auto_compactions = true;
+ options.env = fault_injection_env.get();
+ Reopen(options);
+
+ ASSERT_OK(Put("key1", "value1"));
+ ASSERT_OK(Put("key2", "value2"));
+ fault_injection_env->SetFilesystemActive(false);
+ Status s = dbfull()->TEST_SwitchMemtable();
+ fault_injection_env->SetFilesystemActive(true);
+ Destroy(options);
+ ASSERT_NE(s, Status::OK());
+}
+
+TEST_F(DBFlushTest, ManualFlushFailsInReadOnlyMode) {
+ // Regression test for bug where manual flush hangs forever when the DB
+ // is in read-only mode. Verify it now at least returns, despite failing.
+ Options options;
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ options.env = fault_injection_env.get();
+ options.max_write_buffer_number = 2;
+ Reopen(options);
+
+ // Trigger a first flush but don't let it run
+ ASSERT_OK(db_->PauseBackgroundWork());
+ ASSERT_OK(Put("key1", "value1"));
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ ASSERT_OK(db_->Flush(flush_opts));
+
+ // Write a key to the second memtable so we have something to flush later
+ // after the DB is in read-only mode.
+ ASSERT_OK(Put("key2", "value2"));
+
+ // Let the first flush continue, hit an error, and put the DB in read-only
+ // mode.
+ fault_injection_env->SetFilesystemActive(false);
+ ASSERT_OK(db_->ContinueBackgroundWork());
+ dbfull()->TEST_WaitForFlushMemTable();
+#ifndef ROCKSDB_LITE
+ uint64_t num_bg_errors;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBackgroundErrors,
+ &num_bg_errors));
+ ASSERT_GT(num_bg_errors, 0);
+#endif // ROCKSDB_LITE
+
+ // In the bug scenario, triggering another flush would cause the second flush
+ // to hang forever. After the fix we expect it to return an error.
+ ASSERT_NOK(db_->Flush(FlushOptions()));
+
+ Close();
+}
+
+TEST_F(DBFlushTest, CFDropRaceWithWaitForFlushMemTables) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:AfterScheduleFlush",
+ "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"},
+ {"DBFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree",
+ "DBImpl::BackgroundCallFlush:start"},
+ {"DBImpl::BackgroundCallFlush:start",
+ "DBImpl::FlushMemTable:BeforeWaitForBgFlush"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_EQ(2, handles_.size());
+ ASSERT_OK(Put(1, "key", "value"));
+ auto* cfd = static_cast<ColumnFamilyHandleImpl*>(handles_[1])->cfd();
+ port::Thread drop_cf_thr([&]() {
+ TEST_SYNC_POINT(
+ "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop");
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1]));
+ handles_.resize(1);
+ TEST_SYNC_POINT(
+ "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree");
+ });
+ FlushOptions flush_opts;
+ flush_opts.allow_write_stall = true;
+ ASSERT_NOK(dbfull()->TEST_FlushMemTable(cfd, flush_opts));
+ drop_cf_thr.join();
+ Close();
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBFlushTest, FireOnFlushCompletedAfterCommittedResult) {
+ class TestListener : public EventListener {
+ public:
+ void OnFlushCompleted(DB* db, const FlushJobInfo& info) override {
+ // There's only one key in each flush.
+ ASSERT_EQ(info.smallest_seqno, info.largest_seqno);
+ ASSERT_NE(0, info.smallest_seqno);
+ if (info.smallest_seqno == seq1) {
+ // First flush completed
+ ASSERT_FALSE(completed1);
+ completed1 = true;
+ CheckFlushResultCommitted(db, seq1);
+ } else {
+ // Second flush completed
+ ASSERT_FALSE(completed2);
+ completed2 = true;
+ ASSERT_EQ(info.smallest_seqno, seq2);
+ CheckFlushResultCommitted(db, seq2);
+ }
+ }
+
+ void CheckFlushResultCommitted(DB* db, SequenceNumber seq) {
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db);
+ InstrumentedMutex* mutex = db_impl->mutex();
+ mutex->Lock();
+ auto* cfd =
+ reinterpret_cast<ColumnFamilyHandleImpl*>(db->DefaultColumnFamily())
+ ->cfd();
+ ASSERT_LT(seq, cfd->imm()->current()->GetEarliestSequenceNumber());
+ mutex->Unlock();
+ }
+
+ std::atomic<SequenceNumber> seq1{0};
+ std::atomic<SequenceNumber> seq2{0};
+ std::atomic<bool> completed1{false};
+ std::atomic<bool> completed2{false};
+ };
+ std::shared_ptr<TestListener> listener = std::make_shared<TestListener>();
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BackgroundCallFlush:start",
+ "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitFirst"},
+ {"DBImpl::FlushMemTableToOutputFile:Finish",
+ "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitSecond"}});
+ SyncPoint::GetInstance()->SetCallBack(
+ "FlushJob::WriteLevel0Table", [&listener](void* arg) {
+        // Wait for the second flush to finish, outside the mutex.
+ auto* mems = reinterpret_cast<autovector<MemTable*>*>(arg);
+ if (mems->front()->GetEarliestSequenceNumber() == listener->seq1 - 1) {
+ TEST_SYNC_POINT(
+ "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:"
+ "WaitSecond");
+ }
+ });
+
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.listeners.push_back(listener);
+ // Setting max_flush_jobs = max_background_jobs / 4 = 2.
+ options.max_background_jobs = 8;
+ // Allow 2 immutable memtables.
+ options.max_write_buffer_number = 3;
+ Reopen(options);
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put("foo", "v"));
+ listener->seq1 = db_->GetLatestSequenceNumber();
+ // t1 will wait for the second flush complete before committing flush result.
+ auto t1 = port::Thread([&]() {
+ // flush_opts.wait = true
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ });
+ // Wait for first flush started.
+ TEST_SYNC_POINT(
+ "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitFirst");
+  // The second flush will exit early without committing its result. The work
+  // is delegated to the first flush.
+ ASSERT_OK(Put("bar", "v"));
+ listener->seq2 = db_->GetLatestSequenceNumber();
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ ASSERT_OK(db_->Flush(flush_opts));
+ t1.join();
+ ASSERT_TRUE(listener->completed1);
+ ASSERT_TRUE(listener->completed2);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif // !ROCKSDB_LITE
+
+TEST_P(DBAtomicFlushTest, ManualAtomicFlush) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = GetParam();
+ options.write_buffer_size = (static_cast<size_t>(64) << 20);
+
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(3, num_cfs);
+ WriteOptions wopts;
+ wopts.disableWAL = true;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ ASSERT_OK(Put(static_cast<int>(i) /*cf*/, "key", "value", wopts));
+ }
+ std::vector<int> cf_ids;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ cf_ids.emplace_back(static_cast<int>(i));
+ }
+ ASSERT_OK(Flush(cf_ids));
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+ ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+ ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty());
+ }
+}
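+
+// For reference, outside the test harness the same atomic flush of several
+// column families goes through the public multi-CF Flush overload; a minimal
+// sketch assuming a hypothetical DB* db opened with handles cf1 and cf2:
+//
+//   Options options;
+//   options.atomic_flush = true;  // set at open time
+//   // ...
+//   Status s = db->Flush(FlushOptions(), {cf1, cf2});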
+
+TEST_P(DBAtomicFlushTest, AtomicFlushTriggeredByMemTableFull) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = GetParam();
+ // 4KB so that we can easily trigger auto flush.
+ options.write_buffer_size = 4096;
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BackgroundCallFlush:FlushFinish:0",
+ "DBAtomicFlushTest::AtomicFlushTriggeredByMemTableFull:BeforeCheck"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(3, num_cfs);
+ WriteOptions wopts;
+ wopts.disableWAL = true;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ ASSERT_OK(Put(static_cast<int>(i) /*cf*/, "key", "value", wopts));
+ }
+  // Keep writing to one of the column families to trigger auto flush.
+ for (int i = 0; i != 4000; ++i) {
+ ASSERT_OK(Put(static_cast<int>(num_cfs) - 1 /*cf*/,
+ "key" + std::to_string(i), "value" + std::to_string(i),
+ wopts));
+ }
+
+ TEST_SYNC_POINT(
+ "DBAtomicFlushTest::AtomicFlushTriggeredByMemTableFull:BeforeCheck");
+ if (options.atomic_flush) {
+ for (size_t i = 0; i != num_cfs - 1; ++i) {
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+ ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+ ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty());
+ }
+ } else {
+ for (size_t i = 0; i != num_cfs - 1; ++i) {
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+ ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+ ASSERT_FALSE(cfh->cfd()->mem()->IsEmpty());
+ }
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBAtomicFlushTest, AtomicFlushRollbackSomeJobs) {
+ bool atomic_flush = GetParam();
+ if (!atomic_flush) {
+ return;
+ }
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = atomic_flush;
+ options.env = fault_injection_env.get();
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:1",
+ "DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:1"},
+ {"DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:2",
+ "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:2"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(3, num_cfs);
+ WriteOptions wopts;
+ wopts.disableWAL = true;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ int cf_id = static_cast<int>(i);
+ ASSERT_OK(Put(cf_id, "key", "value", wopts));
+ }
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ ASSERT_OK(dbfull()->Flush(flush_opts, handles_));
+ TEST_SYNC_POINT("DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:1");
+ fault_injection_env->SetFilesystemActive(false);
+ TEST_SYNC_POINT("DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:2");
+ for (auto* cfh : handles_) {
+ dbfull()->TEST_WaitForFlushMemTable(cfh);
+ }
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+ ASSERT_EQ(1, cfh->cfd()->imm()->NumNotFlushed());
+ ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty());
+ }
+ fault_injection_env->SetFilesystemActive(true);
+ Destroy(options);
+}
+
+TEST_P(DBAtomicFlushTest, FlushMultipleCFs_DropSomeBeforeRequestFlush) {
+ bool atomic_flush = GetParam();
+ if (!atomic_flush) {
+ return;
+ }
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = atomic_flush;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(3, num_cfs);
+ WriteOptions wopts;
+ wopts.disableWAL = true;
+ std::vector<int> cf_ids;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ int cf_id = static_cast<int>(i);
+ ASSERT_OK(Put(cf_id, "key", "value", wopts));
+ cf_ids.push_back(cf_id);
+ }
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ ASSERT_TRUE(Flush(cf_ids).IsColumnFamilyDropped());
+ Destroy(options);
+}
+
+TEST_P(DBAtomicFlushTest,
+ FlushMultipleCFs_DropSomeAfterScheduleFlushBeforeFlushJobRun) {
+ bool atomic_flush = GetParam();
+ if (!atomic_flush) {
+ return;
+ }
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = atomic_flush;
+
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::AtomicFlushMemTables:AfterScheduleFlush",
+ "DBAtomicFlushTest::BeforeDropCF"},
+ {"DBAtomicFlushTest::AfterDropCF",
+ "DBImpl::BackgroundCallFlush:start"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(3, num_cfs);
+ WriteOptions wopts;
+ wopts.disableWAL = true;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ int cf_id = static_cast<int>(i);
+ ASSERT_OK(Put(cf_id, "key", "value", wopts));
+ }
+ port::Thread user_thread([&]() {
+ TEST_SYNC_POINT("DBAtomicFlushTest::BeforeDropCF");
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ TEST_SYNC_POINT("DBAtomicFlushTest::AfterDropCF");
+ });
+ FlushOptions flush_opts;
+ flush_opts.wait = true;
+ ASSERT_OK(dbfull()->Flush(flush_opts, handles_));
+ user_thread.join();
+ for (size_t i = 0; i != num_cfs; ++i) {
+ int cf_id = static_cast<int>(i);
+ ASSERT_EQ("value", Get(cf_id, "key"));
+ }
+
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "eevee"}, options);
+ num_cfs = handles_.size();
+ ASSERT_EQ(2, num_cfs);
+ for (size_t i = 0; i != num_cfs; ++i) {
+ int cf_id = static_cast<int>(i);
+ ASSERT_EQ("value", Get(cf_id, "key"));
+ }
+ Destroy(options);
+}
+
+TEST_P(DBAtomicFlushTest, TriggerFlushAndClose) {
+ bool atomic_flush = GetParam();
+ if (!atomic_flush) {
+ return;
+ }
+ const int kNumKeysTriggerFlush = 4;
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = atomic_flush;
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(kNumKeysTriggerFlush));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ for (int i = 0; i != kNumKeysTriggerFlush; ++i) {
+ ASSERT_OK(Put(0, "key" + std::to_string(i), "value" + std::to_string(i)));
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put(0, "key", "value"));
+ Close();
+
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+ ASSERT_EQ("value", Get(0, "key"));
+}
+
+TEST_P(DBAtomicFlushTest, PickMemtablesRaceWithBackgroundFlush) {
+ bool atomic_flush = GetParam();
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = atomic_flush;
+ options.max_write_buffer_number = 4;
+ // Set min_write_buffer_number_to_merge to be greater than 1, so that
+ // a column family with one memtable in the imm will not cause IsFlushPending
+ // to return true when flush_requested_ is false.
+ options.min_write_buffer_number_to_merge = 2;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_EQ(2, handles_.size());
+ ASSERT_OK(dbfull()->PauseBackgroundWork());
+ ASSERT_OK(Put(0, "key00", "value00"));
+ ASSERT_OK(Put(1, "key10", "value10"));
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ ASSERT_OK(dbfull()->Flush(flush_opts, handles_));
+ ASSERT_OK(Put(0, "key01", "value01"));
+ // Since max_write_buffer_number is 4, the following flush won't cause write
+ // stall.
+ ASSERT_OK(dbfull()->Flush(flush_opts));
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1]));
+ handles_[1] = nullptr;
+ ASSERT_OK(dbfull()->ContinueBackgroundWork());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
+ delete handles_[0];
+ handles_.clear();
+}
+
+TEST_P(DBAtomicFlushTest, CFDropRaceWithWaitForFlushMemTables) {
+ bool atomic_flush = GetParam();
+ if (!atomic_flush) {
+ return;
+ }
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = atomic_flush;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::AtomicFlushMemTables:AfterScheduleFlush",
+ "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"},
+ {"DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree",
+ "DBImpl::BackgroundCallFlush:start"},
+ {"DBImpl::BackgroundCallFlush:start",
+ "DBImpl::AtomicFlushMemTables:BeforeWaitForBgFlush"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_EQ(2, handles_.size());
+ ASSERT_OK(Put(0, "key", "value"));
+ ASSERT_OK(Put(1, "key", "value"));
+ auto* cfd_default =
+ static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily())
+ ->cfd();
+ auto* cfd_pikachu = static_cast<ColumnFamilyHandleImpl*>(handles_[1])->cfd();
+ port::Thread drop_cf_thr([&]() {
+ TEST_SYNC_POINT(
+ "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop");
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ delete handles_[1];
+ handles_.resize(1);
+ TEST_SYNC_POINT(
+ "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree");
+ });
+ FlushOptions flush_opts;
+ flush_opts.allow_write_stall = true;
+ ASSERT_OK(dbfull()->TEST_AtomicFlushMemTables({cfd_default, cfd_pikachu},
+ flush_opts));
+ drop_cf_thr.join();
+ Close();
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBAtomicFlushTest, RollbackAfterFailToInstallResults) {
+ bool atomic_flush = GetParam();
+ if (!atomic_flush) {
+ return;
+ }
+ auto fault_injection_env = std::make_shared<FaultInjectionTestEnv>(env_);
+ Options options = CurrentOptions();
+ options.env = fault_injection_env.get();
+ options.create_if_missing = true;
+ options.atomic_flush = atomic_flush;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_EQ(2, handles_.size());
+ for (size_t cf = 0; cf < handles_.size(); ++cf) {
+ ASSERT_OK(Put(static_cast<int>(cf), "a", "value"));
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0",
+ [&](void* /*arg*/) { fault_injection_env->SetFilesystemActive(false); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ FlushOptions flush_opts;
+ Status s = db_->Flush(flush_opts, handles_);
+ ASSERT_NOK(s);
+ fault_injection_env->SetFilesystemActive(true);
+ Close();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+INSTANTIATE_TEST_CASE_P(DBFlushDirectIOTest, DBFlushDirectIOTest,
+ testing::Bool());
+
+INSTANTIATE_TEST_CASE_P(DBAtomicFlushTest, DBAtomicFlushTest, testing::Bool());
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_impl/db_impl.cc b/src/rocksdb/db/db_impl/db_impl.cc
new file mode 100644
index 000000000..d7880fc1a
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl.cc
@@ -0,0 +1,4550 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_impl/db_impl.h"
+
+#include <stdint.h>
+#ifdef OS_SOLARIS
+#include <alloca.h>
+#endif
+
+#include <algorithm>
+#include <cinttypes>
+#include <cstdio>
+#include <map>
+#include <set>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/builder.h"
+#include "db/compaction/compaction_job.h"
+#include "db/db_info_dumper.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "db/external_sst_file_ingestion_job.h"
+#include "db/flush_job.h"
+#include "db/forward_iterator.h"
+#include "db/import_column_family_job.h"
+#include "db/job_context.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/malloc_stats.h"
+#include "db/memtable.h"
+#include "db/memtable_list.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/table_cache.h"
+#include "db/table_properties_collector.h"
+#include "db/transaction_log_impl.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "db/write_callback.h"
+#include "env/composite_env_wrapper.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "file/sst_file_manager_impl.h"
+#include "logging/auto_roll_logger.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "memtable/hash_linklist_rep.h"
+#include "memtable/hash_skiplist_rep.h"
+#include "monitoring/in_memory_stats_history.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/persistent_stats_history.h"
+#include "monitoring/thread_status_updater.h"
+#include "monitoring/thread_status_util.h"
+#include "options/cf_options.h"
+#include "options/options_helper.h"
+#include "options/options_parser.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/stats_history.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/get_context.h"
+#include "table/merging_iterator.h"
+#include "table/multiget_context.h"
+#include "table/table_builder.h"
+#include "table/two_level_iterator.h"
+#include "test_util/sync_point.h"
+#include "tools/sst_dump_tool_imp.h"
+#include "util/autovector.h"
+#include "util/build_version.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/compression.h"
+#include "util/crc32c.h"
+#include "util/mutexlock.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const std::string kDefaultColumnFamilyName("default");
+const std::string kPersistentStatsColumnFamilyName(
+ "___rocksdb_stats_history___");
+void DumpRocksDBBuildVersion(Logger* log);
+
+CompressionType GetCompressionFlush(
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options) {
+ // Compressing memtable flushes might not help unless the sequential load
+ // optimization is used for leveled compaction. Otherwise the CPU and
+ // latency overhead is not offset by saving much space.
+ if (ioptions.compaction_style == kCompactionStyleUniversal) {
+ if (mutable_cf_options.compaction_options_universal
+ .compression_size_percent < 0) {
+ return mutable_cf_options.compression;
+ } else {
+ return kNoCompression;
+ }
+ } else if (!ioptions.compression_per_level.empty()) {
+ // For leveled compress when min_level_to_compress != 0.
+ return ioptions.compression_per_level[0];
+ } else {
+ return mutable_cf_options.compression;
+ }
+}
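+
+// Two illustrative configurations showing how the branches above resolve;
+// a sketch using hypothetical Options objects, not code used by DBImpl:
+//
+//   // (a) Leveled compaction with per-level compression: flush output uses
+//   //     compression_per_level[0], here kNoCompression.
+//   Options leveled;
+//   leveled.compression_per_level = {kNoCompression, kSnappyCompression};
+//
+//   // (b) Universal compaction: flush output keeps `compression` only while
+//   //     compression_size_percent is negative (its default, -1); otherwise
+//   //     flushes are written uncompressed.
+//   Options universal;
+//   universal.compaction_style = kCompactionStyleUniversal;
+//   universal.compression = kSnappyCompression;
+//   universal.compaction_options_universal.compression_size_percent = -1;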
+
+namespace {
+void DumpSupportInfo(Logger* logger) {
+ ROCKS_LOG_HEADER(logger, "Compression algorithms supported:");
+ for (auto& compression : OptionsHelper::compression_type_string_map) {
+ if (compression.second != kNoCompression &&
+ compression.second != kDisableCompressionOption) {
+ ROCKS_LOG_HEADER(logger, "\t%s supported: %d", compression.first.c_str(),
+ CompressionTypeSupported(compression.second));
+ }
+ }
+ ROCKS_LOG_HEADER(logger, "Fast CRC32 supported: %s",
+ crc32c::IsFastCrc32Supported().c_str());
+}
+} // namespace
+
+DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
+ const bool seq_per_batch, const bool batch_per_txn)
+ : dbname_(dbname),
+ own_info_log_(options.info_log == nullptr),
+ initial_db_options_(SanitizeOptions(dbname, options)),
+ env_(initial_db_options_.env),
+ fs_(initial_db_options_.file_system),
+ immutable_db_options_(initial_db_options_),
+ mutable_db_options_(initial_db_options_),
+ stats_(immutable_db_options_.statistics.get()),
+ mutex_(stats_, env_, DB_MUTEX_WAIT_MICROS,
+ immutable_db_options_.use_adaptive_mutex),
+ default_cf_handle_(nullptr),
+ max_total_in_memory_state_(0),
+ file_options_(BuildDBOptions(immutable_db_options_, mutable_db_options_)),
+ file_options_for_compaction_(fs_->OptimizeForCompactionTableWrite(
+ file_options_, immutable_db_options_)),
+ seq_per_batch_(seq_per_batch),
+ batch_per_txn_(batch_per_txn),
+ db_lock_(nullptr),
+ shutting_down_(false),
+ manual_compaction_paused_(false),
+ bg_cv_(&mutex_),
+ logfile_number_(0),
+ log_dir_synced_(false),
+ log_empty_(true),
+ persist_stats_cf_handle_(nullptr),
+ log_sync_cv_(&mutex_),
+ total_log_size_(0),
+ is_snapshot_supported_(true),
+ write_buffer_manager_(immutable_db_options_.write_buffer_manager.get()),
+ write_thread_(immutable_db_options_),
+ nonmem_write_thread_(immutable_db_options_),
+ write_controller_(mutable_db_options_.delayed_write_rate),
+ last_batch_group_size_(0),
+ unscheduled_flushes_(0),
+ unscheduled_compactions_(0),
+ bg_bottom_compaction_scheduled_(0),
+ bg_compaction_scheduled_(0),
+ num_running_compactions_(0),
+ bg_flush_scheduled_(0),
+ num_running_flushes_(0),
+ bg_purge_scheduled_(0),
+ disable_delete_obsolete_files_(0),
+ pending_purge_obsolete_files_(0),
+ delete_obsolete_files_last_run_(env_->NowMicros()),
+ last_stats_dump_time_microsec_(0),
+ next_job_id_(1),
+ has_unpersisted_data_(false),
+ unable_to_release_oldest_log_(false),
+ num_running_ingest_file_(0),
+#ifndef ROCKSDB_LITE
+ wal_manager_(immutable_db_options_, file_options_, seq_per_batch),
+#endif // ROCKSDB_LITE
+ event_logger_(immutable_db_options_.info_log.get()),
+ bg_work_paused_(0),
+ bg_compaction_paused_(0),
+ refitting_level_(false),
+ opened_successfully_(false),
+ two_write_queues_(options.two_write_queues),
+ manual_wal_flush_(options.manual_wal_flush),
+      // last_sequence_ is always maintained by the main queue that also writes
+      // to the memtable. When two_write_queues_ is disabled, last seq in the
+      // memtable is the same as the last seq published to the readers. When it
+      // is enabled but seq_per_batch_ is disabled, last seq in the memtable
+      // still indicates the last published seq since wal-only writes that go
+      // to the 2nd queue do not consume a sequence number. Otherwise writes
+      // performed by the 2nd queue could change what is visible to the
+      // readers. In that case, last_seq_same_as_publish_seq_ == false and the
+      // 2nd queue maintains a separate variable to indicate the last published
+      // sequence.
+ last_seq_same_as_publish_seq_(
+ !(seq_per_batch && options.two_write_queues)),
+ // Since seq_per_batch_ is currently set only by WritePreparedTxn which
+ // requires a custom gc for compaction, we use that to set use_custom_gc_
+ // as well.
+ use_custom_gc_(seq_per_batch),
+ shutdown_initiated_(false),
+ own_sfm_(options.sst_file_manager == nullptr),
+ preserve_deletes_(options.preserve_deletes),
+ closed_(false),
+ error_handler_(this, immutable_db_options_, &mutex_),
+ atomic_flush_install_cv_(&mutex_) {
+  // !batch_per_txn_ implies seq_per_batch_ because it is only unset for
+ // WriteUnprepared, which should use seq_per_batch_.
+ assert(batch_per_txn_ || seq_per_batch_);
+ env_->GetAbsolutePath(dbname, &db_absolute_path_);
+
+ // Reserve ten files or so for other uses and give the rest to TableCache.
+  // Use a large number for the setting of "infinite" open files.
+ const int table_cache_size = (mutable_db_options_.max_open_files == -1)
+ ? TableCache::kInfiniteCapacity
+ : mutable_db_options_.max_open_files - 10;
+ LRUCacheOptions co;
+ co.capacity = table_cache_size;
+ co.num_shard_bits = immutable_db_options_.table_cache_numshardbits;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ table_cache_ = NewLRUCache(co);
+
+ versions_.reset(new VersionSet(dbname_, &immutable_db_options_, file_options_,
+ table_cache_.get(), write_buffer_manager_,
+ &write_controller_, &block_cache_tracer_));
+ column_family_memtables_.reset(
+ new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet()));
+
+ DumpRocksDBBuildVersion(immutable_db_options_.info_log.get());
+ DumpDBFileSummary(immutable_db_options_, dbname_);
+ immutable_db_options_.Dump(immutable_db_options_.info_log.get());
+ mutable_db_options_.Dump(immutable_db_options_.info_log.get());
+ DumpSupportInfo(immutable_db_options_.info_log.get());
+
+  // Always open the DB with 0 here, which means that if preserve_deletes_==true
+  // we won't drop any deletion markers until SetPreserveDeletesSequenceNumber()
+  // is called by the client and this seqnum is advanced.
+ preserve_deletes_seqnum_.store(0);
+}
+
+Status DBImpl::Resume() {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Resuming DB");
+
+ InstrumentedMutexLock db_mutex(&mutex_);
+
+ if (!error_handler_.IsDBStopped() && !error_handler_.IsBGWorkStopped()) {
+ // Nothing to do
+ return Status::OK();
+ }
+
+ if (error_handler_.IsRecoveryInProgress()) {
+ // Don't allow a mix of manual and automatic recovery
+ return Status::Busy();
+ }
+
+ mutex_.Unlock();
+ Status s = error_handler_.RecoverFromBGError(true);
+ mutex_.Lock();
+ return s;
+}
+
+// This function implements the guts of recovery from a background error. It
+// is eventually called for both manual and automatic recovery. It does
+// the following:
+// 1. Wait for currently scheduled background flush/compaction to exit, in
+//    order to avoid inadvertently causing an error and concluding that
+//    recovery failed
+// 2. Flush memtables if there's any data for all the CFs. This may result in
+//    another error, which will be saved by error_handler_ and reported later
+// as the recovery status
+// 3. Find and delete any obsolete files
+// 4. Schedule compactions if needed for all the CFs. This is needed as the
+// flush in the prior step might have been a no-op for some CFs, which
+// means a new super version wouldn't have been installed
+Status DBImpl::ResumeImpl() {
+ mutex_.AssertHeld();
+ WaitForBackgroundWork();
+
+ Status bg_error = error_handler_.GetBGError();
+ Status s;
+ if (shutdown_initiated_) {
+ // Returning shutdown status to SFM during auto recovery will cause it
+ // to abort the recovery and allow the shutdown to progress
+ s = Status::ShutdownInProgress();
+ }
+ if (s.ok() && bg_error.severity() > Status::Severity::kHardError) {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "DB resume requested but failed due to Fatal/Unrecoverable error");
+ s = bg_error;
+ }
+
+ // We cannot guarantee consistency of the WAL. So force flush Memtables of
+ // all the column families
+ if (s.ok()) {
+ FlushOptions flush_opts;
+ // We allow flush to stall write since we are trying to resume from error.
+ flush_opts.allow_write_stall = true;
+ if (immutable_db_options_.atomic_flush) {
+ autovector<ColumnFamilyData*> cfds;
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ mutex_.Unlock();
+ s = AtomicFlushMemTables(cfds, flush_opts, FlushReason::kErrorRecovery);
+ mutex_.Lock();
+ } else {
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ cfd->Ref();
+ mutex_.Unlock();
+ s = FlushMemTable(cfd, flush_opts, FlushReason::kErrorRecovery);
+ mutex_.Lock();
+ cfd->UnrefAndTryDelete();
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ if (!s.ok()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "DB resume requested but failed due to Flush failure [%s]",
+ s.ToString().c_str());
+ }
+ }
+
+ JobContext job_context(0);
+ FindObsoleteFiles(&job_context, true);
+ if (s.ok()) {
+ s = error_handler_.ClearBGError();
+ }
+ mutex_.Unlock();
+
+ job_context.manifest_file_number = 1;
+ if (job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+
+ if (s.ok()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB");
+ }
+ mutex_.Lock();
+ // Check for shutdown again before scheduling further compactions,
+ // since we released and re-acquired the lock above
+ if (shutdown_initiated_) {
+ s = Status::ShutdownInProgress();
+ }
+ if (s.ok()) {
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ SchedulePendingCompaction(cfd);
+ }
+ MaybeScheduleFlushOrCompaction();
+ }
+
+ // Wake up any waiters - in this case, it could be the shutdown thread
+ bg_cv_.SignalAll();
+
+ // No need to check BGError again. If something happened, event listener would
+ // be notified and the operation causing it would have failed
+ return s;
+}
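+
+// A sketch of the caller-side pattern that this recovery path serves, for a
+// hypothetical application whose writes start failing after a background
+// error (e.g. out of disk space); the flow is illustrative, not prescriptive:
+//
+//   Status s = db->Put(WriteOptions(), "key", "value");
+//   if (!s.ok()) {
+//     // ... free disk space or otherwise fix the underlying cause first ...
+//     Status r = db->Resume();  // reaches ResumeImpl() via the error handler
+//     // r is non-OK if the error was fatal/unrecoverable or a shutdown is
+//     // already in progress.
+//   }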
+
+void DBImpl::WaitForBackgroundWork() {
+ // Wait for background work to finish
+ while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
+ bg_flush_scheduled_) {
+ bg_cv_.Wait();
+ }
+}
+
+// Will lock the mutex_, will wait for completion if wait is true
+void DBImpl::CancelAllBackgroundWork(bool wait) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Shutdown: canceling all background work");
+
+ if (thread_dump_stats_ != nullptr) {
+ thread_dump_stats_->cancel();
+ thread_dump_stats_.reset();
+ }
+ if (thread_persist_stats_ != nullptr) {
+ thread_persist_stats_->cancel();
+ thread_persist_stats_.reset();
+ }
+ InstrumentedMutexLock l(&mutex_);
+ if (!shutting_down_.load(std::memory_order_acquire) &&
+ has_unpersisted_data_.load(std::memory_order_relaxed) &&
+ !mutable_db_options_.avoid_flush_during_shutdown) {
+ if (immutable_db_options_.atomic_flush) {
+ autovector<ColumnFamilyData*> cfds;
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ mutex_.Unlock();
+ AtomicFlushMemTables(cfds, FlushOptions(), FlushReason::kShutDown);
+ mutex_.Lock();
+ } else {
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (!cfd->IsDropped() && cfd->initialized() && !cfd->mem()->IsEmpty()) {
+ cfd->Ref();
+ mutex_.Unlock();
+ FlushMemTable(cfd, FlushOptions(), FlushReason::kShutDown);
+ mutex_.Lock();
+ cfd->UnrefAndTryDelete();
+ }
+ }
+ }
+ versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
+ }
+
+ shutting_down_.store(true, std::memory_order_release);
+ bg_cv_.SignalAll();
+ if (!wait) {
+ return;
+ }
+ WaitForBackgroundWork();
+}
+
+Status DBImpl::CloseHelper() {
+ // Guarantee that there is no background error recovery in progress before
+ // continuing with the shutdown
+ mutex_.Lock();
+ shutdown_initiated_ = true;
+ error_handler_.CancelErrorRecovery();
+ while (error_handler_.IsRecoveryInProgress()) {
+ bg_cv_.Wait();
+ }
+ mutex_.Unlock();
+
+ // CancelAllBackgroundWork called with false means we just set the shutdown
+ // marker. After this we do a variant of the waiting and unschedule work
+ // (to consider: moving all the waiting into CancelAllBackgroundWork(true))
+ CancelAllBackgroundWork(false);
+ int bottom_compactions_unscheduled =
+ env_->UnSchedule(this, Env::Priority::BOTTOM);
+ int compactions_unscheduled = env_->UnSchedule(this, Env::Priority::LOW);
+ int flushes_unscheduled = env_->UnSchedule(this, Env::Priority::HIGH);
+ Status ret;
+ mutex_.Lock();
+ bg_bottom_compaction_scheduled_ -= bottom_compactions_unscheduled;
+ bg_compaction_scheduled_ -= compactions_unscheduled;
+ bg_flush_scheduled_ -= flushes_unscheduled;
+
+ // Wait for background work to finish
+ while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
+ bg_flush_scheduled_ || bg_purge_scheduled_ ||
+ pending_purge_obsolete_files_ ||
+ error_handler_.IsRecoveryInProgress()) {
+ TEST_SYNC_POINT("DBImpl::~DBImpl:WaitJob");
+ bg_cv_.Wait();
+ }
+ TEST_SYNC_POINT_CALLBACK("DBImpl::CloseHelper:PendingPurgeFinished",
+ &files_grabbed_for_purge_);
+ EraseThreadStatusDbInfo();
+ flush_scheduler_.Clear();
+ trim_history_scheduler_.Clear();
+
+ while (!flush_queue_.empty()) {
+ const FlushRequest& flush_req = PopFirstFromFlushQueue();
+ for (const auto& iter : flush_req) {
+ iter.first->UnrefAndTryDelete();
+ }
+ }
+ while (!compaction_queue_.empty()) {
+ auto cfd = PopFirstFromCompactionQueue();
+ cfd->UnrefAndTryDelete();
+ }
+
+ if (default_cf_handle_ != nullptr || persist_stats_cf_handle_ != nullptr) {
+    // We need to delete the handles outside of the lock because deletion does
+    // its own locking
+ mutex_.Unlock();
+ if (default_cf_handle_) {
+ delete default_cf_handle_;
+ default_cf_handle_ = nullptr;
+ }
+ if (persist_stats_cf_handle_) {
+ delete persist_stats_cf_handle_;
+ persist_stats_cf_handle_ = nullptr;
+ }
+ mutex_.Lock();
+ }
+
+ // Clean up obsolete files due to SuperVersion release.
+  // (1) Need to delete obsolete files before closing because RepairDB()
+  // scans all existing files in the file system and builds the manifest file.
+  // Keeping obsolete files confuses the repair process.
+  // (2) Need to check whether we Open()/Recover() the DB successfully before
+  // deleting, because if VersionSet recovery fails (possibly due to a
+  // corrupted manifest file), it is not able to identify live files correctly.
+  // As a result, all "live" files could get deleted by accident. However, a
+  // corrupted manifest is recoverable by RepairDB().
+ if (opened_successfully_) {
+ JobContext job_context(next_job_id_.fetch_add(1));
+ FindObsoleteFiles(&job_context, true);
+
+ mutex_.Unlock();
+ // manifest number starting from 2
+ job_context.manifest_file_number = 1;
+ if (job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+ mutex_.Lock();
+ }
+
+ for (auto l : logs_to_free_) {
+ delete l;
+ }
+ for (auto& log : logs_) {
+ uint64_t log_number = log.writer->get_log_number();
+ Status s = log.ClearWriter();
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(
+ immutable_db_options_.info_log,
+ "Unable to Sync WAL file %s with error -- %s",
+ LogFileName(immutable_db_options_.wal_dir, log_number).c_str(),
+ s.ToString().c_str());
+ // Retain the first error
+ if (ret.ok()) {
+ ret = s;
+ }
+ }
+ }
+ logs_.clear();
+
+  // Table cache may have table handles holding blocks from the block cache.
+  // We need to release them before the block cache is destroyed. The block
+  // cache may be destroyed inside versions_.reset(), when the column family
+  // data list is destroyed, so leaving handles in the table cache after
+  // versions_.reset() may cause issues. Here we clean all unreferenced
+  // handles in the table cache. At this point we assume all user queries have
+  // finished, so only the version set itself can possibly still hold blocks
+  // from the block cache. After releasing the unreferenced handles here, only
+  // the handles held by the version set remain, and they are released inside
+  // versions_.reset(). There, we need to make sure that every time a handle
+  // is released, it is also erased from the cache. By doing that, we can
+  // guarantee that after versions_.reset() the table cache is empty, so the
+  // block cache can be safely destroyed.
+ table_cache_->EraseUnRefEntries();
+
+ for (auto& txn_entry : recovered_transactions_) {
+ delete txn_entry.second;
+ }
+
+ // versions need to be destroyed before table_cache since it can hold
+ // references to table_cache.
+ versions_.reset();
+ mutex_.Unlock();
+ if (db_lock_ != nullptr) {
+ env_->UnlockFile(db_lock_);
+ }
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Shutdown complete");
+ LogFlush(immutable_db_options_.info_log);
+
+#ifndef ROCKSDB_LITE
+  // If the sst_file_manager was allocated by us during DB::Open(), call
+  // Close() on it before closing the info_log. Otherwise, a background thread
+  // in SstFileManagerImpl might try to log something.
+ if (immutable_db_options_.sst_file_manager && own_sfm_) {
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ sfm->Close();
+ }
+#endif // ROCKSDB_LITE
+
+ if (immutable_db_options_.info_log && own_info_log_) {
+ Status s = immutable_db_options_.info_log->Close();
+ if (ret.ok()) {
+ ret = s;
+ }
+ }
+
+ if (ret.IsAborted()) {
+    // IsAborted() is reserved for cases where the user did not release a
+    // certain resource and can release it, come back, and retry. Since that
+    // is not the case here, wrap the error into something else.
+ return Status::Incomplete(ret.ToString());
+ }
+ return ret;
+}
+
+Status DBImpl::CloseImpl() { return CloseHelper(); }
+
+DBImpl::~DBImpl() {
+ if (!closed_) {
+ closed_ = true;
+ CloseHelper();
+ }
+}
+
+void DBImpl::MaybeIgnoreError(Status* s) const {
+ if (s->ok() || immutable_db_options_.paranoid_checks) {
+ // No change needed
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log, "Ignoring error %s",
+ s->ToString().c_str());
+ *s = Status::OK();
+ }
+}
+
+const Status DBImpl::CreateArchivalDirectory() {
+ if (immutable_db_options_.wal_ttl_seconds > 0 ||
+ immutable_db_options_.wal_size_limit_mb > 0) {
+ std::string archivalPath = ArchivalDirectory(immutable_db_options_.wal_dir);
+ return env_->CreateDirIfMissing(archivalPath);
+ }
+ return Status::OK();
+}
+
+void DBImpl::PrintStatistics() {
+ auto dbstats = immutable_db_options_.statistics.get();
+ if (dbstats) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "STATISTICS:\n %s",
+ dbstats->ToString().c_str());
+ }
+}
+
+void DBImpl::StartTimedTasks() {
+ unsigned int stats_dump_period_sec = 0;
+ unsigned int stats_persist_period_sec = 0;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ stats_dump_period_sec = mutable_db_options_.stats_dump_period_sec;
+ if (stats_dump_period_sec > 0) {
+ if (!thread_dump_stats_) {
+ thread_dump_stats_.reset(new ROCKSDB_NAMESPACE::RepeatableThread(
+ [this]() { DBImpl::DumpStats(); }, "dump_st", env_,
+ static_cast<uint64_t>(stats_dump_period_sec) * kMicrosInSecond));
+ }
+ }
+ stats_persist_period_sec = mutable_db_options_.stats_persist_period_sec;
+ if (stats_persist_period_sec > 0) {
+ if (!thread_persist_stats_) {
+ thread_persist_stats_.reset(new ROCKSDB_NAMESPACE::RepeatableThread(
+ [this]() { DBImpl::PersistStats(); }, "pst_st", env_,
+ static_cast<uint64_t>(stats_persist_period_sec) * kMicrosInSecond));
+ }
+ }
+ }
+}
+
+// Estimate the total size of stats_history_
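+// The estimate samples the first (oldest) slice and assumes every slice has
+// the same set of stat names, so this is an approximation rather than an
+// exact byte count.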
+size_t DBImpl::EstimateInMemoryStatsHistorySize() const {
+ size_t size_total =
+ sizeof(std::map<uint64_t, std::map<std::string, uint64_t>>);
+ if (stats_history_.size() == 0) return size_total;
+ size_t size_per_slice =
+ sizeof(uint64_t) + sizeof(std::map<std::string, uint64_t>);
+ // non-empty map, stats_history_.begin() guaranteed to exist
+ std::map<std::string, uint64_t> sample_slice(stats_history_.begin()->second);
+ for (const auto& pairs : sample_slice) {
+ size_per_slice +=
+ pairs.first.capacity() + sizeof(pairs.first) + sizeof(pairs.second);
+ }
+ size_total = size_per_slice * stats_history_.size();
+ return size_total;
+}
+
+void DBImpl::PersistStats() {
+ TEST_SYNC_POINT("DBImpl::PersistStats:Entry");
+#ifndef ROCKSDB_LITE
+ if (shutdown_initiated_) {
+ return;
+ }
+ uint64_t now_seconds = env_->NowMicros() / kMicrosInSecond;
+ Statistics* statistics = immutable_db_options_.statistics.get();
+ if (!statistics) {
+ return;
+ }
+ size_t stats_history_size_limit = 0;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ stats_history_size_limit = mutable_db_options_.stats_history_buffer_size;
+ }
+
+ std::map<std::string, uint64_t> stats_map;
+ if (!statistics->getTickerMap(&stats_map)) {
+ return;
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "------- PERSISTING STATS -------");
+
+ if (immutable_db_options_.persist_stats_to_disk) {
+ WriteBatch batch;
+ if (stats_slice_initialized_) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Reading %" ROCKSDB_PRIszt " stats from statistics\n",
+ stats_slice_.size());
+ for (const auto& stat : stats_map) {
+ char key[100];
+ int length =
+ EncodePersistentStatsKey(now_seconds, stat.first, 100, key);
+ // calculate the delta from last time
+ if (stats_slice_.find(stat.first) != stats_slice_.end()) {
+ uint64_t delta = stat.second - stats_slice_[stat.first];
+ batch.Put(persist_stats_cf_handle_, Slice(key, std::min(100, length)),
+ ToString(delta));
+ }
+ }
+ }
+ stats_slice_initialized_ = true;
+ std::swap(stats_slice_, stats_map);
+ WriteOptions wo;
+ wo.low_pri = true;
+ wo.no_slowdown = true;
+ wo.sync = false;
+ Status s = Write(wo, &batch);
+ if (!s.ok()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Writing to persistent stats CF failed -- %s",
+ s.ToString().c_str());
+ } else {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Writing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64
+ " to persistent stats CF succeeded",
+ stats_slice_.size(), now_seconds);
+ }
+ // TODO(Zhongyi): add purging for persisted data
+ } else {
+ InstrumentedMutexLock l(&stats_history_mutex_);
+ // calculate the delta from last time
+ if (stats_slice_initialized_) {
+ std::map<std::string, uint64_t> stats_delta;
+ for (const auto& stat : stats_map) {
+ if (stats_slice_.find(stat.first) != stats_slice_.end()) {
+ stats_delta[stat.first] = stat.second - stats_slice_[stat.first];
+ }
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Storing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64
+ " to in-memory stats history",
+ stats_slice_.size(), now_seconds);
+ stats_history_[now_seconds] = stats_delta;
+ }
+ stats_slice_initialized_ = true;
+ std::swap(stats_slice_, stats_map);
+ TEST_SYNC_POINT("DBImpl::PersistStats:StatsCopied");
+
+ // delete older stats snapshots to control memory consumption
+ size_t stats_history_size = EstimateInMemoryStatsHistorySize();
+ bool purge_needed = stats_history_size > stats_history_size_limit;
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[Pre-GC] In-memory stats history size: %" ROCKSDB_PRIszt
+ " bytes, slice count: %" ROCKSDB_PRIszt,
+ stats_history_size, stats_history_.size());
+ while (purge_needed && !stats_history_.empty()) {
+ stats_history_.erase(stats_history_.begin());
+ purge_needed =
+ EstimateInMemoryStatsHistorySize() > stats_history_size_limit;
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[Post-GC] In-memory stats history size: %" ROCKSDB_PRIszt
+ " bytes, slice count: %" ROCKSDB_PRIszt,
+ stats_history_size, stats_history_.size());
+ }
+#endif // !ROCKSDB_LITE
+}
+
+bool DBImpl::FindStatsByTime(uint64_t start_time, uint64_t end_time,
+ uint64_t* new_time,
+ std::map<std::string, uint64_t>* stats_map) {
+ assert(new_time);
+ assert(stats_map);
+ if (!new_time || !stats_map) return false;
+  // lock while searching for start_time
+ {
+ InstrumentedMutexLock l(&stats_history_mutex_);
+ auto it = stats_history_.lower_bound(start_time);
+ if (it != stats_history_.end() && it->first < end_time) {
+ // make a copy for timestamp and stats_map
+ *new_time = it->first;
+ *stats_map = it->second;
+ return true;
+ } else {
+ return false;
+ }
+ }
+}
+
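+// Illustrative use of the iterator returned by GetStatsHistory() (a sketch
+// only, not part of this file; it assumes the StatsHistoryIterator interface
+// declared in include/rocksdb/stats_history.h):
+//
+//   std::unique_ptr<StatsHistoryIterator> it;
+//   Status s = db->GetStatsHistory(start_time, end_time, &it);
+//   for (; s.ok() && it->Valid(); it->Next()) {
+//     uint64_t time = it->GetStatsTime();
+//     const std::map<std::string, uint64_t>& stats = it->GetStatsMap();
+//     // ... consume (time, stats) ...
+//   }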
+Status DBImpl::GetStatsHistory(
+ uint64_t start_time, uint64_t end_time,
+ std::unique_ptr<StatsHistoryIterator>* stats_iterator) {
+ if (!stats_iterator) {
+ return Status::InvalidArgument("stats_iterator not preallocated.");
+ }
+ if (immutable_db_options_.persist_stats_to_disk) {
+ stats_iterator->reset(
+ new PersistentStatsHistoryIterator(start_time, end_time, this));
+ } else {
+ stats_iterator->reset(
+ new InMemoryStatsHistoryIterator(start_time, end_time, this));
+ }
+ return (*stats_iterator)->status();
+}
+
+void DBImpl::DumpStats() {
+ TEST_SYNC_POINT("DBImpl::DumpStats:1");
+#ifndef ROCKSDB_LITE
+ const DBPropertyInfo* cf_property_info =
+ GetPropertyInfo(DB::Properties::kCFStats);
+ assert(cf_property_info != nullptr);
+ const DBPropertyInfo* db_property_info =
+ GetPropertyInfo(DB::Properties::kDBStats);
+ assert(db_property_info != nullptr);
+
+ std::string stats;
+ if (shutdown_initiated_) {
+ return;
+ }
+ {
+ InstrumentedMutexLock l(&mutex_);
+ default_cf_internal_stats_->GetStringProperty(
+ *db_property_info, DB::Properties::kDBStats, &stats);
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->initialized()) {
+ cfd->internal_stats()->GetStringProperty(
+ *cf_property_info, DB::Properties::kCFStatsNoFileHistogram, &stats);
+ }
+ }
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->initialized()) {
+ cfd->internal_stats()->GetStringProperty(
+ *cf_property_info, DB::Properties::kCFFileHistogram, &stats);
+ }
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::DumpStats:2");
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "------- DUMPING STATS -------");
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", stats.c_str());
+ if (immutable_db_options_.dump_malloc_stats) {
+ stats.clear();
+ DumpMallocStats(&stats);
+ if (!stats.empty()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "------- Malloc STATS -------");
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", stats.c_str());
+ }
+ }
+#endif // !ROCKSDB_LITE
+
+ PrintStatistics();
+}
+
+Status DBImpl::TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family,
+ int max_entries_to_print,
+ std::string* out_str) {
+ auto* cfh =
+ static_cast_with_check<ColumnFamilyHandleImpl, ColumnFamilyHandle>(
+ column_family);
+ ColumnFamilyData* cfd = cfh->cfd();
+
+ SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+ Version* version = super_version->current;
+
+ Status s =
+ version->TablesRangeTombstoneSummary(max_entries_to_print, out_str);
+
+ CleanupSuperVersion(super_version);
+ return s;
+}
+
+void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) {
+ if (!job_context->logs_to_free.empty()) {
+ for (auto l : job_context->logs_to_free) {
+ AddToLogsToFreeQueue(l);
+ }
+ job_context->logs_to_free.clear();
+ }
+}
+
+Directory* DBImpl::GetDataDir(ColumnFamilyData* cfd, size_t path_id) const {
+ assert(cfd);
+ Directory* ret_dir = cfd->GetDataDir(path_id);
+ if (ret_dir == nullptr) {
+ return directories_.GetDataDir(path_id);
+ }
+ return ret_dir;
+}
+
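+// Illustrative call pattern for the two dynamic-option entry points below
+// (a sketch only; the option names are just examples of mutable options):
+//
+//   db->SetOptions(cf_handle, {{"write_buffer_size", "131072"}});
+//   db->SetDBOptions({{"max_background_jobs", "8"}});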
+Status DBImpl::SetOptions(
+ ColumnFamilyHandle* column_family,
+ const std::unordered_map<std::string, std::string>& options_map) {
+#ifdef ROCKSDB_LITE
+ (void)column_family;
+ (void)options_map;
+ return Status::NotSupported("Not supported in ROCKSDB LITE");
+#else
+ auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+ if (options_map.empty()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "SetOptions() on column family [%s], empty input",
+ cfd->GetName().c_str());
+ return Status::InvalidArgument("empty input");
+ }
+
+ MutableCFOptions new_options;
+ Status s;
+ Status persist_options_status;
+ SuperVersionContext sv_context(/* create_superversion */ true);
+ {
+ auto db_options = GetDBOptions();
+ InstrumentedMutexLock l(&mutex_);
+ s = cfd->SetOptions(db_options, options_map);
+ if (s.ok()) {
+ new_options = *cfd->GetLatestMutableCFOptions();
+ // Append new version to recompute compaction score.
+ VersionEdit dummy_edit;
+ versions_->LogAndApply(cfd, new_options, &dummy_edit, &mutex_,
+ directories_.GetDbDir());
+ // Trigger possible flush/compactions. This has to be before we persist
+ // options to file, otherwise there will be a deadlock with writer
+ // thread.
+ InstallSuperVersionAndScheduleWork(cfd, &sv_context, new_options);
+
+ persist_options_status = WriteOptionsFile(
+ false /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+ bg_cv_.SignalAll();
+ }
+ }
+ sv_context.Clean();
+
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "SetOptions() on column family [%s], inputs:", cfd->GetName().c_str());
+ for (const auto& o : options_map) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(),
+ o.second.c_str());
+ }
+ if (s.ok()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[%s] SetOptions() succeeded", cfd->GetName().c_str());
+ new_options.Dump(immutable_db_options_.info_log.get());
+ if (!persist_options_status.ok()) {
+ s = persist_options_status;
+ }
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log, "[%s] SetOptions() failed",
+ cfd->GetName().c_str());
+ }
+ LogFlush(immutable_db_options_.info_log);
+ return s;
+#endif // ROCKSDB_LITE
+}
+
+Status DBImpl::SetDBOptions(
+ const std::unordered_map<std::string, std::string>& options_map) {
+#ifdef ROCKSDB_LITE
+ (void)options_map;
+ return Status::NotSupported("Not supported in ROCKSDB LITE");
+#else
+ if (options_map.empty()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "SetDBOptions(), empty input.");
+ return Status::InvalidArgument("empty input");
+ }
+
+ MutableDBOptions new_options;
+ Status s;
+ Status persist_options_status;
+ bool wal_changed = false;
+ WriteContext write_context;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ s = GetMutableDBOptionsFromStrings(mutable_db_options_, options_map,
+ &new_options);
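+    // Note: a bytes_per_sync of 0 in the new options is overridden to 1MB
+    // here rather than being applied as-is.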
+ if (new_options.bytes_per_sync == 0) {
+ new_options.bytes_per_sync = 1024 * 1024;
+ }
+ DBOptions new_db_options =
+ BuildDBOptions(immutable_db_options_, new_options);
+ if (s.ok()) {
+ s = ValidateOptions(new_db_options);
+ }
+ if (s.ok()) {
+ for (auto c : *versions_->GetColumnFamilySet()) {
+ if (!c->IsDropped()) {
+ auto cf_options = c->GetLatestCFOptions();
+ s = ColumnFamilyData::ValidateOptions(new_db_options, cf_options);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ }
+ if (s.ok()) {
+ const BGJobLimits current_bg_job_limits =
+ GetBGJobLimits(immutable_db_options_.max_background_flushes,
+ mutable_db_options_.max_background_compactions,
+ mutable_db_options_.max_background_jobs,
+ /* parallelize_compactions */ true);
+ const BGJobLimits new_bg_job_limits = GetBGJobLimits(
+ immutable_db_options_.max_background_flushes,
+ new_options.max_background_compactions,
+ new_options.max_background_jobs, /* parallelize_compactions */ true);
+
+ const bool max_flushes_increased =
+ new_bg_job_limits.max_flushes > current_bg_job_limits.max_flushes;
+ const bool max_compactions_increased =
+ new_bg_job_limits.max_compactions >
+ current_bg_job_limits.max_compactions;
+
+ if (max_flushes_increased || max_compactions_increased) {
+ if (max_flushes_increased) {
+ env_->IncBackgroundThreadsIfNeeded(new_bg_job_limits.max_flushes,
+ Env::Priority::HIGH);
+ }
+
+ if (max_compactions_increased) {
+ env_->IncBackgroundThreadsIfNeeded(new_bg_job_limits.max_compactions,
+ Env::Priority::LOW);
+ }
+
+ MaybeScheduleFlushOrCompaction();
+ }
+
+ if (new_options.stats_dump_period_sec !=
+ mutable_db_options_.stats_dump_period_sec) {
+ if (thread_dump_stats_) {
+ mutex_.Unlock();
+ thread_dump_stats_->cancel();
+ mutex_.Lock();
+ }
+ if (new_options.stats_dump_period_sec > 0) {
+ thread_dump_stats_.reset(new ROCKSDB_NAMESPACE::RepeatableThread(
+ [this]() { DBImpl::DumpStats(); }, "dump_st", env_,
+ static_cast<uint64_t>(new_options.stats_dump_period_sec) *
+ kMicrosInSecond));
+ } else {
+ thread_dump_stats_.reset();
+ }
+ }
+ if (new_options.stats_persist_period_sec !=
+ mutable_db_options_.stats_persist_period_sec) {
+ if (thread_persist_stats_) {
+ mutex_.Unlock();
+ thread_persist_stats_->cancel();
+ mutex_.Lock();
+ }
+ if (new_options.stats_persist_period_sec > 0) {
+ thread_persist_stats_.reset(new ROCKSDB_NAMESPACE::RepeatableThread(
+ [this]() { DBImpl::PersistStats(); }, "pst_st", env_,
+ static_cast<uint64_t>(new_options.stats_persist_period_sec) *
+ kMicrosInSecond));
+ } else {
+ thread_persist_stats_.reset();
+ }
+ }
+ write_controller_.set_max_delayed_write_rate(
+ new_options.delayed_write_rate);
+ table_cache_.get()->SetCapacity(new_options.max_open_files == -1
+ ? TableCache::kInfiniteCapacity
+ : new_options.max_open_files - 10);
+ wal_changed = mutable_db_options_.wal_bytes_per_sync !=
+ new_options.wal_bytes_per_sync;
+ mutable_db_options_ = new_options;
+ file_options_for_compaction_ = FileOptions(new_db_options);
+ file_options_for_compaction_ = fs_->OptimizeForCompactionTableWrite(
+ file_options_for_compaction_, immutable_db_options_);
+ versions_->ChangeFileOptions(mutable_db_options_);
+      // TODO(xiez): clarify why optimize-for-read is applied to write options
+ file_options_for_compaction_ = fs_->OptimizeForCompactionTableRead(
+ file_options_for_compaction_, immutable_db_options_);
+ file_options_for_compaction_.compaction_readahead_size =
+ mutable_db_options_.compaction_readahead_size;
+ WriteThread::Writer w;
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ if (total_log_size_ > GetMaxTotalWalSize() || wal_changed) {
+ Status purge_wal_status = SwitchWAL(&write_context);
+ if (!purge_wal_status.ok()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "Unable to purge WAL files in SetDBOptions() -- %s",
+ purge_wal_status.ToString().c_str());
+ }
+ }
+ persist_options_status = WriteOptionsFile(
+ false /*need_mutex_lock*/, false /*need_enter_write_thread*/);
+ write_thread_.ExitUnbatched(&w);
+ }
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions(), inputs:");
+ for (const auto& o : options_map) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(),
+ o.second.c_str());
+ }
+ if (s.ok()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions() succeeded");
+ new_options.Dump(immutable_db_options_.info_log.get());
+ if (!persist_options_status.ok()) {
+ if (immutable_db_options_.fail_if_options_file_error) {
+ s = Status::IOError(
+ "SetDBOptions() succeeded, but unable to persist options",
+ persist_options_status.ToString());
+ }
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "Unable to persist options in SetDBOptions() -- %s",
+ persist_options_status.ToString().c_str());
+ }
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log, "SetDBOptions failed");
+ }
+ LogFlush(immutable_db_options_.info_log);
+ return s;
+#endif // ROCKSDB_LITE
+}
+
+// return the same level if it cannot be moved
+int DBImpl::FindMinimumEmptyLevelFitting(
+ ColumnFamilyData* cfd, const MutableCFOptions& /*mutable_cf_options*/,
+ int level) {
+ mutex_.AssertHeld();
+ const auto* vstorage = cfd->current()->storage_info();
+ int minimum_level = level;
+ for (int i = level - 1; i > 0; --i) {
+ // stop if level i is not empty
+ if (vstorage->NumLevelFiles(i) > 0) break;
+ // stop if level i is too small (cannot fit the level files)
+ if (vstorage->MaxBytesForLevel(i) < vstorage->NumLevelBytes(level)) {
+ break;
+ }
+
+ minimum_level = i;
+ }
+ return minimum_level;
+}
+
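+// With manual_wal_flush enabled, FlushWAL(false) only writes the buffered WAL
+// data to the file system, while FlushWAL(true) additionally syncs the WAL
+// via SyncWAL().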
+Status DBImpl::FlushWAL(bool sync) {
+ if (manual_wal_flush_) {
+ Status s;
+ {
+ // We need to lock log_write_mutex_ since logs_ might change concurrently
+ InstrumentedMutexLock wl(&log_write_mutex_);
+ log::Writer* cur_log_writer = logs_.back().writer;
+ s = cur_log_writer->WriteBuffer();
+ }
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s",
+ s.ToString().c_str());
+      // In case there is a fs error we should set it globally to prevent
+      // future writes
+      WriteStatusCheck(s);
+      // Whether sync or not, we should abort the rest of the function upon
+      // error
+ return s;
+ }
+ if (!sync) {
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "FlushWAL sync=false");
+ return s;
+ }
+ }
+ if (!sync) {
+ return Status::OK();
+ }
+ // sync = true
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "FlushWAL sync=true");
+ return SyncWAL();
+}
+
+Status DBImpl::SyncWAL() {
+ autovector<log::Writer*, 1> logs_to_sync;
+ bool need_log_dir_sync;
+ uint64_t current_log_number;
+
+ {
+ InstrumentedMutexLock l(&mutex_);
+ assert(!logs_.empty());
+
+ // This SyncWAL() call only cares about logs up to this number.
+ current_log_number = logfile_number_;
+
+ while (logs_.front().number <= current_log_number &&
+ logs_.front().getting_synced) {
+ log_sync_cv_.Wait();
+ }
+ // First check that logs are safe to sync in background.
+ for (auto it = logs_.begin();
+ it != logs_.end() && it->number <= current_log_number; ++it) {
+ if (!it->writer->file()->writable_file()->IsSyncThreadSafe()) {
+ return Status::NotSupported(
+ "SyncWAL() is not supported for this implementation of WAL file",
+ immutable_db_options_.allow_mmap_writes
+ ? "try setting Options::allow_mmap_writes to false"
+ : Slice());
+ }
+ }
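+    // Mark these logs as getting synced (under the mutex) so that concurrent
+    // SyncWAL() calls wait above instead of syncing the same files, and so
+    // that MarkLogsSynced() below knows which entries this call owns.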
+ for (auto it = logs_.begin();
+ it != logs_.end() && it->number <= current_log_number; ++it) {
+ auto& log = *it;
+ assert(!log.getting_synced);
+ log.getting_synced = true;
+ logs_to_sync.push_back(log.writer);
+ }
+
+ need_log_dir_sync = !log_dir_synced_;
+ }
+
+ TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:1");
+ RecordTick(stats_, WAL_FILE_SYNCED);
+ Status status;
+ for (log::Writer* log : logs_to_sync) {
+ status = log->file()->SyncWithoutFlush(immutable_db_options_.use_fsync);
+ if (!status.ok()) {
+ break;
+ }
+ }
+ if (status.ok() && need_log_dir_sync) {
+ status = directories_.GetWalDir()->Fsync();
+ }
+ TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2");
+
+ TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:1");
+ {
+ InstrumentedMutexLock l(&mutex_);
+ MarkLogsSynced(current_log_number, need_log_dir_sync, status);
+ }
+ TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2");
+
+ return status;
+}
+
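+// LockWAL() flushes any buffered WAL data and then keeps log_write_mutex_
+// held until UnlockWAL() is called, so the set of live WAL writers (logs_)
+// cannot change in between.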
+Status DBImpl::LockWAL() {
+ log_write_mutex_.Lock();
+ auto cur_log_writer = logs_.back().writer;
+ auto status = cur_log_writer->WriteBuffer();
+ if (!status.ok()) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s",
+ status.ToString().c_str());
+    // In case there is a fs error we should set it globally to prevent
+    // future writes
+ WriteStatusCheck(status);
+ }
+ return status;
+}
+
+Status DBImpl::UnlockWAL() {
+ log_write_mutex_.Unlock();
+ return Status::OK();
+}
+
+void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir,
+ const Status& status) {
+ mutex_.AssertHeld();
+ if (synced_dir && logfile_number_ == up_to && status.ok()) {
+ log_dir_synced_ = true;
+ }
+ for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) {
+ auto& log = *it;
+ assert(log.getting_synced);
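+    // Only a fully synced log other than the current (last) one is released
+    // here; the newest log stays in logs_ as the active writer.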
+ if (status.ok() && logs_.size() > 1) {
+ logs_to_free_.push_back(log.ReleaseWriter());
+ // To modify logs_ both mutex_ and log_write_mutex_ must be held
+ InstrumentedMutexLock l(&log_write_mutex_);
+ it = logs_.erase(it);
+ } else {
+ log.getting_synced = false;
+ ++it;
+ }
+ }
+ assert(!status.ok() || logs_.empty() || logs_[0].number > up_to ||
+ (logs_.size() == 1 && !logs_[0].getting_synced));
+ log_sync_cv_.SignalAll();
+}
+
+SequenceNumber DBImpl::GetLatestSequenceNumber() const {
+ return versions_->LastSequence();
+}
+
+void DBImpl::SetLastPublishedSequence(SequenceNumber seq) {
+ versions_->SetLastPublishedSequence(seq);
+}
+
+bool DBImpl::SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) {
+ if (seqnum > preserve_deletes_seqnum_.load()) {
+ preserve_deletes_seqnum_.store(seqnum);
+ return true;
+ } else {
+ return false;
+ }
+}
+
+InternalIterator* DBImpl::NewInternalIterator(
+ Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence,
+ ColumnFamilyHandle* column_family) {
+ ColumnFamilyData* cfd;
+ if (column_family == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ cfd = cfh->cfd();
+ }
+
+ mutex_.Lock();
+ SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
+ mutex_.Unlock();
+ ReadOptions roptions;
+ return NewInternalIterator(roptions, cfd, super_version, arena, range_del_agg,
+ sequence);
+}
+
+void DBImpl::SchedulePurge() {
+ mutex_.AssertHeld();
+ assert(opened_successfully_);
+
+ // Purge operations are put into High priority queue
+ bg_purge_scheduled_++;
+ env_->Schedule(&DBImpl::BGWorkPurge, this, Env::Priority::HIGH, nullptr);
+}
+
+void DBImpl::BackgroundCallPurge() {
+ mutex_.Lock();
+
+ while (!logs_to_free_queue_.empty()) {
+ assert(!logs_to_free_queue_.empty());
+ log::Writer* log_writer = *(logs_to_free_queue_.begin());
+ logs_to_free_queue_.pop_front();
+ mutex_.Unlock();
+ delete log_writer;
+ mutex_.Lock();
+ }
+ while (!superversions_to_free_queue_.empty()) {
+ assert(!superversions_to_free_queue_.empty());
+ SuperVersion* sv = superversions_to_free_queue_.front();
+ superversions_to_free_queue_.pop_front();
+ mutex_.Unlock();
+ delete sv;
+ mutex_.Lock();
+ }
+
+ // Can't use iterator to go over purge_files_ because inside the loop we're
+ // unlocking the mutex that protects purge_files_.
+ while (!purge_files_.empty()) {
+ auto it = purge_files_.begin();
+    // Need to make a copy of the PurgeFileInfo before unlocking the mutex.
+ PurgeFileInfo purge_file = it->second;
+
+ const std::string& fname = purge_file.fname;
+ const std::string& dir_to_sync = purge_file.dir_to_sync;
+ FileType type = purge_file.type;
+ uint64_t number = purge_file.number;
+ int job_id = purge_file.job_id;
+
+ purge_files_.erase(it);
+
+ mutex_.Unlock();
+ DeleteObsoleteFileImpl(job_id, fname, dir_to_sync, type, number);
+ mutex_.Lock();
+ }
+
+ bg_purge_scheduled_--;
+
+ bg_cv_.SignalAll();
+  // IMPORTANT: there should be no code after calling SignalAll. This call may
+  // signal the DB destructor that it's OK to proceed with destruction. In
+  // that case, all DB variables will be deallocated and referencing them
+  // will cause trouble.
+ mutex_.Unlock();
+}
+
+namespace {
+struct IterState {
+ IterState(DBImpl* _db, InstrumentedMutex* _mu, SuperVersion* _super_version,
+ bool _background_purge)
+ : db(_db),
+ mu(_mu),
+ super_version(_super_version),
+ background_purge(_background_purge) {}
+
+ DBImpl* db;
+ InstrumentedMutex* mu;
+ SuperVersion* super_version;
+ bool background_purge;
+};
+
+static void CleanupIteratorState(void* arg1, void* /*arg2*/) {
+ IterState* state = reinterpret_cast<IterState*>(arg1);
+
+ if (state->super_version->Unref()) {
+    // Job id == 0 means that this is not our background process, but rather
+    // a user thread
+ JobContext job_context(0);
+
+ state->mu->Lock();
+ state->super_version->Cleanup();
+ state->db->FindObsoleteFiles(&job_context, false, true);
+ if (state->background_purge) {
+ state->db->ScheduleBgLogWriterClose(&job_context);
+ state->db->AddSuperVersionsToFreeQueue(state->super_version);
+ state->db->SchedulePurge();
+ }
+ state->mu->Unlock();
+
+ if (!state->background_purge) {
+ delete state->super_version;
+ }
+ if (job_context.HaveSomethingToDelete()) {
+ if (state->background_purge) {
+        // PurgeObsoleteFiles here does not delete files. Instead, it adds the
+        // files to be deleted to a job queue, and they are deleted in a
+        // separate background thread.
+ state->db->PurgeObsoleteFiles(job_context, true /* schedule only */);
+ state->mu->Lock();
+ state->db->SchedulePurge();
+ state->mu->Unlock();
+ } else {
+ state->db->PurgeObsoleteFiles(job_context);
+ }
+ }
+ job_context.Clean();
+ }
+
+ delete state;
+}
+} // namespace
+
+InternalIterator* DBImpl::NewInternalIterator(const ReadOptions& read_options,
+ ColumnFamilyData* cfd,
+ SuperVersion* super_version,
+ Arena* arena,
+ RangeDelAggregator* range_del_agg,
+ SequenceNumber sequence) {
+ InternalIterator* internal_iter;
+ assert(arena != nullptr);
+ assert(range_del_agg != nullptr);
+ // Need to create internal iterator from the arena.
+ MergeIteratorBuilder merge_iter_builder(
+ &cfd->internal_comparator(), arena,
+ !read_options.total_order_seek &&
+ super_version->mutable_cf_options.prefix_extractor != nullptr);
+ // Collect iterator for mutable mem
+ merge_iter_builder.AddIterator(
+ super_version->mem->NewIterator(read_options, arena));
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter;
+ Status s;
+ if (!read_options.ignore_range_deletions) {
+ range_del_iter.reset(
+ super_version->mem->NewRangeTombstoneIterator(read_options, sequence));
+ range_del_agg->AddTombstones(std::move(range_del_iter));
+ }
+ // Collect all needed child iterators for immutable memtables
+ if (s.ok()) {
+ super_version->imm->AddIterators(read_options, &merge_iter_builder);
+ if (!read_options.ignore_range_deletions) {
+ s = super_version->imm->AddRangeTombstoneIterators(read_options, arena,
+ range_del_agg);
+ }
+ }
+ TEST_SYNC_POINT_CALLBACK("DBImpl::NewInternalIterator:StatusCallback", &s);
+ if (s.ok()) {
+ // Collect iterators for files in L0 - Ln
+ if (read_options.read_tier != kMemtableTier) {
+ super_version->current->AddIterators(read_options, file_options_,
+ &merge_iter_builder, range_del_agg);
+ }
+ internal_iter = merge_iter_builder.Finish();
+ IterState* cleanup =
+ new IterState(this, &mutex_, super_version,
+ read_options.background_purge_on_iterator_cleanup ||
+ immutable_db_options_.avoid_unnecessary_blocking_io);
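+    // The cleanup callback releases the SuperVersion reference taken by the
+    // caller (and schedules obsolete file purging if needed) when the
+    // iterator is destroyed; see CleanupIteratorState above.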
+ internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr);
+
+ return internal_iter;
+ } else {
+ CleanupSuperVersion(super_version);
+ }
+ return NewErrorInternalIterator<Slice>(s, arena);
+}
+
+ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const {
+ return default_cf_handle_;
+}
+
+ColumnFamilyHandle* DBImpl::PersistentStatsColumnFamily() const {
+ return persist_stats_cf_handle_;
+}
+
+Status DBImpl::Get(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) {
+ GetImplOptions get_impl_options;
+ get_impl_options.column_family = column_family;
+ get_impl_options.value = value;
+ return GetImpl(read_options, key, get_impl_options);
+}
+
+Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key,
+ GetImplOptions get_impl_options) {
+ assert(get_impl_options.value != nullptr ||
+ get_impl_options.merge_operands != nullptr);
+ PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_);
+ StopWatch sw(env_, stats_, DB_GET);
+ PERF_TIMER_GUARD(get_snapshot_time);
+
+ auto cfh =
+ reinterpret_cast<ColumnFamilyHandleImpl*>(get_impl_options.column_family);
+ auto cfd = cfh->cfd();
+
+ if (tracer_) {
+ // TODO: This mutex should be removed later, to improve performance when
+ // tracing is enabled.
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ tracer_->Get(get_impl_options.column_family, key);
+ }
+ }
+
+ // Acquire SuperVersion
+ SuperVersion* sv = GetAndRefSuperVersion(cfd);
+
+ TEST_SYNC_POINT("DBImpl::GetImpl:1");
+ TEST_SYNC_POINT("DBImpl::GetImpl:2");
+
+ SequenceNumber snapshot;
+ if (read_options.snapshot != nullptr) {
+ if (get_impl_options.callback) {
+ // Already calculated based on read_options.snapshot
+ snapshot = get_impl_options.callback->max_visible_seq();
+ } else {
+ snapshot =
+ reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)->number_;
+ }
+ } else {
+ // Note that the snapshot is assigned AFTER referencing the super
+ // version because otherwise a flush happening in between may compact away
+    // data for the snapshot, so the reader would see neither data that was
+ // visible to the snapshot before compaction nor the newer data inserted
+ // afterwards.
+ snapshot = last_seq_same_as_publish_seq_
+ ? versions_->LastSequence()
+ : versions_->LastPublishedSequence();
+ if (get_impl_options.callback) {
+ // The unprep_seqs are not published for write unprepared, so it could be
+ // that max_visible_seq is larger. Seek to the std::max of the two.
+ // However, we still want our callback to contain the actual snapshot so
+ // that it can do the correct visibility filtering.
+ get_impl_options.callback->Refresh(snapshot);
+
+ // Internally, WriteUnpreparedTxnReadCallback::Refresh would set
+ // max_visible_seq = max(max_visible_seq, snapshot)
+ //
+ // Currently, the commented out assert is broken by
+ // InvalidSnapshotReadCallback, but if write unprepared recovery followed
+ // the regular transaction flow, then this special read callback would not
+ // be needed.
+ //
+ // assert(callback->max_visible_seq() >= snapshot);
+ snapshot = get_impl_options.callback->max_visible_seq();
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::GetImpl:3");
+ TEST_SYNC_POINT("DBImpl::GetImpl:4");
+
+ // Prepare to store a list of merge operations if merge occurs.
+ MergeContext merge_context;
+ SequenceNumber max_covering_tombstone_seq = 0;
+
+ Status s;
+ // First look in the memtable, then in the immutable memtable (if any).
+ // s is both in/out. When in, s could either be OK or MergeInProgress.
+ // merge_operands will contain the sequence of merges in the latter case.
+ LookupKey lkey(key, snapshot, read_options.timestamp);
+ PERF_TIMER_STOP(get_snapshot_time);
+
+ bool skip_memtable = (read_options.read_tier == kPersistedTier &&
+ has_unpersisted_data_.load(std::memory_order_relaxed));
+ bool done = false;
+ if (!skip_memtable) {
+ // Get value associated with key
+ if (get_impl_options.get_value) {
+ if (sv->mem->Get(lkey, get_impl_options.value->GetSelf(), &s,
+ &merge_context, &max_covering_tombstone_seq,
+ read_options, get_impl_options.callback,
+ get_impl_options.is_blob_index)) {
+ done = true;
+ get_impl_options.value->PinSelf();
+ RecordTick(stats_, MEMTABLE_HIT);
+ } else if ((s.ok() || s.IsMergeInProgress()) &&
+ sv->imm->Get(lkey, get_impl_options.value->GetSelf(), &s,
+ &merge_context, &max_covering_tombstone_seq,
+ read_options, get_impl_options.callback,
+ get_impl_options.is_blob_index)) {
+ done = true;
+ get_impl_options.value->PinSelf();
+ RecordTick(stats_, MEMTABLE_HIT);
+ }
+ } else {
+      // Get the merge operands associated with the key; they should not be
+      // merged, and the raw values should be returned to the user.
+ if (sv->mem->Get(lkey, nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, read_options, nullptr,
+ nullptr, false)) {
+ done = true;
+ RecordTick(stats_, MEMTABLE_HIT);
+ } else if ((s.ok() || s.IsMergeInProgress()) &&
+ sv->imm->GetMergeOperands(lkey, &s, &merge_context,
+ &max_covering_tombstone_seq,
+ read_options)) {
+ done = true;
+ RecordTick(stats_, MEMTABLE_HIT);
+ }
+ }
+ if (!done && !s.ok() && !s.IsMergeInProgress()) {
+ ReturnAndCleanupSuperVersion(cfd, sv);
+ return s;
+ }
+ }
+ if (!done) {
+ PERF_TIMER_GUARD(get_from_output_files_time);
+ sv->current->Get(
+ read_options, lkey, get_impl_options.value, &s, &merge_context,
+ &max_covering_tombstone_seq,
+ get_impl_options.get_value ? get_impl_options.value_found : nullptr,
+ nullptr, nullptr,
+ get_impl_options.get_value ? get_impl_options.callback : nullptr,
+ get_impl_options.get_value ? get_impl_options.is_blob_index : nullptr,
+ get_impl_options.get_value);
+ RecordTick(stats_, MEMTABLE_MISS);
+ }
+
+ {
+ PERF_TIMER_GUARD(get_post_process_time);
+
+ ReturnAndCleanupSuperVersion(cfd, sv);
+
+ RecordTick(stats_, NUMBER_KEYS_READ);
+ size_t size = 0;
+ if (s.ok()) {
+ if (get_impl_options.get_value) {
+ size = get_impl_options.value->size();
+ } else {
+ // Return all merge operands for get_impl_options.key
+ *get_impl_options.number_of_operands =
+ static_cast<int>(merge_context.GetNumOperands());
+ if (*get_impl_options.number_of_operands >
+ get_impl_options.get_merge_operands_options
+ ->expected_max_number_of_operands) {
+ s = Status::Incomplete(
+ Status::SubCode::KMergeOperandsInsufficientCapacity);
+ } else {
+ for (const Slice& sl : merge_context.GetOperands()) {
+ size += sl.size();
+ get_impl_options.merge_operands->PinSelf(sl);
+ get_impl_options.merge_operands++;
+ }
+ }
+ }
+ RecordTick(stats_, BYTES_READ, size);
+ PERF_COUNTER_ADD(get_read_bytes, size);
+ }
+ RecordInHistogram(stats_, BYTES_PER_READ, size);
+ }
+ return s;
+}
+
+std::vector<Status> DBImpl::MultiGet(
+ const ReadOptions& read_options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys, std::vector<std::string>* values) {
+ PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_);
+ StopWatch sw(env_, stats_, DB_MULTIGET);
+ PERF_TIMER_GUARD(get_snapshot_time);
+
+ SequenceNumber consistent_seqnum;
+
+ std::unordered_map<uint32_t, MultiGetColumnFamilyData> multiget_cf_data(
+ column_family.size());
+ for (auto cf : column_family) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(cf);
+ auto cfd = cfh->cfd();
+ if (multiget_cf_data.find(cfd->GetID()) == multiget_cf_data.end()) {
+ multiget_cf_data.emplace(cfd->GetID(),
+ MultiGetColumnFamilyData(cfh, nullptr));
+ }
+ }
+
+ std::function<MultiGetColumnFamilyData*(
+ std::unordered_map<uint32_t, MultiGetColumnFamilyData>::iterator&)>
+ iter_deref_lambda =
+ [](std::unordered_map<uint32_t, MultiGetColumnFamilyData>::iterator&
+ cf_iter) { return &cf_iter->second; };
+
+ bool unref_only =
+ MultiCFSnapshot<std::unordered_map<uint32_t, MultiGetColumnFamilyData>>(
+ read_options, nullptr, iter_deref_lambda, &multiget_cf_data,
+ &consistent_seqnum);
+
+  // Contains a list of merge operations if a merge occurs.
+ MergeContext merge_context;
+
+ // Note: this always resizes the values array
+ size_t num_keys = keys.size();
+ std::vector<Status> stat_list(num_keys);
+ values->resize(num_keys);
+
+ // Keep track of bytes that we read for statistics-recording later
+ uint64_t bytes_read = 0;
+ PERF_TIMER_STOP(get_snapshot_time);
+
+ // For each of the given keys, apply the entire "get" process as follows:
+ // First look in the memtable, then in the immutable memtable (if any).
+ // s is both in/out. When in, s could either be OK or MergeInProgress.
+ // merge_operands will contain the sequence of merges in the latter case.
+ size_t num_found = 0;
+ for (size_t i = 0; i < num_keys; ++i) {
+ merge_context.Clear();
+ Status& s = stat_list[i];
+ std::string* value = &(*values)[i];
+
+ LookupKey lkey(keys[i], consistent_seqnum);
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family[i]);
+ SequenceNumber max_covering_tombstone_seq = 0;
+ auto mgd_iter = multiget_cf_data.find(cfh->cfd()->GetID());
+ assert(mgd_iter != multiget_cf_data.end());
+ auto mgd = mgd_iter->second;
+ auto super_version = mgd.super_version;
+ bool skip_memtable =
+ (read_options.read_tier == kPersistedTier &&
+ has_unpersisted_data_.load(std::memory_order_relaxed));
+ bool done = false;
+ if (!skip_memtable) {
+ if (super_version->mem->Get(lkey, value, &s, &merge_context,
+ &max_covering_tombstone_seq, read_options)) {
+ done = true;
+ RecordTick(stats_, MEMTABLE_HIT);
+ } else if (super_version->imm->Get(lkey, value, &s, &merge_context,
+ &max_covering_tombstone_seq,
+ read_options)) {
+ done = true;
+ RecordTick(stats_, MEMTABLE_HIT);
+ }
+ }
+ if (!done) {
+ PinnableSlice pinnable_val;
+ PERF_TIMER_GUARD(get_from_output_files_time);
+ super_version->current->Get(read_options, lkey, &pinnable_val, &s,
+ &merge_context, &max_covering_tombstone_seq);
+ value->assign(pinnable_val.data(), pinnable_val.size());
+ RecordTick(stats_, MEMTABLE_MISS);
+ }
+
+ if (s.ok()) {
+ bytes_read += value->size();
+ num_found++;
+ }
+ }
+
+ // Post processing (decrement reference counts and record statistics)
+ PERF_TIMER_GUARD(get_post_process_time);
+ autovector<SuperVersion*> superversions_to_delete;
+
+ for (auto mgd_iter : multiget_cf_data) {
+ auto mgd = mgd_iter.second;
+ if (!unref_only) {
+ ReturnAndCleanupSuperVersion(mgd.cfd, mgd.super_version);
+ } else {
+ mgd.cfd->GetSuperVersion()->Unref();
+ }
+ }
+ RecordTick(stats_, NUMBER_MULTIGET_CALLS);
+ RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys);
+ RecordTick(stats_, NUMBER_MULTIGET_KEYS_FOUND, num_found);
+ RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read);
+ RecordInHistogram(stats_, BYTES_PER_MULTIGET, bytes_read);
+ PERF_COUNTER_ADD(multiget_read_bytes, bytes_read);
+ PERF_TIMER_STOP(get_post_process_time);
+
+ return stat_list;
+}
+
+template <class T>
+bool DBImpl::MultiCFSnapshot(
+ const ReadOptions& read_options, ReadCallback* callback,
+ std::function<MultiGetColumnFamilyData*(typename T::iterator&)>&
+ iter_deref_func,
+ T* cf_list, SequenceNumber* snapshot) {
+ PERF_TIMER_GUARD(get_snapshot_time);
+
+ bool last_try = false;
+ if (cf_list->size() == 1) {
+    // Fast path for a single column family. We can simply get the thread
+    // local super version
+ auto cf_iter = cf_list->begin();
+ auto node = iter_deref_func(cf_iter);
+ node->super_version = GetAndRefSuperVersion(node->cfd);
+ if (read_options.snapshot != nullptr) {
+ // Note: In WritePrepared txns this is not necessary but not harmful
+ // either. Because prep_seq > snapshot => commit_seq > snapshot so if
+ // a snapshot is specified we should be fine with skipping seq numbers
+ // that are greater than that.
+ //
+ // In WriteUnprepared, we cannot set snapshot in the lookup key because we
+ // may skip uncommitted data that should be visible to the transaction for
+ // reading own writes.
+ *snapshot =
+ static_cast<const SnapshotImpl*>(read_options.snapshot)->number_;
+ if (callback) {
+ *snapshot = std::max(*snapshot, callback->max_visible_seq());
+ }
+ } else {
+ // Since we get and reference the super version before getting
+ // the snapshot number, without a mutex protection, it is possible
+ // that a memtable switch happened in the middle and not all the
+ // data for this snapshot is available. But it will contain all
+ // the data available in the super version we have, which is also
+ // a valid snapshot to read from.
+ // We shouldn't get snapshot before finding and referencing the super
+ // version because a flush happening in between may compact away data for
+ // the snapshot, but the snapshot is earlier than the data overwriting it,
+ // so users may see wrong results.
+ *snapshot = last_seq_same_as_publish_seq_
+ ? versions_->LastSequence()
+ : versions_->LastPublishedSequence();
+ }
+ } else {
+    // If we end up with the same issue of the memtable getting sealed during
+    // 2 consecutive retries, it means the write rate is very high. In that
+    // case it's probably ok to take the mutex on the 3rd try so we can
+    // succeed for sure
+ static const int num_retries = 3;
+ for (int i = 0; i < num_retries; ++i) {
+ last_try = (i == num_retries - 1);
+ bool retry = false;
+
+ if (i > 0) {
+ for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end();
+ ++cf_iter) {
+ auto node = iter_deref_func(cf_iter);
+ SuperVersion* super_version = node->super_version;
+ ColumnFamilyData* cfd = node->cfd;
+ if (super_version != nullptr) {
+ ReturnAndCleanupSuperVersion(cfd, super_version);
+ }
+ node->super_version = nullptr;
+ }
+ }
+ if (read_options.snapshot == nullptr) {
+ if (last_try) {
+ TEST_SYNC_POINT("DBImpl::MultiGet::LastTry");
+ // We're close to max number of retries. For the last retry,
+ // acquire the lock so we're sure to succeed
+ mutex_.Lock();
+ }
+ *snapshot = last_seq_same_as_publish_seq_
+ ? versions_->LastSequence()
+ : versions_->LastPublishedSequence();
+ } else {
+ *snapshot = reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)
+ ->number_;
+ }
+ for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end();
+ ++cf_iter) {
+ auto node = iter_deref_func(cf_iter);
+ if (!last_try) {
+ node->super_version = GetAndRefSuperVersion(node->cfd);
+ } else {
+ node->super_version = node->cfd->GetSuperVersion()->Ref();
+ }
+ TEST_SYNC_POINT("DBImpl::MultiGet::AfterRefSV");
+ if (read_options.snapshot != nullptr || last_try) {
+ // If user passed a snapshot, then we don't care if a memtable is
+ // sealed or compaction happens because the snapshot would ensure
+ // that older key versions are kept around. If this is the last
+ // retry, then we have the lock so nothing bad can happen
+ continue;
+ }
+ // We could get the earliest sequence number for the whole list of
+ // memtables, which will include immutable memtables as well, but that
+ // might be tricky to maintain in case we decide, in future, to do
+ // memtable compaction.
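+        // If the memtable in the super version we just referenced was
+        // created entirely after the snapshot we picked (its earliest
+        // sequence number is newer), a memtable switch raced with us, so
+        // release all references and retry.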
+ if (!last_try) {
+ SequenceNumber seq =
+ node->super_version->mem->GetEarliestSequenceNumber();
+ if (seq > *snapshot) {
+ retry = true;
+ break;
+ }
+ }
+ }
+ if (!retry) {
+ if (last_try) {
+ mutex_.Unlock();
+ }
+ break;
+ }
+ }
+ }
+
+ PERF_TIMER_STOP(get_snapshot_time);
+
+ return last_try;
+}
+
+void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys,
+ ColumnFamilyHandle** column_families, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ const bool sorted_input) {
+ if (num_keys == 0) {
+ return;
+ }
+ autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context;
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE> sorted_keys;
+ sorted_keys.resize(num_keys);
+ for (size_t i = 0; i < num_keys; ++i) {
+ key_context.emplace_back(column_families[i], keys[i], &values[i],
+ &statuses[i]);
+ }
+ for (size_t i = 0; i < num_keys; ++i) {
+ sorted_keys[i] = &key_context[i];
+ }
+ PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys);
+
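+  // After sorting, keys from the same column family are contiguous. Record
+  // one (start index, key count) range per column family so that each range
+  // can be served from a single super version below.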
+ autovector<MultiGetColumnFamilyData, MultiGetContext::MAX_BATCH_SIZE>
+ multiget_cf_data;
+ size_t cf_start = 0;
+ ColumnFamilyHandle* cf = sorted_keys[0]->column_family;
+ for (size_t i = 0; i < num_keys; ++i) {
+ KeyContext* key_ctx = sorted_keys[i];
+ if (key_ctx->column_family != cf) {
+ multiget_cf_data.emplace_back(
+ MultiGetColumnFamilyData(cf, cf_start, i - cf_start, nullptr));
+ cf_start = i;
+ cf = key_ctx->column_family;
+ }
+ }
+  multiget_cf_data.emplace_back(cf, cf_start, num_keys - cf_start, nullptr);
+ std::function<MultiGetColumnFamilyData*(
+ autovector<MultiGetColumnFamilyData,
+ MultiGetContext::MAX_BATCH_SIZE>::iterator&)>
+ iter_deref_lambda =
+ [](autovector<MultiGetColumnFamilyData,
+ MultiGetContext::MAX_BATCH_SIZE>::iterator& cf_iter) {
+ return &(*cf_iter);
+ };
+
+ SequenceNumber consistent_seqnum;
+ bool unref_only = MultiCFSnapshot<
+ autovector<MultiGetColumnFamilyData, MultiGetContext::MAX_BATCH_SIZE>>(
+ read_options, nullptr, iter_deref_lambda, &multiget_cf_data,
+ &consistent_seqnum);
+
+ for (auto cf_iter = multiget_cf_data.begin();
+ cf_iter != multiget_cf_data.end(); ++cf_iter) {
+ MultiGetImpl(read_options, cf_iter->start, cf_iter->num_keys, &sorted_keys,
+ cf_iter->super_version, consistent_seqnum, nullptr, nullptr);
+ if (!unref_only) {
+ ReturnAndCleanupSuperVersion(cf_iter->cfd, cf_iter->super_version);
+ } else {
+ cf_iter->cfd->GetSuperVersion()->Unref();
+ }
+ }
+}
+
+namespace {
+// Order keys by CF ID, followed by key contents
+struct CompareKeyContext {
+ inline bool operator()(const KeyContext* lhs, const KeyContext* rhs) {
+ ColumnFamilyHandleImpl* cfh =
+ static_cast<ColumnFamilyHandleImpl*>(lhs->column_family);
+ uint32_t cfd_id1 = cfh->cfd()->GetID();
+ const Comparator* comparator = cfh->cfd()->user_comparator();
+    cfh = static_cast<ColumnFamilyHandleImpl*>(rhs->column_family);
+    uint32_t cfd_id2 = cfh->cfd()->GetID();
+
+ if (cfd_id1 < cfd_id2) {
+ return true;
+ } else if (cfd_id1 > cfd_id2) {
+ return false;
+ }
+
+ // Both keys are from the same column family
+ int cmp = comparator->Compare(*(lhs->key), *(rhs->key));
+ if (cmp < 0) {
+ return true;
+ }
+ return false;
+ }
+};
+
+} // anonymous namespace
+
+void DBImpl::PrepareMultiGetKeys(
+ size_t num_keys, bool sorted_input,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys) {
+#ifndef NDEBUG
+ if (sorted_input) {
+ for (size_t index = 0; index < sorted_keys->size(); ++index) {
+ if (index > 0) {
+ KeyContext* lhs = (*sorted_keys)[index - 1];
+ KeyContext* rhs = (*sorted_keys)[index];
+ ColumnFamilyHandleImpl* cfh =
+ reinterpret_cast<ColumnFamilyHandleImpl*>(lhs->column_family);
+ uint32_t cfd_id1 = cfh->cfd()->GetID();
+ const Comparator* comparator = cfh->cfd()->user_comparator();
+        cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(rhs->column_family);
+        uint32_t cfd_id2 = cfh->cfd()->GetID();
+
+ assert(cfd_id1 <= cfd_id2);
+ if (cfd_id1 < cfd_id2) {
+ continue;
+ }
+
+ // Both keys are from the same column family
+ int cmp = comparator->Compare(*(lhs->key), *(rhs->key));
+ assert(cmp <= 0);
+ }
+ }
+ }
+#endif
+ if (!sorted_input) {
+ CompareKeyContext sort_comparator;
+ std::sort(sorted_keys->begin(), sorted_keys->begin() + num_keys,
+ sort_comparator);
+ }
+}
+
+void DBImpl::MultiGet(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const size_t num_keys,
+ const Slice* keys, PinnableSlice* values,
+ Status* statuses, const bool sorted_input) {
+ autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context;
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE> sorted_keys;
+ sorted_keys.resize(num_keys);
+ for (size_t i = 0; i < num_keys; ++i) {
+ key_context.emplace_back(column_family, keys[i], &values[i], &statuses[i]);
+ }
+ for (size_t i = 0; i < num_keys; ++i) {
+ sorted_keys[i] = &key_context[i];
+ }
+ PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys);
+ MultiGetWithCallback(read_options, column_family, nullptr, &sorted_keys);
+}
+
+void DBImpl::MultiGetWithCallback(
+ const ReadOptions& read_options, ColumnFamilyHandle* column_family,
+ ReadCallback* callback,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys) {
+ std::array<MultiGetColumnFamilyData, 1> multiget_cf_data;
+ multiget_cf_data[0] = MultiGetColumnFamilyData(column_family, nullptr);
+ std::function<MultiGetColumnFamilyData*(
+ std::array<MultiGetColumnFamilyData, 1>::iterator&)>
+ iter_deref_lambda =
+ [](std::array<MultiGetColumnFamilyData, 1>::iterator& cf_iter) {
+ return &(*cf_iter);
+ };
+
+ size_t num_keys = sorted_keys->size();
+ SequenceNumber consistent_seqnum;
+ bool unref_only = MultiCFSnapshot<std::array<MultiGetColumnFamilyData, 1>>(
+ read_options, callback, iter_deref_lambda, &multiget_cf_data,
+ &consistent_seqnum);
+#ifndef NDEBUG
+ assert(!unref_only);
+#else
+ // Silence unused variable warning
+ (void)unref_only;
+#endif // NDEBUG
+
+ if (callback && read_options.snapshot == nullptr) {
+ // The unprep_seqs are not published for write unprepared, so it could be
+ // that max_visible_seq is larger. Seek to the std::max of the two.
+ // However, we still want our callback to contain the actual snapshot so
+ // that it can do the correct visibility filtering.
+ callback->Refresh(consistent_seqnum);
+
+ // Internally, WriteUnpreparedTxnReadCallback::Refresh would set
+ // max_visible_seq = max(max_visible_seq, snapshot)
+ //
+ // Currently, the commented out assert is broken by
+ // InvalidSnapshotReadCallback, but if write unprepared recovery followed
+ // the regular transaction flow, then this special read callback would not
+ // be needed.
+ //
+ // assert(callback->max_visible_seq() >= snapshot);
+ consistent_seqnum = callback->max_visible_seq();
+ }
+
+ MultiGetImpl(read_options, 0, num_keys, sorted_keys,
+ multiget_cf_data[0].super_version, consistent_seqnum, nullptr,
+ nullptr);
+ ReturnAndCleanupSuperVersion(multiget_cf_data[0].cfd,
+ multiget_cf_data[0].super_version);
+}
+
+void DBImpl::MultiGetImpl(
+ const ReadOptions& read_options, size_t start_key, size_t num_keys,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys,
+ SuperVersion* super_version, SequenceNumber snapshot,
+ ReadCallback* callback, bool* is_blob_index) {
+ PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_);
+ StopWatch sw(env_, stats_, DB_MULTIGET);
+
+ // For each of the given keys, apply the entire "get" process as follows:
+ // First look in the memtable, then in the immutable memtable (if any).
+ // s is both in/out. When in, s could either be OK or MergeInProgress.
+ // merge_operands will contain the sequence of merges in the latter case.
+ size_t keys_left = num_keys;
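+  // Process the keys in batches of at most MultiGetContext::MAX_BATCH_SIZE,
+  // looking each batch up in the mutable memtable, the immutable memtables
+  // and then the SST files, in that order.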
+ while (keys_left) {
+ size_t batch_size = (keys_left > MultiGetContext::MAX_BATCH_SIZE)
+ ? MultiGetContext::MAX_BATCH_SIZE
+ : keys_left;
+ MultiGetContext ctx(sorted_keys, start_key + num_keys - keys_left,
+ batch_size, snapshot);
+ MultiGetRange range = ctx.GetMultiGetRange();
+ bool lookup_current = false;
+
+ keys_left -= batch_size;
+ for (auto mget_iter = range.begin(); mget_iter != range.end();
+ ++mget_iter) {
+ mget_iter->merge_context.Clear();
+ *mget_iter->s = Status::OK();
+ }
+
+ bool skip_memtable =
+ (read_options.read_tier == kPersistedTier &&
+ has_unpersisted_data_.load(std::memory_order_relaxed));
+ if (!skip_memtable) {
+ super_version->mem->MultiGet(read_options, &range, callback,
+ is_blob_index);
+ if (!range.empty()) {
+ super_version->imm->MultiGet(read_options, &range, callback,
+ is_blob_index);
+ }
+ if (!range.empty()) {
+ lookup_current = true;
+ uint64_t left = range.KeysLeft();
+ RecordTick(stats_, MEMTABLE_MISS, left);
+ }
+ }
+ if (lookup_current) {
+ PERF_TIMER_GUARD(get_from_output_files_time);
+ super_version->current->MultiGet(read_options, &range, callback,
+ is_blob_index);
+ }
+ }
+
+ // Post processing (decrement reference counts and record statistics)
+ PERF_TIMER_GUARD(get_post_process_time);
+ size_t num_found = 0;
+ uint64_t bytes_read = 0;
+ for (size_t i = start_key; i < start_key + num_keys; ++i) {
+ KeyContext* key = (*sorted_keys)[i];
+ if (key->s->ok()) {
+ bytes_read += key->value->size();
+ num_found++;
+ }
+ }
+
+ RecordTick(stats_, NUMBER_MULTIGET_CALLS);
+ RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys);
+ RecordTick(stats_, NUMBER_MULTIGET_KEYS_FOUND, num_found);
+ RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read);
+ RecordInHistogram(stats_, BYTES_PER_MULTIGET, bytes_read);
+ PERF_COUNTER_ADD(multiget_read_bytes, bytes_read);
+ PERF_TIMER_STOP(get_post_process_time);
+}
+
+Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options,
+ const std::string& column_family,
+ ColumnFamilyHandle** handle) {
+ assert(handle != nullptr);
+ Status s = CreateColumnFamilyImpl(cf_options, column_family, handle);
+ if (s.ok()) {
+ s = WriteOptionsFile(true /*need_mutex_lock*/,
+ true /*need_enter_write_thread*/);
+ }
+ return s;
+}
+
+Status DBImpl::CreateColumnFamilies(
+ const ColumnFamilyOptions& cf_options,
+ const std::vector<std::string>& column_family_names,
+ std::vector<ColumnFamilyHandle*>* handles) {
+ assert(handles != nullptr);
+ handles->clear();
+ size_t num_cf = column_family_names.size();
+ Status s;
+ bool success_once = false;
+ for (size_t i = 0; i < num_cf; i++) {
+ ColumnFamilyHandle* handle;
+ s = CreateColumnFamilyImpl(cf_options, column_family_names[i], &handle);
+ if (!s.ok()) {
+ break;
+ }
+ handles->push_back(handle);
+ success_once = true;
+ }
+ if (success_once) {
+ Status persist_options_status = WriteOptionsFile(
+ true /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+ if (s.ok() && !persist_options_status.ok()) {
+ s = persist_options_status;
+ }
+ }
+ return s;
+}
+
+Status DBImpl::CreateColumnFamilies(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles) {
+ assert(handles != nullptr);
+ handles->clear();
+ size_t num_cf = column_families.size();
+ Status s;
+ bool success_once = false;
+ for (size_t i = 0; i < num_cf; i++) {
+ ColumnFamilyHandle* handle;
+ s = CreateColumnFamilyImpl(column_families[i].options,
+ column_families[i].name, &handle);
+ if (!s.ok()) {
+ break;
+ }
+ handles->push_back(handle);
+ success_once = true;
+ }
+ if (success_once) {
+ Status persist_options_status = WriteOptionsFile(
+ true /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+ if (s.ok() && !persist_options_status.ok()) {
+ s = persist_options_status;
+ }
+ }
+ return s;
+}
+
+Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options,
+ const std::string& column_family_name,
+ ColumnFamilyHandle** handle) {
+ Status s;
+ Status persist_options_status;
+ *handle = nullptr;
+
+ DBOptions db_options =
+ BuildDBOptions(immutable_db_options_, mutable_db_options_);
+ s = ColumnFamilyData::ValidateOptions(db_options, cf_options);
+ if (s.ok()) {
+ for (auto& cf_path : cf_options.cf_paths) {
+ s = env_->CreateDirIfMissing(cf_path.path);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ if (!s.ok()) {
+ return s;
+ }
+
+ SuperVersionContext sv_context(/* create_superversion */ true);
+ {
+ InstrumentedMutexLock l(&mutex_);
+
+ if (versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name) !=
+ nullptr) {
+ return Status::InvalidArgument("Column family already exists");
+ }
+ VersionEdit edit;
+ edit.AddColumnFamily(column_family_name);
+ uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID();
+ edit.SetColumnFamily(new_id);
+ edit.SetLogNumber(logfile_number_);
+ edit.SetComparatorName(cf_options.comparator->Name());
+
+ // LogAndApply will both write the creation in MANIFEST and create
+ // ColumnFamilyData object
+ { // write thread
+ WriteThread::Writer w;
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ s = versions_->LogAndApply(nullptr, MutableCFOptions(cf_options), &edit,
+ &mutex_, directories_.GetDbDir(), false,
+ &cf_options);
+ write_thread_.ExitUnbatched(&w);
+ }
+ if (s.ok()) {
+ auto* cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
+ assert(cfd != nullptr);
+ std::map<std::string, std::shared_ptr<Directory>> dummy_created_dirs;
+ s = cfd->AddDirectories(&dummy_created_dirs);
+ }
+ if (s.ok()) {
+ single_column_family_mode_ = false;
+ auto* cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
+ assert(cfd != nullptr);
+ InstallSuperVersionAndScheduleWork(cfd, &sv_context,
+ *cfd->GetLatestMutableCFOptions());
+
+ if (!cfd->mem()->IsSnapshotSupported()) {
+ is_snapshot_supported_ = false;
+ }
+
+ cfd->set_initialized();
+
+ *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Created column family [%s] (ID %u)",
+ column_family_name.c_str(), (unsigned)cfd->GetID());
+ } else {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Creating column family [%s] FAILED -- %s",
+ column_family_name.c_str(), s.ToString().c_str());
+ }
+ } // InstrumentedMutexLock l(&mutex_)
+
+ sv_context.Clean();
+ // this is outside the mutex
+ if (s.ok()) {
+ NewThreadStatusCfInfo(
+ reinterpret_cast<ColumnFamilyHandleImpl*>(*handle)->cfd());
+ }
+ return s;
+}
+
+Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) {
+ assert(column_family != nullptr);
+ Status s = DropColumnFamilyImpl(column_family);
+ if (s.ok()) {
+ s = WriteOptionsFile(true /*need_mutex_lock*/,
+ true /*need_enter_write_thread*/);
+ }
+ return s;
+}
+
+Status DBImpl::DropColumnFamilies(
+ const std::vector<ColumnFamilyHandle*>& column_families) {
+ Status s;
+ bool success_once = false;
+ for (auto* handle : column_families) {
+ s = DropColumnFamilyImpl(handle);
+ if (!s.ok()) {
+ break;
+ }
+ success_once = true;
+ }
+ if (success_once) {
+ Status persist_options_status = WriteOptionsFile(
+ true /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+ if (s.ok() && !persist_options_status.ok()) {
+ s = persist_options_status;
+ }
+ }
+ return s;
+}
+
+Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+ if (cfd->GetID() == 0) {
+ return Status::InvalidArgument("Can't drop default column family");
+ }
+
+ bool cf_support_snapshot = cfd->mem()->IsSnapshotSupported();
+
+ VersionEdit edit;
+ edit.DropColumnFamily();
+ edit.SetColumnFamily(cfd->GetID());
+
+ Status s;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ if (cfd->IsDropped()) {
+ s = Status::InvalidArgument("Column family already dropped!\n");
+ }
+ if (s.ok()) {
+ // we drop column family from a single write thread
+ WriteThread::Writer w;
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit,
+ &mutex_);
+ write_thread_.ExitUnbatched(&w);
+ }
+ if (s.ok()) {
+ auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+ max_total_in_memory_state_ -= mutable_cf_options->write_buffer_size *
+ mutable_cf_options->max_write_buffer_number;
+ }
+
+ if (!cf_support_snapshot) {
+    // The dropped column family didn't support snapshots, so recalculate
+    // is_snapshot_supported_.
+ bool new_is_snapshot_supported = true;
+ for (auto c : *versions_->GetColumnFamilySet()) {
+ if (!c->IsDropped() && !c->mem()->IsSnapshotSupported()) {
+ new_is_snapshot_supported = false;
+ break;
+ }
+ }
+ is_snapshot_supported_ = new_is_snapshot_supported;
+ }
+ bg_cv_.SignalAll();
+ }
+
+ if (s.ok()) {
+ // Note that here we erase the associated cf_info of the to-be-dropped
+ // cfd before its ref-count goes to zero to avoid having to erase cf_info
+ // later inside db_mutex.
+ EraseThreadStatusCfInfo(cfd);
+ assert(cfd->IsDropped());
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Dropped column family with id %u\n", cfd->GetID());
+ } else {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Dropping column family with id %u FAILED -- %s\n",
+ cfd->GetID(), s.ToString().c_str());
+ }
+
+ return s;
+}
+
+bool DBImpl::KeyMayExist(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ std::string* value, bool* value_found) {
+ assert(value != nullptr);
+ if (value_found != nullptr) {
+ // falsify later if key-may-exist but can't fetch value
+ *value_found = true;
+ }
+ ReadOptions roptions = read_options;
+ roptions.read_tier = kBlockCacheTier; // read from block cache only
+ PinnableSlice pinnable_val;
+ GetImplOptions get_impl_options;
+ get_impl_options.column_family = column_family;
+ get_impl_options.value = &pinnable_val;
+ get_impl_options.value_found = value_found;
+ auto s = GetImpl(roptions, key, get_impl_options);
+ value->assign(pinnable_val.data(), pinnable_val.size());
+
+  // If block_cache is enabled and the index block of the table is not
+  // present in block_cache, the return value will be Status::Incomplete.
+  // In this case, the key may still exist in the table.
+ return s.ok() || s.IsIncomplete();
+}
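+
+// Illustrative sketch, not part of the upstream source: KeyMayExist() may
+// return true for keys that are absent (a false positive), but it never
+// returns false for keys that exist. "db" and "key" are placeholders.
+//
+//   std::string value;
+//   bool value_found = false;
+//   if (db->KeyMayExist(rocksdb::ReadOptions(), db->DefaultColumnFamily(),
+//                       "key", &value, &value_found)) {
+//     // The key may exist; "value" is only meaningful if value_found is true.
+//   }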
+
+Iterator* DBImpl::NewIterator(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family) {
+ if (read_options.managed) {
+ return NewErrorIterator(
+ Status::NotSupported("Managed iterator is not supported anymore."));
+ }
+ Iterator* result = nullptr;
+ if (read_options.read_tier == kPersistedTier) {
+ return NewErrorIterator(Status::NotSupported(
+ "ReadTier::kPersistedData is not yet supported in iterators."));
+ }
+ // if iterator wants internal keys, we can only proceed if
+ // we can guarantee the deletes haven't been processed yet
+ if (immutable_db_options_.preserve_deletes &&
+ read_options.iter_start_seqnum > 0 &&
+ read_options.iter_start_seqnum < preserve_deletes_seqnum_.load()) {
+ return NewErrorIterator(Status::InvalidArgument(
+ "Iterator requested internal keys which are too old and are not"
+ " guaranteed to be preserved, try larger iter_start_seqnum opt."));
+ }
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ if (read_options.tailing) {
+#ifdef ROCKSDB_LITE
+ // not supported in lite version
+ result = nullptr;
+
+#else
+ SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+ auto iter = new ForwardIterator(this, read_options, cfd, sv);
+ result = NewDBIterator(
+ env_, read_options, *cfd->ioptions(), sv->mutable_cf_options,
+ cfd->user_comparator(), iter, kMaxSequenceNumber,
+ sv->mutable_cf_options.max_sequential_skip_in_iterations, read_callback,
+ this, cfd);
+#endif
+ } else {
+ // Note: no need to consider the special case of
+ // last_seq_same_as_publish_seq_==false since NewIterator is overridden in
+ // WritePreparedTxnDB
+ auto snapshot = read_options.snapshot != nullptr
+ ? read_options.snapshot->GetSequenceNumber()
+ : versions_->LastSequence();
+ result = NewIteratorImpl(read_options, cfd, snapshot, read_callback);
+ }
+ return result;
+}
+
+ArenaWrappedDBIter* DBImpl::NewIteratorImpl(const ReadOptions& read_options,
+ ColumnFamilyData* cfd,
+ SequenceNumber snapshot,
+ ReadCallback* read_callback,
+ bool allow_blob,
+ bool allow_refresh) {
+ SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+
+ // Try to generate a DB iterator tree in continuous memory area to be
+ // cache friendly. Here is an example of result:
+ // +-------------------------------+
+ // | |
+ // | ArenaWrappedDBIter |
+ // | + |
+ // | +---> Inner Iterator ------------+
+ // | | | |
+ // | | +-- -- -- -- -- -- -- --+ |
+ // | +--- | Arena | |
+ // | | | |
+ // | Allocated Memory: | |
+ // | | +-------------------+ |
+ // | | | DBIter | <---+
+ // | | + |
+ // | | | +-> iter_ ------------+
+ // | | | | |
+ // | | +-------------------+ |
+ // | | | MergingIterator | <---+
+ // | | + |
+ // | | | +->child iter1 ------------+
+ // | | | | | |
+ // | | +->child iter2 ----------+ |
+ // | | | | | | |
+ // | | | +->child iter3 --------+ | |
+ // | | | | | |
+ // | | +-------------------+ | | |
+ // | | | Iterator1 | <--------+
+ // | | +-------------------+ | |
+ // | | | Iterator2 | <------+
+ // | | +-------------------+ |
+ // | | | Iterator3 | <----+
+ // | | +-------------------+
+ // | | |
+ // +-------+-----------------------+
+ //
+ // ArenaWrappedDBIter inlines an arena area where all the iterators in
+ // the iterator tree are allocated in the order of being accessed when
+ // querying.
+ // Laying out the iterators in the order of being accessed makes it more
+ // likely that any iterator pointer is close to the iterator it points to so
+ // that they are likely to be in the same cache line and/or page.
+ ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator(
+ env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, snapshot,
+ sv->mutable_cf_options.max_sequential_skip_in_iterations,
+ sv->version_number, read_callback, this, cfd, allow_blob,
+ read_options.snapshot != nullptr ? false : allow_refresh);
+
+ InternalIterator* internal_iter =
+ NewInternalIterator(read_options, cfd, sv, db_iter->GetArena(),
+ db_iter->GetRangeDelAggregator(), snapshot);
+ db_iter->SetIterUnderDBIter(internal_iter);
+
+ return db_iter;
+}
+
+Status DBImpl::NewIterators(
+ const ReadOptions& read_options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) {
+ if (read_options.managed) {
+ return Status::NotSupported("Managed iterator is not supported anymore.");
+ }
+ if (read_options.read_tier == kPersistedTier) {
+ return Status::NotSupported(
+ "ReadTier::kPersistedData is not yet supported in iterators.");
+ }
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ iterators->clear();
+ iterators->reserve(column_families.size());
+ if (read_options.tailing) {
+#ifdef ROCKSDB_LITE
+ return Status::InvalidArgument(
+ "Tailing iterator not supported in RocksDB lite");
+#else
+ for (auto cfh : column_families) {
+ auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
+ SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+ auto iter = new ForwardIterator(this, read_options, cfd, sv);
+ iterators->push_back(NewDBIterator(
+ env_, read_options, *cfd->ioptions(), sv->mutable_cf_options,
+ cfd->user_comparator(), iter, kMaxSequenceNumber,
+ sv->mutable_cf_options.max_sequential_skip_in_iterations,
+ read_callback, this, cfd));
+ }
+#endif
+ } else {
+ // Note: no need to consider the special case of
+ // last_seq_same_as_publish_seq_==false since NewIterators is overridden in
+ // WritePreparedTxnDB
+ auto snapshot = read_options.snapshot != nullptr
+ ? read_options.snapshot->GetSequenceNumber()
+ : versions_->LastSequence();
+ for (size_t i = 0; i < column_families.size(); ++i) {
+ auto* cfd =
+ reinterpret_cast<ColumnFamilyHandleImpl*>(column_families[i])->cfd();
+ iterators->push_back(
+ NewIteratorImpl(read_options, cfd, snapshot, read_callback));
+ }
+ }
+
+ return Status::OK();
+}
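+
+// Illustrative sketch, not part of the upstream source: basic usage of an
+// iterator returned by NewIterator(). "db" is a placeholder for an open DB.
+//
+//   std::unique_ptr<rocksdb::Iterator> it(
+//       db->NewIterator(rocksdb::ReadOptions()));
+//   for (it->SeekToFirst(); it->Valid(); it->Next()) {
+//     // it->key() and it->value() are valid until the iterator is advanced.
+//   }
+//   assert(it->status().ok());  // check for errors after the scan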
+
+const Snapshot* DBImpl::GetSnapshot() { return GetSnapshotImpl(false); }
+
+#ifndef ROCKSDB_LITE
+const Snapshot* DBImpl::GetSnapshotForWriteConflictBoundary() {
+ return GetSnapshotImpl(true);
+}
+#endif // ROCKSDB_LITE
+
+SnapshotImpl* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary,
+ bool lock) {
+ int64_t unix_time = 0;
+ env_->GetCurrentTime(&unix_time); // Ignore error
+ SnapshotImpl* s = new SnapshotImpl;
+
+ if (lock) {
+ mutex_.Lock();
+ }
+  // Return nullptr if the underlying memtable does not support snapshots.
+ if (!is_snapshot_supported_) {
+ if (lock) {
+ mutex_.Unlock();
+ }
+ delete s;
+ return nullptr;
+ }
+ auto snapshot_seq = last_seq_same_as_publish_seq_
+ ? versions_->LastSequence()
+ : versions_->LastPublishedSequence();
+ SnapshotImpl* snapshot =
+ snapshots_.New(s, snapshot_seq, unix_time, is_write_conflict_boundary);
+ if (lock) {
+ mutex_.Unlock();
+ }
+ return snapshot;
+}
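+
+// Illustrative sketch, not part of the upstream source: pinning a consistent
+// read view with a snapshot and releasing it afterwards. "db" and "key" are
+// placeholders.
+//
+//   const rocksdb::Snapshot* snap = db->GetSnapshot();  // may return nullptr
+//   if (snap != nullptr) {
+//     rocksdb::ReadOptions ro;
+//     ro.snapshot = snap;
+//     std::string value;
+//     rocksdb::Status s = db->Get(ro, "key", &value);
+//     db->ReleaseSnapshot(snap);
+//   }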
+
+namespace {
+typedef autovector<ColumnFamilyData*, 2> CfdList;
+bool CfdListContains(const CfdList& list, ColumnFamilyData* cfd) {
+ for (const ColumnFamilyData* t : list) {
+ if (t == cfd) {
+ return true;
+ }
+ }
+ return false;
+}
+} // namespace
+
+void DBImpl::ReleaseSnapshot(const Snapshot* s) {
+ const SnapshotImpl* casted_s = reinterpret_cast<const SnapshotImpl*>(s);
+ {
+ InstrumentedMutexLock l(&mutex_);
+ snapshots_.Delete(casted_s);
+ uint64_t oldest_snapshot;
+ if (snapshots_.empty()) {
+ oldest_snapshot = last_seq_same_as_publish_seq_
+ ? versions_->LastSequence()
+ : versions_->LastPublishedSequence();
+ } else {
+ oldest_snapshot = snapshots_.oldest()->number_;
+ }
+    // Avoid going through every column family by checking a global threshold
+ // first.
+ if (oldest_snapshot > bottommost_files_mark_threshold_) {
+ CfdList cf_scheduled;
+ for (auto* cfd : *versions_->GetColumnFamilySet()) {
+ cfd->current()->storage_info()->UpdateOldestSnapshot(oldest_snapshot);
+ if (!cfd->current()
+ ->storage_info()
+ ->BottommostFilesMarkedForCompaction()
+ .empty()) {
+ SchedulePendingCompaction(cfd);
+ MaybeScheduleFlushOrCompaction();
+ cf_scheduled.push_back(cfd);
+ }
+ }
+
+      // Calculate a new threshold, skipping those CFs where compactions are
+      // scheduled. We do not fold this into the previous loop because the
+      // mutex might be unlocked during that loop, making the result
+      // inaccurate.
+ SequenceNumber new_bottommost_files_mark_threshold = kMaxSequenceNumber;
+ for (auto* cfd : *versions_->GetColumnFamilySet()) {
+ if (CfdListContains(cf_scheduled, cfd)) {
+ continue;
+ }
+ new_bottommost_files_mark_threshold = std::min(
+ new_bottommost_files_mark_threshold,
+ cfd->current()->storage_info()->bottommost_files_mark_threshold());
+ }
+ bottommost_files_mark_threshold_ = new_bottommost_files_mark_threshold;
+ }
+ }
+ delete casted_s;
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
+ TablePropertiesCollection* props) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+
+ // Increment the ref count
+ mutex_.Lock();
+ auto version = cfd->current();
+ version->Ref();
+ mutex_.Unlock();
+
+ auto s = version->GetPropertiesOfAllTables(props);
+
+ // Decrement the ref count
+ mutex_.Lock();
+ version->Unref();
+ mutex_.Unlock();
+
+ return s;
+}
+
+Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family,
+ const Range* range, std::size_t n,
+ TablePropertiesCollection* props) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+
+ // Increment the ref count
+ mutex_.Lock();
+ auto version = cfd->current();
+ version->Ref();
+ mutex_.Unlock();
+
+ auto s = version->GetPropertiesOfTablesInRange(range, n, props);
+
+ // Decrement the ref count
+ mutex_.Lock();
+ version->Unref();
+ mutex_.Unlock();
+
+ return s;
+}
+
+#endif // ROCKSDB_LITE
+
+const std::string& DBImpl::GetName() const { return dbname_; }
+
+Env* DBImpl::GetEnv() const { return env_; }
+
+FileSystem* DB::GetFileSystem() const {
+ static LegacyFileSystemWrapper fs_wrap(GetEnv());
+ return &fs_wrap;
+}
+
+FileSystem* DBImpl::GetFileSystem() const {
+ return immutable_db_options_.fs.get();
+}
+
+Options DBImpl::GetOptions(ColumnFamilyHandle* column_family) const {
+ InstrumentedMutexLock l(&mutex_);
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ return Options(BuildDBOptions(immutable_db_options_, mutable_db_options_),
+ cfh->cfd()->GetLatestCFOptions());
+}
+
+DBOptions DBImpl::GetDBOptions() const {
+ InstrumentedMutexLock l(&mutex_);
+ return BuildDBOptions(immutable_db_options_, mutable_db_options_);
+}
+
+bool DBImpl::GetProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, std::string* value) {
+ const DBPropertyInfo* property_info = GetPropertyInfo(property);
+ value->clear();
+ auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+ if (property_info == nullptr) {
+ return false;
+ } else if (property_info->handle_int) {
+ uint64_t int_value;
+ bool ret_value =
+ GetIntPropertyInternal(cfd, *property_info, false, &int_value);
+ if (ret_value) {
+ *value = ToString(int_value);
+ }
+ return ret_value;
+ } else if (property_info->handle_string) {
+ InstrumentedMutexLock l(&mutex_);
+ return cfd->internal_stats()->GetStringProperty(*property_info, property,
+ value);
+ } else if (property_info->handle_string_dbimpl) {
+ std::string tmp_value;
+ bool ret_value = (this->*(property_info->handle_string_dbimpl))(&tmp_value);
+ if (ret_value) {
+ *value = tmp_value;
+ }
+ return ret_value;
+ }
+ // Shouldn't reach here since exactly one of handle_string and handle_int
+ // should be non-nullptr.
+ assert(false);
+ return false;
+}
+
+bool DBImpl::GetMapProperty(ColumnFamilyHandle* column_family,
+ const Slice& property,
+ std::map<std::string, std::string>* value) {
+ const DBPropertyInfo* property_info = GetPropertyInfo(property);
+ value->clear();
+ auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+ if (property_info == nullptr) {
+ return false;
+ } else if (property_info->handle_map) {
+ InstrumentedMutexLock l(&mutex_);
+ return cfd->internal_stats()->GetMapProperty(*property_info, property,
+ value);
+ }
+ // If we reach this point it means that handle_map is not provided for the
+ // requested property
+ return false;
+}
+
+bool DBImpl::GetIntProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, uint64_t* value) {
+ const DBPropertyInfo* property_info = GetPropertyInfo(property);
+ if (property_info == nullptr || property_info->handle_int == nullptr) {
+ return false;
+ }
+ auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+ return GetIntPropertyInternal(cfd, *property_info, false, value);
+}
+
+bool DBImpl::GetIntPropertyInternal(ColumnFamilyData* cfd,
+ const DBPropertyInfo& property_info,
+ bool is_locked, uint64_t* value) {
+ assert(property_info.handle_int != nullptr);
+ if (!property_info.need_out_of_mutex) {
+ if (is_locked) {
+ mutex_.AssertHeld();
+ return cfd->internal_stats()->GetIntProperty(property_info, value, this);
+ } else {
+ InstrumentedMutexLock l(&mutex_);
+ return cfd->internal_stats()->GetIntProperty(property_info, value, this);
+ }
+ } else {
+ SuperVersion* sv = nullptr;
+ if (!is_locked) {
+ sv = GetAndRefSuperVersion(cfd);
+ } else {
+ sv = cfd->GetSuperVersion();
+ }
+
+ bool ret = cfd->internal_stats()->GetIntPropertyOutOfMutex(
+ property_info, sv->current, value);
+
+ if (!is_locked) {
+ ReturnAndCleanupSuperVersion(cfd, sv);
+ }
+
+ return ret;
+ }
+}
+
+bool DBImpl::GetPropertyHandleOptionsStatistics(std::string* value) {
+ assert(value != nullptr);
+ Statistics* statistics = immutable_db_options_.statistics.get();
+ if (!statistics) {
+ return false;
+ }
+ *value = statistics->ToString();
+ return true;
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::ResetStats() {
+ InstrumentedMutexLock l(&mutex_);
+ for (auto* cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->initialized()) {
+ cfd->internal_stats()->Clear();
+ }
+ }
+ return Status::OK();
+}
+#endif // ROCKSDB_LITE
+
+bool DBImpl::GetAggregatedIntProperty(const Slice& property,
+ uint64_t* aggregated_value) {
+ const DBPropertyInfo* property_info = GetPropertyInfo(property);
+ if (property_info == nullptr || property_info->handle_int == nullptr) {
+ return false;
+ }
+
+ uint64_t sum = 0;
+ {
+ // Needs mutex to protect the list of column families.
+ InstrumentedMutexLock l(&mutex_);
+ uint64_t value;
+ for (auto* cfd : *versions_->GetColumnFamilySet()) {
+ if (!cfd->initialized()) {
+ continue;
+ }
+ if (GetIntPropertyInternal(cfd, *property_info, true, &value)) {
+ sum += value;
+ } else {
+ return false;
+ }
+ }
+ }
+ *aggregated_value = sum;
+ return true;
+}
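+
+// Illustrative sketch, not part of the upstream source: querying properties
+// through the public API. "rocksdb.estimate-num-keys" and "rocksdb.stats"
+// are standard property names; "db" is a placeholder.
+//
+//   uint64_t num_keys = 0;
+//   if (db->GetAggregatedIntProperty("rocksdb.estimate-num-keys", &num_keys)) {
+//     // num_keys sums the estimate over all column families.
+//   }
+//   std::string stats;
+//   if (db->GetProperty("rocksdb.stats", &stats)) {
+//     // stats holds a human-readable statistics dump.
+//   }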
+
+SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd) {
+ // TODO(ljin): consider using GetReferencedSuperVersion() directly
+ return cfd->GetThreadLocalSuperVersion(this);
+}
+
+// REQUIRED: this function should only be called on the write thread or if the
+// mutex is held.
+SuperVersion* DBImpl::GetAndRefSuperVersion(uint32_t column_family_id) {
+ auto column_family_set = versions_->GetColumnFamilySet();
+ auto cfd = column_family_set->GetColumnFamily(column_family_id);
+ if (!cfd) {
+ return nullptr;
+ }
+
+ return GetAndRefSuperVersion(cfd);
+}
+
+void DBImpl::CleanupSuperVersion(SuperVersion* sv) {
+ // Release SuperVersion
+ if (sv->Unref()) {
+ bool defer_purge =
+ immutable_db_options().avoid_unnecessary_blocking_io;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ sv->Cleanup();
+ if (defer_purge) {
+ AddSuperVersionsToFreeQueue(sv);
+ SchedulePurge();
+ }
+ }
+ if (!defer_purge) {
+ delete sv;
+ }
+ RecordTick(stats_, NUMBER_SUPERVERSION_CLEANUPS);
+ }
+ RecordTick(stats_, NUMBER_SUPERVERSION_RELEASES);
+}
+
+void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd,
+ SuperVersion* sv) {
+ if (!cfd->ReturnThreadLocalSuperVersion(sv)) {
+ CleanupSuperVersion(sv);
+ }
+}
+
+// REQUIRED: this function should only be called on the write thread.
+void DBImpl::ReturnAndCleanupSuperVersion(uint32_t column_family_id,
+ SuperVersion* sv) {
+ auto column_family_set = versions_->GetColumnFamilySet();
+ auto cfd = column_family_set->GetColumnFamily(column_family_id);
+
+ // If SuperVersion is held, and we successfully fetched a cfd using
+ // GetAndRefSuperVersion(), it must still exist.
+ assert(cfd != nullptr);
+ ReturnAndCleanupSuperVersion(cfd, sv);
+}
+
+// REQUIRED: this function should only be called on the write thread or if the
+// mutex is held.
+ColumnFamilyHandle* DBImpl::GetColumnFamilyHandle(uint32_t column_family_id) {
+ ColumnFamilyMemTables* cf_memtables = column_family_memtables_.get();
+
+ if (!cf_memtables->Seek(column_family_id)) {
+ return nullptr;
+ }
+
+ return cf_memtables->GetColumnFamilyHandle();
+}
+
+// REQUIRED: mutex is NOT held.
+std::unique_ptr<ColumnFamilyHandle> DBImpl::GetColumnFamilyHandleUnlocked(
+ uint32_t column_family_id) {
+ InstrumentedMutexLock l(&mutex_);
+
+ auto* cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(column_family_id);
+ if (cfd == nullptr) {
+ return nullptr;
+ }
+
+ return std::unique_ptr<ColumnFamilyHandleImpl>(
+ new ColumnFamilyHandleImpl(cfd, this, &mutex_));
+}
+
+void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
+ const Range& range,
+ uint64_t* const count,
+ uint64_t* const size) {
+ ColumnFamilyHandleImpl* cfh =
+ reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ ColumnFamilyData* cfd = cfh->cfd();
+ SuperVersion* sv = GetAndRefSuperVersion(cfd);
+
+ // Convert user_key into a corresponding internal key.
+ InternalKey k1(range.start, kMaxSequenceNumber, kValueTypeForSeek);
+ InternalKey k2(range.limit, kMaxSequenceNumber, kValueTypeForSeek);
+ MemTable::MemTableStats memStats =
+ sv->mem->ApproximateStats(k1.Encode(), k2.Encode());
+ MemTable::MemTableStats immStats =
+ sv->imm->ApproximateStats(k1.Encode(), k2.Encode());
+ *count = memStats.count + immStats.count;
+ *size = memStats.size + immStats.size;
+
+ ReturnAndCleanupSuperVersion(cfd, sv);
+}
+
+Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Range* range, int n, uint64_t* sizes) {
+ if (!options.include_memtabtles && !options.include_files) {
+ return Status::InvalidArgument("Invalid options");
+ }
+
+ Version* v;
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+ SuperVersion* sv = GetAndRefSuperVersion(cfd);
+ v = sv->current;
+
+ for (int i = 0; i < n; i++) {
+ // Convert user_key into a corresponding internal key.
+ InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
+ InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
+ sizes[i] = 0;
+ if (options.include_files) {
+ sizes[i] += versions_->ApproximateSize(
+ options, v, k1.Encode(), k2.Encode(), /*start_level=*/0,
+ /*end_level=*/-1, TableReaderCaller::kUserApproximateSize);
+ }
+ if (options.include_memtabtles) {
+ sizes[i] += sv->mem->ApproximateStats(k1.Encode(), k2.Encode()).size;
+ sizes[i] += sv->imm->ApproximateStats(k1.Encode(), k2.Encode()).size;
+ }
+ }
+
+ ReturnAndCleanupSuperVersion(cfd, sv);
+ return Status::OK();
+}
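+
+// Illustrative sketch, not part of the upstream source: estimating the size
+// of a key range with the options-based overload above. "db" and the key
+// bounds are placeholders; the field spelling include_memtabtles matches the
+// SizeApproximationOptions struct referenced above.
+//
+//   rocksdb::SizeApproximationOptions sao;
+//   sao.include_files = true;
+//   sao.include_memtabtles = true;
+//   rocksdb::Range r("a", "z");
+//   uint64_t size = 0;
+//   rocksdb::Status s = db->GetApproximateSizes(
+//       sao, db->DefaultColumnFamily(), &r, 1, &size);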
+
+std::list<uint64_t>::iterator
+DBImpl::CaptureCurrentFileNumberInPendingOutputs() {
+ // We need to remember the iterator of our insert, because after the
+ // background job is done, we need to remove that element from
+ // pending_outputs_.
+ pending_outputs_.push_back(versions_->current_next_file_number());
+ auto pending_outputs_inserted_elem = pending_outputs_.end();
+ --pending_outputs_inserted_elem;
+ return pending_outputs_inserted_elem;
+}
+
+void DBImpl::ReleaseFileNumberFromPendingOutputs(
+ std::unique_ptr<std::list<uint64_t>::iterator>& v) {
+ if (v.get() != nullptr) {
+ pending_outputs_.erase(*v.get());
+ v.reset();
+ }
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::GetUpdatesSince(
+ SequenceNumber seq, std::unique_ptr<TransactionLogIterator>* iter,
+ const TransactionLogIterator::ReadOptions& read_options) {
+ RecordTick(stats_, GET_UPDATES_SINCE_CALLS);
+ if (seq > versions_->LastSequence()) {
+ return Status::NotFound("Requested sequence not yet written in the db");
+ }
+ return wal_manager_.GetUpdatesSince(seq, iter, read_options, versions_.get());
+}
+
+Status DBImpl::DeleteFile(std::string name) {
+ uint64_t number;
+ FileType type;
+ WalFileType log_type;
+ if (!ParseFileName(name, &number, &type, &log_type) ||
+ (type != kTableFile && type != kLogFile)) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log, "DeleteFile %s failed.\n",
+ name.c_str());
+ return Status::InvalidArgument("Invalid file name");
+ }
+
+ Status status;
+ if (type == kLogFile) {
+ // Only allow deleting archived log files
+ if (log_type != kArchivedLogFile) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "DeleteFile %s failed - not archived log.\n",
+ name.c_str());
+ return Status::NotSupported("Delete only supported for archived logs");
+ }
+ status = wal_manager_.DeleteFile(name, number);
+ if (!status.ok()) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "DeleteFile %s failed -- %s.\n", name.c_str(),
+ status.ToString().c_str());
+ }
+ return status;
+ }
+
+ int level;
+ FileMetaData* metadata;
+ ColumnFamilyData* cfd;
+ VersionEdit edit;
+ JobContext job_context(next_job_id_.fetch_add(1), true);
+ {
+ InstrumentedMutexLock l(&mutex_);
+ status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd);
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "DeleteFile %s failed. File not found\n", name.c_str());
+ job_context.Clean();
+ return Status::InvalidArgument("File not found");
+ }
+ assert(level < cfd->NumberLevels());
+
+    // If the file is being compacted, there is no need to delete it.
+ if (metadata->being_compacted) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "DeleteFile %s Skipped. File about to be compacted\n",
+ name.c_str());
+ job_context.Clean();
+ return Status::OK();
+ }
+
+ // Only the files in the last level can be deleted externally.
+ // This is to make sure that any deletion tombstones are not
+ // lost. Check that the level passed is the last level.
+    auto* vstorage = cfd->current()->storage_info();
+ for (int i = level + 1; i < cfd->NumberLevels(); i++) {
+      if (vstorage->NumLevelFiles(i) != 0) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "DeleteFile %s FAILED. File not in last level\n",
+ name.c_str());
+ job_context.Clean();
+ return Status::InvalidArgument("File not in last level");
+ }
+ }
+ // if level == 0, it has to be the oldest file
+ if (level == 0 &&
+        vstorage->LevelFiles(0).back()->fd.GetNumber() != number) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "DeleteFile %s failed ---"
+ " target file in level 0 must be the oldest.",
+ name.c_str());
+ job_context.Clean();
+ return Status::InvalidArgument("File in level 0, but not oldest");
+ }
+ edit.SetColumnFamily(cfd->GetID());
+ edit.DeleteFile(level, number);
+ status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+ &edit, &mutex_, directories_.GetDbDir());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd,
+ &job_context.superversion_contexts[0],
+ *cfd->GetLatestMutableCFOptions());
+ }
+ FindObsoleteFiles(&job_context, false);
+ } // lock released here
+
+ LogFlush(immutable_db_options_.info_log);
+ // remove files outside the db-lock
+ if (job_context.HaveSomethingToDelete()) {
+ // Call PurgeObsoleteFiles() without holding mutex.
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+ return status;
+}
+
+Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family,
+ const RangePtr* ranges, size_t n,
+ bool include_end) {
+ Status status;
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ ColumnFamilyData* cfd = cfh->cfd();
+ VersionEdit edit;
+ std::set<FileMetaData*> deleted_files;
+ JobContext job_context(next_job_id_.fetch_add(1), true);
+ {
+ InstrumentedMutexLock l(&mutex_);
+ Version* input_version = cfd->current();
+
+ auto* vstorage = input_version->storage_info();
+ for (size_t r = 0; r < n; r++) {
+ auto begin = ranges[r].start, end = ranges[r].limit;
+ for (int i = 1; i < cfd->NumberLevels(); i++) {
+ if (vstorage->LevelFiles(i).empty() ||
+ !vstorage->OverlapInLevel(i, begin, end)) {
+ continue;
+ }
+ std::vector<FileMetaData*> level_files;
+ InternalKey begin_storage, end_storage, *begin_key, *end_key;
+ if (begin == nullptr) {
+ begin_key = nullptr;
+ } else {
+ begin_storage.SetMinPossibleForUserKey(*begin);
+ begin_key = &begin_storage;
+ }
+ if (end == nullptr) {
+ end_key = nullptr;
+ } else {
+ end_storage.SetMaxPossibleForUserKey(*end);
+ end_key = &end_storage;
+ }
+
+ vstorage->GetCleanInputsWithinInterval(
+ i, begin_key, end_key, &level_files, -1 /* hint_index */,
+ nullptr /* file_index */);
+ FileMetaData* level_file;
+ for (uint32_t j = 0; j < level_files.size(); j++) {
+ level_file = level_files[j];
+ if (level_file->being_compacted) {
+ continue;
+ }
+ if (deleted_files.find(level_file) != deleted_files.end()) {
+ continue;
+ }
+ if (!include_end && end != nullptr &&
+ cfd->user_comparator()->Compare(level_file->largest.user_key(),
+ *end) == 0) {
+ continue;
+ }
+ edit.SetColumnFamily(cfd->GetID());
+ edit.DeleteFile(i, level_file->fd.GetNumber());
+ deleted_files.insert(level_file);
+ level_file->being_compacted = true;
+ }
+ }
+ }
+ if (edit.GetDeletedFiles().empty()) {
+ job_context.Clean();
+ return Status::OK();
+ }
+ input_version->Ref();
+ status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+ &edit, &mutex_, directories_.GetDbDir());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd,
+ &job_context.superversion_contexts[0],
+ *cfd->GetLatestMutableCFOptions());
+ }
+ for (auto* deleted_file : deleted_files) {
+ deleted_file->being_compacted = false;
+ }
+ input_version->Unref();
+ FindObsoleteFiles(&job_context, false);
+ } // lock released here
+
+ LogFlush(immutable_db_options_.info_log);
+ // remove files outside the db-lock
+ if (job_context.HaveSomethingToDelete()) {
+ // Call PurgeObsoleteFiles() without holding mutex.
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+ return status;
+}
+
+void DBImpl::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
+ InstrumentedMutexLock l(&mutex_);
+ versions_->GetLiveFilesMetaData(metadata);
+}
+
+void DBImpl::GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
+ ColumnFamilyMetaData* cf_meta) {
+ assert(column_family);
+ auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+ auto* sv = GetAndRefSuperVersion(cfd);
+ {
+ // Without mutex, Version::GetColumnFamilyMetaData will have data race with
+ // Compaction::MarkFilesBeingCompacted. One solution is to use mutex, but
+ // this may cause regression. An alternative is to make
+ // FileMetaData::being_compacted atomic, but it will make FileMetaData
+ // non-copy-able. Another option is to separate these variables from
+ // original FileMetaData struct, and this requires re-organization of data
+ // structures. For now, we take the easy approach. If
+ // DB::GetColumnFamilyMetaData is not called frequently, the regression
+ // should not be big. We still need to keep an eye on it.
+ InstrumentedMutexLock l(&mutex_);
+ sv->current->GetColumnFamilyMetaData(cf_meta);
+ }
+ ReturnAndCleanupSuperVersion(cfd, sv);
+}
+
+#endif // ROCKSDB_LITE
+
+Status DBImpl::CheckConsistency() {
+ mutex_.AssertHeld();
+ std::vector<LiveFileMetaData> metadata;
+ versions_->GetLiveFilesMetaData(&metadata);
+ TEST_SYNC_POINT("DBImpl::CheckConsistency:AfterGetLiveFilesMetaData");
+
+ std::string corruption_messages;
+
+ if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) {
+ // Instead of calling GetFileSize() for each expected file, call
+ // GetChildren() for the DB directory and check that all expected files
+ // are listed, without checking their sizes.
+ // Since sst files might be in different directories, do it for each
+ // directory separately.
+ std::map<std::string, std::vector<std::string>> files_by_directory;
+ for (const auto& md : metadata) {
+ // md.name has a leading "/". Remove it.
+ std::string fname = md.name;
+ if (!fname.empty() && fname[0] == '/') {
+ fname = fname.substr(1);
+ }
+ files_by_directory[md.db_path].push_back(fname);
+ }
+ for (const auto& dir_files : files_by_directory) {
+ std::string directory = dir_files.first;
+ std::vector<std::string> existing_files;
+ Status s = env_->GetChildren(directory, &existing_files);
+ if (!s.ok()) {
+ corruption_messages +=
+ "Can't list files in " + directory + ": " + s.ToString() + "\n";
+ continue;
+ }
+ std::sort(existing_files.begin(), existing_files.end());
+
+ for (const std::string& fname : dir_files.second) {
+ if (!std::binary_search(existing_files.begin(), existing_files.end(),
+ fname) &&
+ !std::binary_search(existing_files.begin(), existing_files.end(),
+ Rocks2LevelTableFileName(fname))) {
+ corruption_messages +=
+ "Missing sst file " + fname + " in " + directory + "\n";
+ }
+ }
+ }
+ } else {
+ for (const auto& md : metadata) {
+ // md.name has a leading "/".
+ std::string file_path = md.db_path + md.name;
+
+ uint64_t fsize = 0;
+ TEST_SYNC_POINT("DBImpl::CheckConsistency:BeforeGetFileSize");
+ Status s = env_->GetFileSize(file_path, &fsize);
+ if (!s.ok() &&
+ env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok()) {
+ s = Status::OK();
+ }
+ if (!s.ok()) {
+ corruption_messages +=
+ "Can't access " + md.name + ": " + s.ToString() + "\n";
+ } else if (fsize != md.size) {
+ corruption_messages += "Sst file size mismatch: " + file_path +
+ ". Size recorded in manifest " +
+ ToString(md.size) + ", actual size " +
+ ToString(fsize) + "\n";
+ }
+ }
+ }
+
+ if (corruption_messages.size() == 0) {
+ return Status::OK();
+ } else {
+ return Status::Corruption(corruption_messages);
+ }
+}
+
+Status DBImpl::GetDbIdentity(std::string& identity) const {
+ identity.assign(db_id_);
+ return Status::OK();
+}
+
+Status DBImpl::GetDbIdentityFromIdentityFile(std::string* identity) const {
+ std::string idfilename = IdentityFileName(dbname_);
+ const FileOptions soptions;
+
+ Status s = ReadFileToString(fs_.get(), idfilename, identity);
+ if (!s.ok()) {
+ return s;
+ }
+
+  // If the last character is '\n', remove it from identity.
+ if (identity->size() > 0 && identity->back() == '\n') {
+ identity->pop_back();
+ }
+ return s;
+}
+
+// Default implementation -- returns not supported status
+Status DB::CreateColumnFamily(const ColumnFamilyOptions& /*cf_options*/,
+ const std::string& /*column_family_name*/,
+ ColumnFamilyHandle** /*handle*/) {
+ return Status::NotSupported("");
+}
+
+Status DB::CreateColumnFamilies(
+ const ColumnFamilyOptions& /*cf_options*/,
+ const std::vector<std::string>& /*column_family_names*/,
+ std::vector<ColumnFamilyHandle*>* /*handles*/) {
+ return Status::NotSupported("");
+}
+
+Status DB::CreateColumnFamilies(
+ const std::vector<ColumnFamilyDescriptor>& /*column_families*/,
+ std::vector<ColumnFamilyHandle*>* /*handles*/) {
+ return Status::NotSupported("");
+}
+
+Status DB::DropColumnFamily(ColumnFamilyHandle* /*column_family*/) {
+ return Status::NotSupported("");
+}
+
+Status DB::DropColumnFamilies(
+ const std::vector<ColumnFamilyHandle*>& /*column_families*/) {
+ return Status::NotSupported("");
+}
+
+Status DB::DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family) {
+ delete column_family;
+ return Status::OK();
+}
+
+DB::~DB() {}
+
+Status DBImpl::Close() {
+ if (!closed_) {
+ {
+ InstrumentedMutexLock l(&mutex_);
+ // If there is unreleased snapshot, fail the close call
+ if (!snapshots_.empty()) {
+ return Status::Aborted("Cannot close DB with unreleased snapshot.");
+ }
+ }
+
+ closed_ = true;
+ return CloseImpl();
+ }
+ return Status::OK();
+}
+
+Status DB::ListColumnFamilies(const DBOptions& db_options,
+ const std::string& name,
+ std::vector<std::string>* column_families) {
+ FileSystem* fs = db_options.file_system.get();
+ LegacyFileSystemWrapper legacy_fs(db_options.env);
+ if (!fs) {
+ fs = &legacy_fs;
+ }
+ return VersionSet::ListColumnFamilies(column_families, name, fs);
+}
+
+Snapshot::~Snapshot() {}
+
+Status DestroyDB(const std::string& dbname, const Options& options,
+ const std::vector<ColumnFamilyDescriptor>& column_families) {
+ ImmutableDBOptions soptions(SanitizeOptions(dbname, options));
+ Env* env = soptions.env;
+ std::vector<std::string> filenames;
+ bool wal_in_db_path = IsWalDirSameAsDBPath(&soptions);
+
+ // Reset the logger because it holds a handle to the
+ // log file and prevents cleanup and directory removal
+ soptions.info_log.reset();
+ // Ignore error in case directory does not exist
+ env->GetChildren(dbname, &filenames);
+
+ FileLock* lock;
+ const std::string lockname = LockFileName(dbname);
+ Status result = env->LockFile(lockname, &lock);
+ if (result.ok()) {
+ uint64_t number;
+ FileType type;
+ InfoLogPrefix info_log_prefix(!soptions.db_log_dir.empty(), dbname);
+ for (const auto& fname : filenames) {
+ if (ParseFileName(fname, &number, info_log_prefix.prefix, &type) &&
+ type != kDBLockFile) { // Lock file will be deleted at end
+ Status del;
+ std::string path_to_delete = dbname + "/" + fname;
+ if (type == kMetaDatabase) {
+ del = DestroyDB(path_to_delete, options);
+ } else if (type == kTableFile || type == kLogFile) {
+ del = DeleteDBFile(&soptions, path_to_delete, dbname,
+ /*force_bg=*/false, /*force_fg=*/!wal_in_db_path);
+ } else {
+ del = env->DeleteFile(path_to_delete);
+ }
+ if (result.ok() && !del.ok()) {
+ result = del;
+ }
+ }
+ }
+
+ std::vector<std::string> paths;
+
+ for (const auto& path : options.db_paths) {
+ paths.emplace_back(path.path);
+ }
+ for (const auto& cf : column_families) {
+ for (const auto& path : cf.options.cf_paths) {
+ paths.emplace_back(path.path);
+ }
+ }
+
+ // Remove duplicate paths.
+ // Note that we compare only the actual paths but not path ids.
+    // The reason is that the same path can appear under different path_ids
+ // for different column families.
+ std::sort(paths.begin(), paths.end());
+ paths.erase(std::unique(paths.begin(), paths.end()), paths.end());
+
+ for (const auto& path : paths) {
+ if (env->GetChildren(path, &filenames).ok()) {
+ for (const auto& fname : filenames) {
+ if (ParseFileName(fname, &number, &type) &&
+              type == kTableFile) {  // only SST files are placed in db_paths / cf_paths
+ std::string table_path = path + "/" + fname;
+ Status del = DeleteDBFile(&soptions, table_path, dbname,
+ /*force_bg=*/false, /*force_fg=*/false);
+ if (result.ok() && !del.ok()) {
+ result = del;
+ }
+ }
+ }
+ env->DeleteDir(path);
+ }
+ }
+
+ std::vector<std::string> walDirFiles;
+ std::string archivedir = ArchivalDirectory(dbname);
+ bool wal_dir_exists = false;
+ if (dbname != soptions.wal_dir) {
+ wal_dir_exists = env->GetChildren(soptions.wal_dir, &walDirFiles).ok();
+ archivedir = ArchivalDirectory(soptions.wal_dir);
+ }
+
+    // The archive dir may be inside the wal dir or dbname directory, so it
+    // should be processed and removed before those; otherwise we would have
+    // trouble removing them.
+ std::vector<std::string> archiveFiles;
+ if (env->GetChildren(archivedir, &archiveFiles).ok()) {
+ // Delete archival files.
+ for (const auto& file : archiveFiles) {
+ if (ParseFileName(file, &number, &type) && type == kLogFile) {
+ Status del =
+ DeleteDBFile(&soptions, archivedir + "/" + file, archivedir,
+ /*force_bg=*/false, /*force_fg=*/!wal_in_db_path);
+ if (result.ok() && !del.ok()) {
+ result = del;
+ }
+ }
+ }
+ env->DeleteDir(archivedir);
+ }
+
+ // Delete log files in the WAL dir
+ if (wal_dir_exists) {
+ for (const auto& file : walDirFiles) {
+ if (ParseFileName(file, &number, &type) && type == kLogFile) {
+ Status del =
+ DeleteDBFile(&soptions, LogFileName(soptions.wal_dir, number),
+ soptions.wal_dir, /*force_bg=*/false,
+ /*force_fg=*/!wal_in_db_path);
+ if (result.ok() && !del.ok()) {
+ result = del;
+ }
+ }
+ }
+ env->DeleteDir(soptions.wal_dir);
+ }
+
+ env->UnlockFile(lock); // Ignore error since state is already gone
+ env->DeleteFile(lockname);
+
+ // sst_file_manager holds a ref to the logger. Make sure the logger is
+ // gone before trying to remove the directory.
+ soptions.sst_file_manager.reset();
+
+ env->DeleteDir(dbname); // Ignore error in case dir contains other files
+ }
+ return result;
+}
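+
+// Illustrative sketch, not part of the upstream source: DestroyDB() removes
+// all files that belong to a database, so it must only be called on a DB
+// that is not currently open. The path below is a placeholder.
+//
+//   rocksdb::Options options;
+//   rocksdb::Status s = rocksdb::DestroyDB("/tmp/example_db", options);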
+
+Status DBImpl::WriteOptionsFile(bool need_mutex_lock,
+ bool need_enter_write_thread) {
+#ifndef ROCKSDB_LITE
+ WriteThread::Writer w;
+ if (need_mutex_lock) {
+ mutex_.Lock();
+ } else {
+ mutex_.AssertHeld();
+ }
+ if (need_enter_write_thread) {
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ }
+
+ std::vector<std::string> cf_names;
+ std::vector<ColumnFamilyOptions> cf_opts;
+
+ // This part requires mutex to protect the column family options
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ cf_names.push_back(cfd->GetName());
+ cf_opts.push_back(cfd->GetLatestCFOptions());
+ }
+
+ // Unlock during expensive operations. New writes cannot get here
+ // because the single write thread ensures all new writes get queued.
+ DBOptions db_options =
+ BuildDBOptions(immutable_db_options_, mutable_db_options_);
+ mutex_.Unlock();
+
+ TEST_SYNC_POINT("DBImpl::WriteOptionsFile:1");
+ TEST_SYNC_POINT("DBImpl::WriteOptionsFile:2");
+
+ std::string file_name =
+ TempOptionsFileName(GetName(), versions_->NewFileNumber());
+ Status s = PersistRocksDBOptions(db_options, cf_names, cf_opts, file_name,
+ GetFileSystem());
+
+ if (s.ok()) {
+ s = RenameTempFileToOptionsFile(file_name);
+ }
+ // restore lock
+ if (!need_mutex_lock) {
+ mutex_.Lock();
+ }
+ if (need_enter_write_thread) {
+ write_thread_.ExitUnbatched(&w);
+ }
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "Unable to persist options -- %s", s.ToString().c_str());
+ if (immutable_db_options_.fail_if_options_file_error) {
+ return Status::IOError("Unable to persist options.",
+ s.ToString().c_str());
+ }
+ }
+#else
+ (void)need_mutex_lock;
+ (void)need_enter_write_thread;
+#endif // !ROCKSDB_LITE
+ return Status::OK();
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+void DeleteOptionsFilesHelper(const std::map<uint64_t, std::string>& filenames,
+ const size_t num_files_to_keep,
+ const std::shared_ptr<Logger>& info_log,
+ Env* env) {
+ if (filenames.size() <= num_files_to_keep) {
+ return;
+ }
+ for (auto iter = std::next(filenames.begin(), num_files_to_keep);
+ iter != filenames.end(); ++iter) {
+ if (!env->DeleteFile(iter->second).ok()) {
+ ROCKS_LOG_WARN(info_log, "Unable to delete options file %s",
+ iter->second.c_str());
+ }
+ }
+}
+} // namespace
+#endif // !ROCKSDB_LITE
+
+Status DBImpl::DeleteObsoleteOptionsFiles() {
+#ifndef ROCKSDB_LITE
+ std::vector<std::string> filenames;
+  // Use an ordered map to keep the filenames sorted from the newest
+ // to the oldest.
+ std::map<uint64_t, std::string> options_filenames;
+ Status s;
+ s = GetEnv()->GetChildren(GetName(), &filenames);
+ if (!s.ok()) {
+ return s;
+ }
+ for (auto& filename : filenames) {
+ uint64_t file_number;
+ FileType type;
+ if (ParseFileName(filename, &file_number, &type) && type == kOptionsFile) {
+ options_filenames.insert(
+ {std::numeric_limits<uint64_t>::max() - file_number,
+ GetName() + "/" + filename});
+ }
+ }
+
+  // Keep the latest 2 options files.
+ const size_t kNumOptionsFilesKept = 2;
+ DeleteOptionsFilesHelper(options_filenames, kNumOptionsFilesKept,
+ immutable_db_options_.info_log, GetEnv());
+ return Status::OK();
+#else
+ return Status::OK();
+#endif // !ROCKSDB_LITE
+}
+
+Status DBImpl::RenameTempFileToOptionsFile(const std::string& file_name) {
+#ifndef ROCKSDB_LITE
+ Status s;
+
+ uint64_t options_file_number = versions_->NewFileNumber();
+ std::string options_file_name =
+ OptionsFileName(GetName(), options_file_number);
+  // Retry if the file name happens to conflict with an existing one.
+ s = GetEnv()->RenameFile(file_name, options_file_name);
+ if (s.ok()) {
+ InstrumentedMutexLock l(&mutex_);
+ versions_->options_file_number_ = options_file_number;
+ }
+
+ if (0 == disable_delete_obsolete_files_) {
+ DeleteObsoleteOptionsFiles();
+ }
+ return s;
+#else
+ (void)file_name;
+ return Status::OK();
+#endif // !ROCKSDB_LITE
+}
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+
+void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* cfd) const {
+ if (immutable_db_options_.enable_thread_tracking) {
+ ThreadStatusUtil::NewColumnFamilyInfo(this, cfd, cfd->GetName(),
+ cfd->ioptions()->env);
+ }
+}
+
+void DBImpl::EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const {
+ if (immutable_db_options_.enable_thread_tracking) {
+ ThreadStatusUtil::EraseColumnFamilyInfo(cfd);
+ }
+}
+
+void DBImpl::EraseThreadStatusDbInfo() const {
+ if (immutable_db_options_.enable_thread_tracking) {
+ ThreadStatusUtil::EraseDatabaseInfo(this);
+ }
+}
+
+#else
+void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {}
+
+void DBImpl::EraseThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {}
+
+void DBImpl::EraseThreadStatusDbInfo() const {}
+#endif // ROCKSDB_USING_THREAD_STATUS
+
+//
+// A global method that can dump out the build version
+void DumpRocksDBBuildVersion(Logger* log) {
+#if !defined(IOS_CROSS_COMPILE)
+ // if we compile with Xcode, we don't run build_detect_version, so we don't
+ // generate util/build_version.cc
+ ROCKS_LOG_HEADER(log, "RocksDB version: %d.%d.%d\n", ROCKSDB_MAJOR,
+ ROCKSDB_MINOR, ROCKSDB_PATCH);
+ ROCKS_LOG_HEADER(log, "Git sha %s", rocksdb_build_git_sha);
+ ROCKS_LOG_HEADER(log, "Compile date %s", rocksdb_build_compile_date);
+#else
+ (void)log; // ignore "-Wunused-parameter"
+#endif
+}
+
+#ifndef ROCKSDB_LITE
+SequenceNumber DBImpl::GetEarliestMemTableSequenceNumber(SuperVersion* sv,
+ bool include_history) {
+ // Find the earliest sequence number that we know we can rely on reading
+ // from the memtable without needing to check sst files.
+ SequenceNumber earliest_seq =
+ sv->imm->GetEarliestSequenceNumber(include_history);
+ if (earliest_seq == kMaxSequenceNumber) {
+ earliest_seq = sv->mem->GetEarliestSequenceNumber();
+ }
+ assert(sv->mem->GetEarliestSequenceNumber() >= earliest_seq);
+
+ return earliest_seq;
+}
+#endif // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
+ bool cache_only,
+ SequenceNumber lower_bound_seq,
+ SequenceNumber* seq,
+ bool* found_record_for_key,
+ bool* is_blob_index) {
+ Status s;
+ MergeContext merge_context;
+ SequenceNumber max_covering_tombstone_seq = 0;
+
+ ReadOptions read_options;
+ SequenceNumber current_seq = versions_->LastSequence();
+ LookupKey lkey(key, current_seq);
+
+ *seq = kMaxSequenceNumber;
+ *found_record_for_key = false;
+
+ // Check if there is a record for this key in the latest memtable
+ sv->mem->Get(lkey, nullptr, &s, &merge_context, &max_covering_tombstone_seq,
+ seq, read_options, nullptr /*read_callback*/, is_blob_index);
+
+ if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+ // unexpected error reading memtable.
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Unexpected status returned from MemTable::Get: %s\n",
+ s.ToString().c_str());
+
+ return s;
+ }
+
+ if (*seq != kMaxSequenceNumber) {
+ // Found a sequence number, no need to check immutable memtables
+ *found_record_for_key = true;
+ return Status::OK();
+ }
+
+ SequenceNumber lower_bound_in_mem = sv->mem->GetEarliestSequenceNumber();
+ if (lower_bound_in_mem != kMaxSequenceNumber &&
+ lower_bound_in_mem < lower_bound_seq) {
+ *found_record_for_key = false;
+ return Status::OK();
+ }
+
+ // Check if there is a record for this key in the immutable memtables
+ sv->imm->Get(lkey, nullptr, &s, &merge_context, &max_covering_tombstone_seq,
+ seq, read_options, nullptr /*read_callback*/, is_blob_index);
+
+ if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+ // unexpected error reading memtable.
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Unexpected status returned from MemTableList::Get: %s\n",
+ s.ToString().c_str());
+
+ return s;
+ }
+
+ if (*seq != kMaxSequenceNumber) {
+ // Found a sequence number, no need to check memtable history
+ *found_record_for_key = true;
+ return Status::OK();
+ }
+
+ SequenceNumber lower_bound_in_imm = sv->imm->GetEarliestSequenceNumber();
+ if (lower_bound_in_imm != kMaxSequenceNumber &&
+ lower_bound_in_imm < lower_bound_seq) {
+ *found_record_for_key = false;
+ return Status::OK();
+ }
+
+  // Check if there is a record for this key in the memtable history
+ sv->imm->GetFromHistory(lkey, nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, seq, read_options,
+ is_blob_index);
+
+ if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+ // unexpected error reading memtable.
+ ROCKS_LOG_ERROR(
+ immutable_db_options_.info_log,
+ "Unexpected status returned from MemTableList::GetFromHistory: %s\n",
+ s.ToString().c_str());
+
+ return s;
+ }
+
+ if (*seq != kMaxSequenceNumber) {
+ // Found a sequence number, no need to check SST files
+ *found_record_for_key = true;
+ return Status::OK();
+ }
+
+ // We could do a sv->imm->GetEarliestSequenceNumber(/*include_history*/ true)
+ // check here to skip the history if possible. But currently the caller
+ // already does that. Maybe we should move the logic here later.
+
+ // TODO(agiardullo): possible optimization: consider checking cached
+ // SST files if cache_only=true?
+ if (!cache_only) {
+ // Check tables
+ sv->current->Get(read_options, lkey, nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, nullptr /* value_found */,
+ found_record_for_key, seq, nullptr /*read_callback*/,
+ is_blob_index);
+
+ if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+ // unexpected error reading SST files
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Unexpected status returned from Version::Get: %s\n",
+ s.ToString().c_str());
+ }
+ }
+
+ return s;
+}
+
+Status DBImpl::IngestExternalFile(
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& external_files,
+ const IngestExternalFileOptions& ingestion_options) {
+ IngestExternalFileArg arg;
+ arg.column_family = column_family;
+ arg.external_files = external_files;
+ arg.options = ingestion_options;
+ return IngestExternalFiles({arg});
+}
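+
+// Illustrative sketch, not part of the upstream source: ingesting an
+// externally built SST file into the default column family. The file path is
+// a placeholder and "db" is an open rocksdb::DB*.
+//
+//   rocksdb::IngestExternalFileOptions ifo;
+//   ifo.move_files = true;  // move the file instead of copying when possible
+//   rocksdb::Status s =
+//       db->IngestExternalFile({"/tmp/example.sst"}, ifo);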
+
+Status DBImpl::IngestExternalFiles(
+ const std::vector<IngestExternalFileArg>& args) {
+ if (args.empty()) {
+ return Status::InvalidArgument("ingestion arg list is empty");
+ }
+ {
+ std::unordered_set<ColumnFamilyHandle*> unique_cfhs;
+ for (const auto& arg : args) {
+ if (arg.column_family == nullptr) {
+ return Status::InvalidArgument("column family handle is null");
+ } else if (unique_cfhs.count(arg.column_family) > 0) {
+ return Status::InvalidArgument(
+ "ingestion args have duplicate column families");
+ }
+ unique_cfhs.insert(arg.column_family);
+ }
+ }
+ // Ingest multiple external SST files atomically.
+ size_t num_cfs = args.size();
+ for (size_t i = 0; i != num_cfs; ++i) {
+ if (args[i].external_files.empty()) {
+ char err_msg[128] = {0};
+ snprintf(err_msg, 128, "external_files[%zu] is empty", i);
+ return Status::InvalidArgument(err_msg);
+ }
+ }
+ for (const auto& arg : args) {
+ const IngestExternalFileOptions& ingest_opts = arg.options;
+ if (ingest_opts.ingest_behind &&
+ !immutable_db_options_.allow_ingest_behind) {
+ return Status::InvalidArgument(
+ "can't ingest_behind file in DB with allow_ingest_behind=false");
+ }
+ }
+
+ // TODO (yanqin) maybe handle the case in which column_families have
+ // duplicates
+ std::unique_ptr<std::list<uint64_t>::iterator> pending_output_elem;
+ size_t total = 0;
+ for (const auto& arg : args) {
+ total += arg.external_files.size();
+ }
+ uint64_t next_file_number = 0;
+ Status status = ReserveFileNumbersBeforeIngestion(
+ static_cast<ColumnFamilyHandleImpl*>(args[0].column_family)->cfd(), total,
+ pending_output_elem, &next_file_number);
+ if (!status.ok()) {
+ InstrumentedMutexLock l(&mutex_);
+ ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+ return status;
+ }
+
+ std::vector<ExternalSstFileIngestionJob> ingestion_jobs;
+ for (const auto& arg : args) {
+ auto* cfd = static_cast<ColumnFamilyHandleImpl*>(arg.column_family)->cfd();
+ ingestion_jobs.emplace_back(
+ env_, versions_.get(), cfd, immutable_db_options_, file_options_,
+ &snapshots_, arg.options, &directories_, &event_logger_);
+ }
+ std::vector<std::pair<bool, Status>> exec_results;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ exec_results.emplace_back(false, Status::OK());
+ }
+ // TODO(yanqin) maybe make jobs run in parallel
+ uint64_t start_file_number = next_file_number;
+ for (size_t i = 1; i != num_cfs; ++i) {
+ start_file_number += args[i - 1].external_files.size();
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+ SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+ exec_results[i].second = ingestion_jobs[i].Prepare(
+ args[i].external_files, start_file_number, super_version);
+ exec_results[i].first = true;
+ CleanupSuperVersion(super_version);
+ }
+ TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:0");
+ TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:1");
+ {
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[0].column_family)->cfd();
+ SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+ exec_results[0].second = ingestion_jobs[0].Prepare(
+ args[0].external_files, next_file_number, super_version);
+ exec_results[0].first = true;
+ CleanupSuperVersion(super_version);
+ }
+ for (const auto& exec_result : exec_results) {
+ if (!exec_result.second.ok()) {
+ status = exec_result.second;
+ break;
+ }
+ }
+ if (!status.ok()) {
+ for (size_t i = 0; i != num_cfs; ++i) {
+ if (exec_results[i].first) {
+ ingestion_jobs[i].Cleanup(status);
+ }
+ }
+ InstrumentedMutexLock l(&mutex_);
+ ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+ return status;
+ }
+
+ std::vector<SuperVersionContext> sv_ctxs;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ sv_ctxs.emplace_back(true /* create_superversion */);
+ }
+ TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeJobsRun:0");
+ TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeJobsRun:1");
+ TEST_SYNC_POINT("DBImpl::AddFile:Start");
+ {
+ InstrumentedMutexLock l(&mutex_);
+ TEST_SYNC_POINT("DBImpl::AddFile:MutexLock");
+
+ // Stop writes to the DB by entering both write threads
+ WriteThread::Writer w;
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ WriteThread::Writer nonmem_w;
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+
+ // When unordered_write is enabled, keys are written to the memtable in an
+ // unordered way. If the ingestion job checks the memtable key range before
+ // those keys land in the memtable, it may skip a necessary memtable flush.
+ // So wait here to ensure there are no pending writes to the memtable.
+ WaitForPendingWrites();
+
+ num_running_ingest_file_ += static_cast<int>(num_cfs);
+ TEST_SYNC_POINT("DBImpl::IngestExternalFile:AfterIncIngestFileCounter");
+
+ bool at_least_one_cf_need_flush = false;
+ std::vector<bool> need_flush(num_cfs, false);
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+ if (cfd->IsDropped()) {
+ // TODO (yanqin) investigate whether we should abort ingestion or
+ // proceed with other non-dropped column families.
+ status = Status::InvalidArgument(
+ "cannot ingest an external file into a dropped CF");
+ break;
+ }
+ bool tmp = false;
+ status = ingestion_jobs[i].NeedsFlush(&tmp, cfd->GetSuperVersion());
+ need_flush[i] = tmp;
+ at_least_one_cf_need_flush = (at_least_one_cf_need_flush || tmp);
+ if (!status.ok()) {
+ break;
+ }
+ }
+ TEST_SYNC_POINT_CALLBACK("DBImpl::IngestExternalFile:NeedFlush",
+ &at_least_one_cf_need_flush);
+
+ if (status.ok() && at_least_one_cf_need_flush) {
+ FlushOptions flush_opts;
+ flush_opts.allow_write_stall = true;
+ if (immutable_db_options_.atomic_flush) {
+ autovector<ColumnFamilyData*> cfds_to_flush;
+ SelectColumnFamiliesForAtomicFlush(&cfds_to_flush);
+ mutex_.Unlock();
+ status = AtomicFlushMemTables(cfds_to_flush, flush_opts,
+ FlushReason::kExternalFileIngestion,
+ true /* writes_stopped */);
+ mutex_.Lock();
+ } else {
+ for (size_t i = 0; i != num_cfs; ++i) {
+ if (need_flush[i]) {
+ mutex_.Unlock();
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)
+ ->cfd();
+ status = FlushMemTable(cfd, flush_opts,
+ FlushReason::kExternalFileIngestion,
+ true /* writes_stopped */);
+ mutex_.Lock();
+ if (!status.ok()) {
+ break;
+ }
+ }
+ }
+ }
+ }
+ // Run ingestion jobs.
+ if (status.ok()) {
+ for (size_t i = 0; i != num_cfs; ++i) {
+ status = ingestion_jobs[i].Run();
+ if (!status.ok()) {
+ break;
+ }
+ }
+ }
+ if (status.ok()) {
+ int consumed_seqno_count =
+ ingestion_jobs[0].ConsumedSequenceNumbersCount();
+#ifndef NDEBUG
+ for (size_t i = 1; i != num_cfs; ++i) {
+ assert(!!consumed_seqno_count ==
+ !!ingestion_jobs[i].ConsumedSequenceNumbersCount());
+ consumed_seqno_count +=
+ ingestion_jobs[i].ConsumedSequenceNumbersCount();
+ }
+#endif
+ if (consumed_seqno_count > 0) {
+ const SequenceNumber last_seqno = versions_->LastSequence();
+ versions_->SetLastAllocatedSequence(last_seqno + consumed_seqno_count);
+ versions_->SetLastPublishedSequence(last_seqno + consumed_seqno_count);
+ versions_->SetLastSequence(last_seqno + consumed_seqno_count);
+ }
+ autovector<ColumnFamilyData*> cfds_to_commit;
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ autovector<autovector<VersionEdit*>> edit_lists;
+ uint32_t num_entries = 0;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ cfds_to_commit.push_back(cfd);
+ mutable_cf_options_list.push_back(cfd->GetLatestMutableCFOptions());
+ autovector<VersionEdit*> edit_list;
+ edit_list.push_back(ingestion_jobs[i].edit());
+ edit_lists.push_back(edit_list);
+ ++num_entries;
+ }
+ // Mark the version edits as an atomic group if the number of version
+ // edits exceeds 1.
+ if (cfds_to_commit.size() > 1) {
+ for (auto& edits : edit_lists) {
+ assert(edits.size() == 1);
+ edits[0]->MarkAtomicGroup(--num_entries);
+ }
+ assert(0 == num_entries);
+ }
+ status =
+ versions_->LogAndApply(cfds_to_commit, mutable_cf_options_list,
+ edit_lists, &mutex_, directories_.GetDbDir());
+ }
+
+ if (status.ok()) {
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+ if (!cfd->IsDropped()) {
+ InstallSuperVersionAndScheduleWork(cfd, &sv_ctxs[i],
+ *cfd->GetLatestMutableCFOptions());
+#ifndef NDEBUG
+ if (0 == i && num_cfs > 1) {
+ TEST_SYNC_POINT(
+ "DBImpl::IngestExternalFiles:InstallSVForFirstCF:0");
+ TEST_SYNC_POINT(
+ "DBImpl::IngestExternalFiles:InstallSVForFirstCF:1");
+ }
+#endif // !NDEBUG
+ }
+ }
+ }
+
+ // Resume writes to the DB
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+ write_thread_.ExitUnbatched(&w);
+
+ if (status.ok()) {
+ for (auto& job : ingestion_jobs) {
+ job.UpdateStats();
+ }
+ }
+ ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+ num_running_ingest_file_ -= static_cast<int>(num_cfs);
+ if (0 == num_running_ingest_file_) {
+ bg_cv_.SignalAll();
+ }
+ TEST_SYNC_POINT("DBImpl::AddFile:MutexUnlock");
+ }
+ // mutex_ is unlocked here
+
+ // Cleanup
+ for (size_t i = 0; i != num_cfs; ++i) {
+ sv_ctxs[i].Clean();
+ // This may roll back jobs that have completed successfully. This is
+ // intended to preserve atomicity.
+ ingestion_jobs[i].Cleanup(status);
+ }
+ if (status.ok()) {
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+ if (!cfd->IsDropped()) {
+ NotifyOnExternalFileIngested(cfd, ingestion_jobs[i]);
+ }
+ }
+ }
+ return status;
+}
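+
+// Illustrative usage sketch (not part of the upstream source): atomic
+// ingestion into two column families, assuming an open DB* `db`, handles
+// `cf1`/`cf2`, and pre-built SST files at hypothetical paths.
+//
+//   rocksdb::IngestExternalFileArg arg1, arg2;
+//   arg1.column_family = cf1;
+//   arg1.external_files = {"/tmp/cf1_a.sst"};
+//   arg2.column_family = cf2;
+//   arg2.external_files = {"/tmp/cf2_a.sst"};
+//   rocksdb::Status s = db->IngestExternalFiles({arg1, arg2});
+//   // On success, both column families see the new files; on failure,
+//   // neither does.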
+
+Status DBImpl::CreateColumnFamilyWithImport(
+ const ColumnFamilyOptions& options, const std::string& column_family_name,
+ const ImportColumnFamilyOptions& import_options,
+ const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) {
+ assert(handle != nullptr);
+ assert(*handle == nullptr);
+ std::string cf_comparator_name = options.comparator->Name();
+ if (cf_comparator_name != metadata.db_comparator_name) {
+ return Status::InvalidArgument("Comparator name mismatch");
+ }
+
+ // Create column family.
+ auto status = CreateColumnFamily(options, column_family_name, handle);
+ if (!status.ok()) {
+ return status;
+ }
+
+ // Import sst files from metadata.
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(*handle);
+ auto cfd = cfh->cfd();
+ ImportColumnFamilyJob import_job(env_, versions_.get(), cfd,
+ immutable_db_options_, file_options_,
+ import_options, metadata.files);
+
+ SuperVersionContext dummy_sv_ctx(/* create_superversion */ true);
+ VersionEdit dummy_edit;
+ uint64_t next_file_number = 0;
+ std::unique_ptr<std::list<uint64_t>::iterator> pending_output_elem;
+ {
+ // Lock db mutex
+ InstrumentedMutexLock l(&mutex_);
+ if (error_handler_.IsDBStopped()) {
+ // Don't import files when there is a bg_error
+ status = error_handler_.GetBGError();
+ }
+
+ // Make sure that bg cleanup won't delete the files that we are importing
+ pending_output_elem.reset(new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
+
+ if (status.ok()) {
+ // If a crash happens after a hard link is established, the Recover
+ // function may reuse a file number that has already been assigned to the
+ // internal file, and this would overwrite the external file. To protect
+ // the external file, we have to make sure the file number is never reused.
+ next_file_number = versions_->FetchAddFileNumber(metadata.files.size());
+ auto cf_options = cfd->GetLatestMutableCFOptions();
+ status = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_,
+ directories_.GetDbDir());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options);
+ }
+ }
+ }
+ dummy_sv_ctx.Clean();
+
+ if (status.ok()) {
+ SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+ status = import_job.Prepare(next_file_number, sv);
+ CleanupSuperVersion(sv);
+ }
+
+ if (status.ok()) {
+ SuperVersionContext sv_context(true /*create_superversion*/);
+ {
+ // Lock db mutex
+ InstrumentedMutexLock l(&mutex_);
+
+ // Stop writes to the DB by entering both write threads
+ WriteThread::Writer w;
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ WriteThread::Writer nonmem_w;
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+
+ num_running_ingest_file_++;
+ assert(!cfd->IsDropped());
+ status = import_job.Run();
+
+ // Install job edit [Mutex will be unlocked here]
+ if (status.ok()) {
+ auto cf_options = cfd->GetLatestMutableCFOptions();
+ status = versions_->LogAndApply(cfd, *cf_options, import_job.edit(),
+ &mutex_, directories_.GetDbDir());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd, &sv_context, *cf_options);
+ }
+ }
+
+ // Resume writes to the DB
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+ write_thread_.ExitUnbatched(&w);
+
+ num_running_ingest_file_--;
+ if (num_running_ingest_file_ == 0) {
+ bg_cv_.SignalAll();
+ }
+ }
+ // mutex_ is unlocked here
+
+ sv_context.Clean();
+ }
+
+ {
+ InstrumentedMutexLock l(&mutex_);
+ ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+ }
+
+ import_job.Cleanup(status);
+ if (!status.ok()) {
+ DropColumnFamily(*handle);
+ DestroyColumnFamilyHandle(*handle);
+ *handle = nullptr;
+ }
+ return status;
+}
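+
+// Illustrative usage sketch (not part of the upstream source): the metadata
+// consumed above is typically produced by Checkpoint::ExportColumnFamily()
+// on a source DB. Paths and variable names are hypothetical.
+//
+//   rocksdb::Checkpoint* checkpoint = nullptr;
+//   rocksdb::Status s = rocksdb::Checkpoint::Create(src_db, &checkpoint);
+//   rocksdb::ExportImportFilesMetaData* metadata = nullptr;
+//   if (s.ok()) {
+//     s = checkpoint->ExportColumnFamily(src_cf, "/tmp/cf_export", &metadata);
+//   }
+//   // The ColumnFamilyOptions passed here must use the same comparator as
+//   // the exported column family, as checked above.
+//   rocksdb::ImportColumnFamilyOptions import_opts;  // move_files defaults to false
+//   rocksdb::ColumnFamilyHandle* new_cf = nullptr;
+//   if (s.ok()) {
+//     s = dst_db->CreateColumnFamilyWithImport(rocksdb::ColumnFamilyOptions(),
+//                                              "imported_cf", import_opts,
+//                                              *metadata, &new_cf);
+//   }
+//   delete metadata;
+//   delete checkpoint;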
+
+Status DBImpl::VerifyChecksum(const ReadOptions& read_options) {
+ Status s;
+ std::vector<ColumnFamilyData*> cfd_list;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (!cfd->IsDropped() && cfd->initialized()) {
+ cfd->Ref();
+ cfd_list.push_back(cfd);
+ }
+ }
+ }
+ std::vector<SuperVersion*> sv_list;
+ for (auto cfd : cfd_list) {
+ sv_list.push_back(cfd->GetReferencedSuperVersion(this));
+ }
+ for (auto& sv : sv_list) {
+ VersionStorageInfo* vstorage = sv->current->storage_info();
+ ColumnFamilyData* cfd = sv->current->cfd();
+ Options opts;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ opts = Options(BuildDBOptions(immutable_db_options_, mutable_db_options_),
+ cfd->GetLatestCFOptions());
+ }
+ for (int i = 0; i < vstorage->num_non_empty_levels() && s.ok(); i++) {
+ for (size_t j = 0; j < vstorage->LevelFilesBrief(i).num_files && s.ok();
+ j++) {
+ const auto& fd = vstorage->LevelFilesBrief(i).files[j].fd;
+ std::string fname = TableFileName(cfd->ioptions()->cf_paths,
+ fd.GetNumber(), fd.GetPathId());
+ s = ROCKSDB_NAMESPACE::VerifySstFileChecksum(opts, file_options_,
+ read_options, fname);
+ }
+ }
+ if (!s.ok()) {
+ break;
+ }
+ }
+ bool defer_purge = immutable_db_options().avoid_unnecessary_blocking_io;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ for (auto sv : sv_list) {
+ if (sv && sv->Unref()) {
+ sv->Cleanup();
+ if (defer_purge) {
+ AddSuperVersionsToFreeQueue(sv);
+ } else {
+ delete sv;
+ }
+ }
+ }
+ if (defer_purge) {
+ SchedulePurge();
+ }
+ for (auto cfd : cfd_list) {
+ cfd->UnrefAndTryDelete();
+ }
+ }
+ return s;
+}
+
+void DBImpl::NotifyOnExternalFileIngested(
+ ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job) {
+ if (immutable_db_options_.listeners.empty()) {
+ return;
+ }
+
+ for (const IngestedFileInfo& f : ingestion_job.files_to_ingest()) {
+ ExternalFileIngestionInfo info;
+ info.cf_name = cfd->GetName();
+ info.external_file_path = f.external_file_path;
+ info.internal_file_path = f.internal_file_path;
+ info.global_seqno = f.assigned_seqno;
+ info.table_properties = f.table_properties;
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnExternalFileIngested(this, info);
+ }
+ }
+}
+
+void DBImpl::WaitForIngestFile() {
+ mutex_.AssertHeld();
+ while (num_running_ingest_file_ > 0) {
+ bg_cv_.Wait();
+ }
+}
+
+Status DBImpl::StartTrace(const TraceOptions& trace_options,
+ std::unique_ptr<TraceWriter>&& trace_writer) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ tracer_.reset(new Tracer(env_, trace_options, std::move(trace_writer)));
+ return Status::OK();
+}
+
+Status DBImpl::EndTrace() {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ Status s;
+ if (tracer_ != nullptr) {
+ s = tracer_->Close();
+ tracer_.reset();
+ } else {
+ return Status::IOError("No trace file to close");
+ }
+ return s;
+}
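+
+// Illustrative usage sketch (not part of the upstream source): workload
+// tracing is bracketed by StartTrace()/EndTrace(). This assumes the
+// NewFileTraceWriter() helper (declared alongside TraceWriter), an open
+// DB* `db`, and a hypothetical trace path.
+//
+//   std::unique_ptr<rocksdb::TraceWriter> trace_writer;
+//   rocksdb::Status s = rocksdb::NewFileTraceWriter(
+//       db->GetEnv(), rocksdb::EnvOptions(), "/tmp/db.trace", &trace_writer);
+//   if (s.ok()) {
+//     s = db->StartTrace(rocksdb::TraceOptions(), std::move(trace_writer));
+//   }
+//   // ... run the workload to be captured ...
+//   if (s.ok()) s = db->EndTrace();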
+
+Status DBImpl::StartBlockCacheTrace(
+ const TraceOptions& trace_options,
+ std::unique_ptr<TraceWriter>&& trace_writer) {
+ return block_cache_tracer_.StartTrace(env_, trace_options,
+ std::move(trace_writer));
+}
+
+Status DBImpl::EndBlockCacheTrace() {
+ block_cache_tracer_.EndTrace();
+ return Status::OK();
+}
+
+Status DBImpl::TraceIteratorSeek(const uint32_t& cf_id, const Slice& key) {
+ Status s;
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ s = tracer_->IteratorSeek(cf_id, key);
+ }
+ }
+ return s;
+}
+
+Status DBImpl::TraceIteratorSeekForPrev(const uint32_t& cf_id,
+ const Slice& key) {
+ Status s;
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ s = tracer_->IteratorSeekForPrev(cf_id, key);
+ }
+ }
+ return s;
+}
+
+Status DBImpl::ReserveFileNumbersBeforeIngestion(
+ ColumnFamilyData* cfd, uint64_t num,
+ std::unique_ptr<std::list<uint64_t>::iterator>& pending_output_elem,
+ uint64_t* next_file_number) {
+ Status s;
+ SuperVersionContext dummy_sv_ctx(true /* create_superversion */);
+ assert(nullptr != next_file_number);
+ InstrumentedMutexLock l(&mutex_);
+ if (error_handler_.IsDBStopped()) {
+ // Do not ingest files when there is a bg_error
+ return error_handler_.GetBGError();
+ }
+ pending_output_elem.reset(new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
+ *next_file_number = versions_->FetchAddFileNumber(static_cast<uint64_t>(num));
+ auto cf_options = cfd->GetLatestMutableCFOptions();
+ VersionEdit dummy_edit;
+ // If a crash happens after a hard link is established, the Recover
+ // function may reuse a file number that has already been assigned to the
+ // internal file, and this would overwrite the external file. To protect
+ // the external file, we have to make sure the file number is never reused.
+ s = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_,
+ directories_.GetDbDir());
+ if (s.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options);
+ }
+ dummy_sv_ctx.Clean();
+ return s;
+}
+
+Status DBImpl::GetCreationTimeOfOldestFile(uint64_t* creation_time) {
+ if (mutable_db_options_.max_open_files == -1) {
+ uint64_t oldest_time = port::kMaxUint64;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (!cfd->IsDropped()) {
+ uint64_t ctime;
+ {
+ SuperVersion* sv = GetAndRefSuperVersion(cfd);
+ Version* version = sv->current;
+ version->GetCreationTimeOfOldestFile(&ctime);
+ ReturnAndCleanupSuperVersion(cfd, sv);
+ }
+
+ if (ctime < oldest_time) {
+ oldest_time = ctime;
+ }
+ if (oldest_time == 0) {
+ break;
+ }
+ }
+ }
+ *creation_time = oldest_time;
+ return Status::OK();
+ } else {
+ return Status::NotSupported("This API only works if max_open_files = -1");
+ }
+}
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl.h b/src/rocksdb/db/db_impl/db_impl.h
new file mode 100644
index 000000000..119555cb4
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl.h
@@ -0,0 +1,2107 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <functional>
+#include <limits>
+#include <list>
+#include <map>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/compaction/compaction_job.h"
+#include "db/dbformat.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "db/external_sst_file_ingestion_job.h"
+#include "db/flush_job.h"
+#include "db/flush_scheduler.h"
+#include "db/import_column_family_job.h"
+#include "db/internal_stats.h"
+#include "db/log_writer.h"
+#include "db/logs_with_prep_tracker.h"
+#include "db/memtable_list.h"
+#include "db/pre_release_callback.h"
+#include "db/range_del_aggregator.h"
+#include "db/read_callback.h"
+#include "db/snapshot_checker.h"
+#include "db/snapshot_impl.h"
+#include "db/trim_history_scheduler.h"
+#include "db/version_edit.h"
+#include "db/wal_manager.h"
+#include "db/write_controller.h"
+#include "db/write_thread.h"
+#include "logging/event_logger.h"
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/status.h"
+#include "rocksdb/trace_reader_writer.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/scoped_arena_iterator.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "trace_replay/trace_replay.h"
+#include "util/autovector.h"
+#include "util/hash.h"
+#include "util/repeatable_thread.h"
+#include "util/stop_watch.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class ArenaWrappedDBIter;
+class InMemoryStatsHistoryIterator;
+class MemTable;
+class PersistentStatsHistoryIterator;
+class TableCache;
+class TaskLimiterToken;
+class Version;
+class VersionEdit;
+class VersionSet;
+class WriteCallback;
+struct JobContext;
+struct ExternalSstFileInfo;
+struct MemTableInfo;
+
+// Class to maintain directories for all database paths other than the main one.
+class Directories {
+ public:
+ Status SetDirectories(Env* env, const std::string& dbname,
+ const std::string& wal_dir,
+ const std::vector<DbPath>& data_paths);
+
+ Directory* GetDataDir(size_t path_id) const {
+ assert(path_id < data_dirs_.size());
+ Directory* ret_dir = data_dirs_[path_id].get();
+ if (ret_dir == nullptr) {
+ // Should use db_dir_
+ return db_dir_.get();
+ }
+ return ret_dir;
+ }
+
+ Directory* GetWalDir() {
+ if (wal_dir_) {
+ return wal_dir_.get();
+ }
+ return db_dir_.get();
+ }
+
+ Directory* GetDbDir() { return db_dir_.get(); }
+
+ private:
+ std::unique_ptr<Directory> db_dir_;
+ std::vector<std::unique_ptr<Directory>> data_dirs_;
+ std::unique_ptr<Directory> wal_dir_;
+};
+
+// While DB is the public interface of RocksDB, DBImpl is the actual
+// class implementing it. It is the entry point to the core RocksDB engine.
+// All other DB implementations, e.g. TransactionDB, BlobDB, etc, wrap a
+// DBImpl internally.
+// Other than functions implementing the DB interface, some public
+// functions are there for other internal components to call. For
+// example, TransactionDB directly calls DBImpl::WriteImpl() and
+// BlobDB directly calls DBImpl::GetImpl(). Some other functions
+// are for sub-components to call. For example, ColumnFamilyHandleImpl
+// calls DBImpl::FindObsoleteFiles().
+//
+// Since it's a very large class, the definition of the functions is
+// divided in several db_impl_*.cc files, besides db_impl.cc.
+class DBImpl : public DB {
+ public:
+ DBImpl(const DBOptions& options, const std::string& dbname,
+ const bool seq_per_batch = false, const bool batch_per_txn = true);
+ // No copying allowed
+ DBImpl(const DBImpl&) = delete;
+ void operator=(const DBImpl&) = delete;
+
+ virtual ~DBImpl();
+
+ // ---- Implementations of the DB interface ----
+
+ using DB::Resume;
+ virtual Status Resume() override;
+
+ using DB::Put;
+ virtual Status Put(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) override;
+ using DB::Merge;
+ virtual Status Merge(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) override;
+ using DB::Delete;
+ virtual Status Delete(const WriteOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key) override;
+ using DB::SingleDelete;
+ virtual Status SingleDelete(const WriteOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key) override;
+ using DB::Write;
+ virtual Status Write(const WriteOptions& options,
+ WriteBatch* updates) override;
+
+ using DB::Get;
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) override;
+
+ using DB::GetMergeOperands;
+ Status GetMergeOperands(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* merge_operands,
+ GetMergeOperandsOptions* get_merge_operands_options,
+ int* number_of_operands) override {
+ GetImplOptions get_impl_options;
+ get_impl_options.column_family = column_family;
+ get_impl_options.merge_operands = merge_operands;
+ get_impl_options.get_merge_operands_options = get_merge_operands_options;
+ get_impl_options.number_of_operands = number_of_operands;
+ get_impl_options.get_value = false;
+ return GetImpl(options, key, get_impl_options);
+ }
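+
+ // Illustrative usage sketch (not part of the upstream source) for
+ // GetMergeOperands() above: read all merge operands for a key without
+ // merging them, assuming an open DB* `db` with a merge operator configured
+ // and a hypothetical bound of 4 operands.
+ //
+ //   const int kMaxOperands = 4;
+ //   rocksdb::PinnableSlice operands[kMaxOperands];
+ //   rocksdb::GetMergeOperandsOptions gmo_options;
+ //   gmo_options.expected_max_number_of_operands = kMaxOperands;
+ //   int num_operands = 0;
+ //   rocksdb::Status s = db->GetMergeOperands(
+ //       rocksdb::ReadOptions(), db->DefaultColumnFamily(), "counter_key",
+ //       operands, &gmo_options, &num_operands);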
+
+ using DB::MultiGet;
+ virtual std::vector<Status> MultiGet(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys,
+ std::vector<std::string>* values) override;
+
+ // This MultiGet is a batched version, which may be faster than calling Get
+ // multiple times, especially if the keys have some spatial locality that
+ // enables them to be queried in the same SST files/set of files. The larger
+ // the batch size, the more scope for batching and performance improvement.
+ // The values and statuses parameters are arrays with a number of elements
+ // equal to keys.size(). This allows the storage for those to be allocated
+ // by the caller on the stack for small batches (see the sketch below).
+ virtual void MultiGet(const ReadOptions& options,
+ ColumnFamilyHandle* column_family,
+ const size_t num_keys, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ const bool sorted_input = false) override;
+
+ virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
+ ColumnFamilyHandle** column_families, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ const bool sorted_input = false) override;
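+
+ // Illustrative usage sketch (not part of the upstream source) for the
+ // batched MultiGet above, with caller-provided stack arrays; keys are
+ // hypothetical and `db` is an open DB*.
+ //
+ //   constexpr size_t kNumKeys = 3;
+ //   rocksdb::Slice keys[kNumKeys] = {"k1", "k2", "k3"};
+ //   rocksdb::PinnableSlice values[kNumKeys];
+ //   rocksdb::Status statuses[kNumKeys];
+ //   db->MultiGet(rocksdb::ReadOptions(), db->DefaultColumnFamily(),
+ //                kNumKeys, keys, values, statuses);
+ //   // statuses[i] tells whether values[i] holds a value (OK) or NotFound.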
+
+ virtual void MultiGetWithCallback(
+ const ReadOptions& options, ColumnFamilyHandle* column_family,
+ ReadCallback* callback,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys);
+
+ virtual Status CreateColumnFamily(const ColumnFamilyOptions& cf_options,
+ const std::string& column_family,
+ ColumnFamilyHandle** handle) override;
+ virtual Status CreateColumnFamilies(
+ const ColumnFamilyOptions& cf_options,
+ const std::vector<std::string>& column_family_names,
+ std::vector<ColumnFamilyHandle*>* handles) override;
+ virtual Status CreateColumnFamilies(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles) override;
+ virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override;
+ virtual Status DropColumnFamilies(
+ const std::vector<ColumnFamilyHandle*>& column_families) override;
+
+ // Returns false if the key definitely does not exist in the database and
+ // true if it may. If value_found is not passed in as null, then the value
+ // is returned if it is found in memory. On return, *value_found is set to
+ // true if the value was found, and false otherwise.
+ using DB::KeyMayExist;
+ virtual bool KeyMayExist(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ std::string* value,
+ bool* value_found = nullptr) override;
+
+ using DB::NewIterator;
+ virtual Iterator* NewIterator(const ReadOptions& options,
+ ColumnFamilyHandle* column_family) override;
+ virtual Status NewIterators(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) override;
+
+ virtual const Snapshot* GetSnapshot() override;
+ virtual void ReleaseSnapshot(const Snapshot* snapshot) override;
+ using DB::GetProperty;
+ virtual bool GetProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, std::string* value) override;
+ using DB::GetMapProperty;
+ virtual bool GetMapProperty(
+ ColumnFamilyHandle* column_family, const Slice& property,
+ std::map<std::string, std::string>* value) override;
+ using DB::GetIntProperty;
+ virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, uint64_t* value) override;
+ using DB::GetAggregatedIntProperty;
+ virtual bool GetAggregatedIntProperty(const Slice& property,
+ uint64_t* aggregated_value) override;
+ using DB::GetApproximateSizes;
+ virtual Status GetApproximateSizes(const SizeApproximationOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Range* range, int n,
+ uint64_t* sizes) override;
+ using DB::GetApproximateMemTableStats;
+ virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
+ const Range& range,
+ uint64_t* const count,
+ uint64_t* const size) override;
+ using DB::CompactRange;
+ virtual Status CompactRange(const CompactRangeOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end) override;
+
+ using DB::CompactFiles;
+ virtual Status CompactFiles(
+ const CompactionOptions& compact_options,
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& input_file_names, const int output_level,
+ const int output_path_id = -1,
+ std::vector<std::string>* const output_file_names = nullptr,
+ CompactionJobInfo* compaction_job_info = nullptr) override;
+
+ virtual Status PauseBackgroundWork() override;
+ virtual Status ContinueBackgroundWork() override;
+
+ virtual Status EnableAutoCompaction(
+ const std::vector<ColumnFamilyHandle*>& column_family_handles) override;
+
+ virtual void EnableManualCompaction() override;
+ virtual void DisableManualCompaction() override;
+
+ using DB::SetOptions;
+ Status SetOptions(
+ ColumnFamilyHandle* column_family,
+ const std::unordered_map<std::string, std::string>& options_map) override;
+
+ virtual Status SetDBOptions(
+ const std::unordered_map<std::string, std::string>& options_map) override;
+
+ using DB::NumberLevels;
+ virtual int NumberLevels(ColumnFamilyHandle* column_family) override;
+ using DB::MaxMemCompactionLevel;
+ virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) override;
+ using DB::Level0StopWriteTrigger;
+ virtual int Level0StopWriteTrigger(
+ ColumnFamilyHandle* column_family) override;
+ virtual const std::string& GetName() const override;
+ virtual Env* GetEnv() const override;
+ virtual FileSystem* GetFileSystem() const override;
+ using DB::GetOptions;
+ virtual Options GetOptions(ColumnFamilyHandle* column_family) const override;
+ using DB::GetDBOptions;
+ virtual DBOptions GetDBOptions() const override;
+ using DB::Flush;
+ virtual Status Flush(const FlushOptions& options,
+ ColumnFamilyHandle* column_family) override;
+ virtual Status Flush(
+ const FlushOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families) override;
+ virtual Status FlushWAL(bool sync) override;
+ bool TEST_WALBufferIsEmpty(bool lock = true);
+ virtual Status SyncWAL() override;
+ virtual Status LockWAL() override;
+ virtual Status UnlockWAL() override;
+
+ virtual SequenceNumber GetLatestSequenceNumber() const override;
+
+ virtual bool SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) override;
+
+ virtual Status GetDbIdentity(std::string& identity) const override;
+
+ virtual Status GetDbIdentityFromIdentityFile(std::string* identity) const;
+
+ ColumnFamilyHandle* DefaultColumnFamily() const override;
+
+ ColumnFamilyHandle* PersistentStatsColumnFamily() const;
+
+ virtual Status Close() override;
+
+ Status GetStatsHistory(
+ uint64_t start_time, uint64_t end_time,
+ std::unique_ptr<StatsHistoryIterator>* stats_iterator) override;
+
+#ifndef ROCKSDB_LITE
+ using DB::ResetStats;
+ virtual Status ResetStats() override;
+ virtual Status DisableFileDeletions() override;
+ virtual Status EnableFileDeletions(bool force) override;
+ virtual int IsFileDeletionsEnabled() const;
+ // All the returned filenames start with "/"
+ virtual Status GetLiveFiles(std::vector<std::string>&,
+ uint64_t* manifest_file_size,
+ bool flush_memtable = true) override;
+ virtual Status GetSortedWalFiles(VectorLogPtr& files) override;
+ virtual Status GetCurrentWalFile(
+ std::unique_ptr<LogFile>* current_log_file) override;
+ virtual Status GetCreationTimeOfOldestFile(
+ uint64_t* creation_time) override;
+
+ virtual Status GetUpdatesSince(
+ SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
+ const TransactionLogIterator::ReadOptions& read_options =
+ TransactionLogIterator::ReadOptions()) override;
+ virtual Status DeleteFile(std::string name) override;
+ Status DeleteFilesInRanges(ColumnFamilyHandle* column_family,
+ const RangePtr* ranges, size_t n,
+ bool include_end = true);
+
+ virtual void GetLiveFilesMetaData(
+ std::vector<LiveFileMetaData>* metadata) override;
+
+ // Obtains the meta data of the specified column family of the DB.
+ // Status::NotFound() will be returned if the current DB does not have
+ // any column family matching the specified name.
+ // TODO(yhchiang): output parameter is placed in the end in this codebase.
+ virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
+ ColumnFamilyMetaData* metadata) override;
+
+ Status SuggestCompactRange(ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end) override;
+
+ Status PromoteL0(ColumnFamilyHandle* column_family,
+ int target_level) override;
+
+ using DB::IngestExternalFile;
+ virtual Status IngestExternalFile(
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& external_files,
+ const IngestExternalFileOptions& ingestion_options) override;
+
+ using DB::IngestExternalFiles;
+ virtual Status IngestExternalFiles(
+ const std::vector<IngestExternalFileArg>& args) override;
+
+ using DB::CreateColumnFamilyWithImport;
+ virtual Status CreateColumnFamilyWithImport(
+ const ColumnFamilyOptions& options, const std::string& column_family_name,
+ const ImportColumnFamilyOptions& import_options,
+ const ExportImportFilesMetaData& metadata,
+ ColumnFamilyHandle** handle) override;
+
+ using DB::VerifyChecksum;
+ virtual Status VerifyChecksum(const ReadOptions& /*read_options*/) override;
+
+ using DB::StartTrace;
+ virtual Status StartTrace(
+ const TraceOptions& options,
+ std::unique_ptr<TraceWriter>&& trace_writer) override;
+
+ using DB::EndTrace;
+ virtual Status EndTrace() override;
+
+ using DB::StartBlockCacheTrace;
+ Status StartBlockCacheTrace(
+ const TraceOptions& options,
+ std::unique_ptr<TraceWriter>&& trace_writer) override;
+
+ using DB::EndBlockCacheTrace;
+ Status EndBlockCacheTrace() override;
+
+ using DB::GetPropertiesOfAllTables;
+ virtual Status GetPropertiesOfAllTables(
+ ColumnFamilyHandle* column_family,
+ TablePropertiesCollection* props) override;
+ virtual Status GetPropertiesOfTablesInRange(
+ ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
+ TablePropertiesCollection* props) override;
+
+#endif // ROCKSDB_LITE
+
+ // ---- End of implementations of the DB interface ----
+
+ struct GetImplOptions {
+ ColumnFamilyHandle* column_family = nullptr;
+ PinnableSlice* value = nullptr;
+ bool* value_found = nullptr;
+ ReadCallback* callback = nullptr;
+ bool* is_blob_index = nullptr;
+ // If true, return the value associated with key via the value pointer;
+ // otherwise return all merge operands for key via the merge_operands
+ // pointer.
+ bool get_value = true;
+ // Pointer to an array of size
+ // get_merge_operands_options.expected_max_number_of_operands allocated by
+ // user
+ PinnableSlice* merge_operands = nullptr;
+ GetMergeOperandsOptions* get_merge_operands_options = nullptr;
+ int* number_of_operands = nullptr;
+ };
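+
+ // For orientation (not part of the upstream comment): a plain Get() fills
+ // this struct roughly as follows before delegating to GetImpl() (sketch,
+ // member names as declared above):
+ //
+ //   GetImplOptions get_impl_options;
+ //   get_impl_options.column_family = column_family;
+ //   get_impl_options.value = pinnable_val;  // get_value stays true
+ //   return GetImpl(read_options, key, get_impl_options);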
+
+ // Function that Get and KeyMayExist call with no_io true or false.
+ // Note: 'value_found' from KeyMayExist propagates here.
+ // This function is also called by GetMergeOperands.
+ // If get_impl_options.get_value = true, the value associated with the key
+ // is returned via get_impl_options.value.
+ // If get_impl_options.get_value = false, the merge operands associated with
+ // the key are returned via get_impl_options.merge_operands.
+ Status GetImpl(const ReadOptions& options, const Slice& key,
+ GetImplOptions get_impl_options);
+
+ ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options,
+ ColumnFamilyData* cfd,
+ SequenceNumber snapshot,
+ ReadCallback* read_callback,
+ bool allow_blob = false,
+ bool allow_refresh = true);
+
+ virtual SequenceNumber GetLastPublishedSequence() const {
+ if (last_seq_same_as_publish_seq_) {
+ return versions_->LastSequence();
+ } else {
+ return versions_->LastPublishedSequence();
+ }
+ }
+
+ // REQUIRES: joined the main write queue if two_write_queues is disabled, and
+ // the second write queue otherwise.
+ virtual void SetLastPublishedSequence(SequenceNumber seq);
+ // Returns LastSequence in last_seq_same_as_publish_seq_
+ // mode and LastAllocatedSequence otherwise. This is useful when visibility
+ // depends also on data written to the WAL but not to the memtable.
+ SequenceNumber TEST_GetLastVisibleSequence() const;
+
+#ifndef ROCKSDB_LITE
+ // Similar to Write() but will call the callback once on the single write
+ // thread to determine whether it is safe to perform the write.
+ virtual Status WriteWithCallback(const WriteOptions& write_options,
+ WriteBatch* my_batch,
+ WriteCallback* callback);
+
+ // Returns the sequence number that is guaranteed to be smaller than or equal
+ // to the sequence number of any key that could be inserted into the current
+ // memtables. It can then be assumed that any write with a larger (or equal)
+ // sequence number will be present in this memtable or a later memtable.
+ //
+ // If the earliest sequence number could not be determined,
+ // kMaxSequenceNumber will be returned.
+ //
+ // If include_history=true, will also search Memtables in MemTableList
+ // History.
+ SequenceNumber GetEarliestMemTableSequenceNumber(SuperVersion* sv,
+ bool include_history);
+
+ // For a given key, check to see if there are any records for this key
+ // in the memtables, including memtable history. If cache_only is false,
+ // SST files will also be checked.
+ //
+ // If a key is found, *found_record_for_key will be set to true and
+ // *seq will be set to the stored sequence number for the latest
+ // operation on this key or kMaxSequenceNumber if unknown.
+ // If no key is found, *found_record_for_key will be set to false.
+ //
+ // Note: If cache_only=false, it is possible for *seq to be set to 0 if
+ // the sequence number has been cleared from the record. If the caller is
+ // holding an active db snapshot, we know the missing sequence must be less
+ // than the snapshot's sequence number (sequence numbers are only cleared
+ // when there are no earlier active snapshots).
+ //
+ // If NotFound is returned and found_record_for_key is set to false, then no
+ // record for this key was found. If the caller is holding an active db
+ // snapshot, we know that no such key could have existed after this snapshot
+ // (since we do not compact keys that have an earlier snapshot).
+ //
+ // Only records newer than or at `lower_bound_seq` are guaranteed to be
+ // returned. Memtables and files may not be checked if they only contain
+ // data older than `lower_bound_seq`.
+ //
+ // Returns OK or NotFound on success,
+ // other status on unexpected error.
+ // TODO(andrewkr): this API need to be aware of range deletion operations
+ Status GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
+ bool cache_only,
+ SequenceNumber lower_bound_seq,
+ SequenceNumber* seq,
+ bool* found_record_for_key,
+ bool* is_blob_index = nullptr);
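+
+ // For orientation (not part of the upstream comment): transaction
+ // write-conflict validation is the typical caller. A caller holding a
+ // snapshot with sequence number snap_seq can detect a conflict roughly as
+ // follows (sketch, error handling omitted):
+ //
+ //   SequenceNumber seq = kMaxSequenceNumber;
+ //   bool found_record = false;
+ //   Status s = GetLatestSequenceForKey(sv, key, true /* cache_only */,
+ //                                      snap_seq, &seq, &found_record);
+ //   bool conflict = s.ok() && found_record && seq > snap_seq;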
+
+ Status TraceIteratorSeek(const uint32_t& cf_id, const Slice& key);
+ Status TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key);
+#endif // ROCKSDB_LITE
+
+ // Similar to GetSnapshot(), but also lets the db know that this snapshot
+ // will be used for transaction write-conflict checking. The DB can then
+ // make sure not to compact any keys that would prevent a write-conflict from
+ // being detected.
+ const Snapshot* GetSnapshotForWriteConflictBoundary();
+
+ // checks if all live files exist on file system and that their file sizes
+ // match to our in-memory records
+ virtual Status CheckConsistency();
+
+ // max_file_num_to_ignore allows bottom level compaction to filter out newly
+ // compacted SST files. Setting max_file_num_to_ignore to kMaxUint64 will
+ // disable the filtering
+ Status RunManualCompaction(ColumnFamilyData* cfd, int input_level,
+ int output_level,
+ const CompactRangeOptions& compact_range_options,
+ const Slice* begin, const Slice* end,
+ bool exclusive, bool disallow_trivial_move,
+ uint64_t max_file_num_to_ignore);
+
+ // Return an internal iterator over the current state of the database.
+ // The keys of this iterator are internal keys (see format.h).
+ // The returned iterator should be deleted when no longer needed.
+ InternalIterator* NewInternalIterator(
+ Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence,
+ ColumnFamilyHandle* column_family = nullptr);
+
+ LogsWithPrepTracker* logs_with_prep_tracker() {
+ return &logs_with_prep_tracker_;
+ }
+
+ struct BGJobLimits {
+ int max_flushes;
+ int max_compactions;
+ };
+ // Returns maximum background flushes and compactions allowed to be scheduled
+ BGJobLimits GetBGJobLimits() const;
+ // Need a static version that can be called during SanitizeOptions().
+ static BGJobLimits GetBGJobLimits(int max_background_flushes,
+ int max_background_compactions,
+ int max_background_jobs,
+ bool parallelize_compactions);
+
+ // move logs pending closing from job_context to the DB queue and
+ // schedule a purge
+ void ScheduleBgLogWriterClose(JobContext* job_context);
+
+ uint64_t MinLogNumberToKeep();
+
+ // Returns the lower bound file number for SSTs that won't be deleted, even if
+ // they're obsolete. This lower bound is used internally to prevent newly
+ // created flush/compaction output files from being deleted before they're
+ // installed. This technique avoids the need for tracking the exact numbers of
+ // files pending creation, although it prevents more files than necessary from
+ // being deleted.
+ uint64_t MinObsoleteSstNumberToKeep();
+
+ // Returns the list of live files in 'live' and the list
+ // of all files in the filesystem in 'candidate_files'.
+ // If force == false and the last call was less than
+ // db_options_.delete_obsolete_files_period_micros microseconds ago,
+ // it will not fill up the job_context
+ void FindObsoleteFiles(JobContext* job_context, bool force,
+ bool no_full_scan = false);
+
+ // Diffs the files listed in filenames against the live files; those that
+ // do not belong to the live set are possibly removed. Also removes all the
+ // files in sst_delete_files and log_delete_files.
+ // It is not necessary to hold the mutex when invoking this method.
+ // If FindObsoleteFiles() was run, we need to also run
+ // PurgeObsoleteFiles(), even if disable_delete_obsolete_files_ is true
+ void PurgeObsoleteFiles(JobContext& background_context,
+ bool schedule_only = false);
+
+ // Schedule a background job to actually delete obsolete files.
+ void SchedulePurge();
+
+ const SnapshotList& snapshots() const { return snapshots_; }
+
+ // Load the list of snapshots that are no newer than `max_seq` into
+ // `snap_vector`, in ascending order.
+ // `oldest_write_conflict_snapshot` is filled with the oldest snapshot
+ // which satisfies SnapshotImpl.is_write_conflict_boundary_ = true.
+ void LoadSnapshots(std::vector<SequenceNumber>* snap_vector,
+ SequenceNumber* oldest_write_conflict_snapshot,
+ const SequenceNumber& max_seq) const {
+ InstrumentedMutexLock l(mutex());
+ snapshots().GetAll(snap_vector, oldest_write_conflict_snapshot, max_seq);
+ }
+
+ const ImmutableDBOptions& immutable_db_options() const {
+ return immutable_db_options_;
+ }
+
+ // Cancel all background jobs, including flush, compaction, background
+ // purging, stats dumping threads, etc. If `wait` = true, wait for the
+ // running jobs to abort or finish before returning. Otherwise, only
+ // sends the signals.
+ void CancelAllBackgroundWork(bool wait);
+
+ // Find Super version and reference it. Based on options, it might return
+ // the thread local cached one.
+ // Call ReturnAndCleanupSuperVersion() when it is no longer needed.
+ SuperVersion* GetAndRefSuperVersion(ColumnFamilyData* cfd);
+
+ // Similar to the previous function but looks up based on a column family id.
+ // nullptr will be returned if this column family no longer exists.
+ // REQUIRED: this function should only be called on the write thread or if the
+ // mutex is held.
+ SuperVersion* GetAndRefSuperVersion(uint32_t column_family_id);
+
+ // Un-reference the super version and clean it up if it is the last reference.
+ void CleanupSuperVersion(SuperVersion* sv);
+
+ // Un-reference the super version and return it to the thread local cache
+ // if needed. If it is the last reference to the super version, clean it up
+ // after un-referencing it.
+ void ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, SuperVersion* sv);
+
+ // Similar to the previous function but looks up based on a column family id.
+ // nullptr will be returned if this column family no longer exists.
+ // REQUIRED: this function should only be called on the write thread.
+ void ReturnAndCleanupSuperVersion(uint32_t column_family_id, SuperVersion* sv);
+
+ // REQUIRED: this function should only be called on the write thread or if the
+ // mutex is held. Return value only valid until next call to this function or
+ // mutex is released.
+ ColumnFamilyHandle* GetColumnFamilyHandle(uint32_t column_family_id);
+
+ // Same as above, but should be called without the mutex held and not on
+ // the write thread.
+ std::unique_ptr<ColumnFamilyHandle> GetColumnFamilyHandleUnlocked(
+ uint32_t column_family_id);
+
+ // Returns the number of currently running flushes.
+ // REQUIREMENT: mutex_ must be held when calling this function.
+ int num_running_flushes() {
+ mutex_.AssertHeld();
+ return num_running_flushes_;
+ }
+
+ // Returns the number of currently running compactions.
+ // REQUIREMENT: mutex_ must be held when calling this function.
+ int num_running_compactions() {
+ mutex_.AssertHeld();
+ return num_running_compactions_;
+ }
+
+ const WriteController& write_controller() { return write_controller_; }
+
+ InternalIterator* NewInternalIterator(
+ const ReadOptions&, ColumnFamilyData* cfd, SuperVersion* super_version,
+ Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence);
+
+ // Hollow transaction shell used for recovery.
+ // These will then be passed to TransactionDB so that
+ // locks can be reacquired before writing can resume.
+ struct RecoveredTransaction {
+ std::string name_;
+ bool unprepared_;
+
+ struct BatchInfo {
+ uint64_t log_number_;
+ // TODO(lth): For unprepared, the memory usage here can be big for
+ // unprepared transactions. This is only useful for rollbacks, and we
+ // can in theory just keep keyset for that.
+ WriteBatch* batch_;
+ // Number of sub-batches. A new sub-batch is created if the txn attempts
+ // to insert a duplicate key/seq into the memtable. This is currently used
+ // in WritePreparedTxn/WriteUnpreparedTxn.
+ size_t batch_cnt_;
+ };
+
+ // This maps the seq of the first key in the batch to BatchInfo, which
+ // contains WriteBatch and other information relevant to the batch.
+ //
+ // For WriteUnprepared, batches_ can have size greater than 1, but for
+ // other write policies, it must be of size 1.
+ std::map<SequenceNumber, BatchInfo> batches_;
+
+ explicit RecoveredTransaction(const uint64_t log, const std::string& name,
+ WriteBatch* batch, SequenceNumber seq,
+ size_t batch_cnt, bool unprepared)
+ : name_(name), unprepared_(unprepared) {
+ batches_[seq] = {log, batch, batch_cnt};
+ }
+
+ ~RecoveredTransaction() {
+ for (auto& it : batches_) {
+ delete it.second.batch_;
+ }
+ }
+
+ void AddBatch(SequenceNumber seq, uint64_t log_number, WriteBatch* batch,
+ size_t batch_cnt, bool unprepared) {
+ assert(batches_.count(seq) == 0);
+ batches_[seq] = {log_number, batch, batch_cnt};
+ // Prior state must be unprepared, since the prepare batch must be the
+ // last batch.
+ assert(unprepared_);
+ unprepared_ = unprepared;
+ }
+ };
+
+ bool allow_2pc() const { return immutable_db_options_.allow_2pc; }
+
+ std::unordered_map<std::string, RecoveredTransaction*>
+ recovered_transactions() {
+ return recovered_transactions_;
+ }
+
+ RecoveredTransaction* GetRecoveredTransaction(const std::string& name) {
+ auto it = recovered_transactions_.find(name);
+ if (it == recovered_transactions_.end()) {
+ return nullptr;
+ } else {
+ return it->second;
+ }
+ }
+
+ void InsertRecoveredTransaction(const uint64_t log, const std::string& name,
+ WriteBatch* batch, SequenceNumber seq,
+ size_t batch_cnt, bool unprepared_batch) {
+ // For WriteUnpreparedTxn, InsertRecoveredTransaction is called multiple
+ // times for every unprepared batch encountered during recovery.
+ //
+ // If the transaction is prepared, then the last call to
+ // InsertRecoveredTransaction will have unprepared_batch = false.
+ auto rtxn = recovered_transactions_.find(name);
+ if (rtxn == recovered_transactions_.end()) {
+ recovered_transactions_[name] = new RecoveredTransaction(
+ log, name, batch, seq, batch_cnt, unprepared_batch);
+ } else {
+ rtxn->second->AddBatch(seq, log, batch, batch_cnt, unprepared_batch);
+ }
+ logs_with_prep_tracker_.MarkLogAsContainingPrepSection(log);
+ }
+
+ void DeleteRecoveredTransaction(const std::string& name) {
+ auto it = recovered_transactions_.find(name);
+ assert(it != recovered_transactions_.end());
+ auto* trx = it->second;
+ recovered_transactions_.erase(it);
+ for (const auto& info : trx->batches_) {
+ logs_with_prep_tracker_.MarkLogAsHavingPrepSectionFlushed(
+ info.second.log_number_);
+ }
+ delete trx;
+ }
+
+ void DeleteAllRecoveredTransactions() {
+ for (auto it = recovered_transactions_.begin();
+ it != recovered_transactions_.end(); ++it) {
+ delete it->second;
+ }
+ recovered_transactions_.clear();
+ }
+
+ void AddToLogsToFreeQueue(log::Writer* log_writer) {
+ logs_to_free_queue_.push_back(log_writer);
+ }
+
+ void AddSuperVersionsToFreeQueue(SuperVersion* sv) {
+ superversions_to_free_queue_.push_back(sv);
+ }
+
+ void SetSnapshotChecker(SnapshotChecker* snapshot_checker);
+
+ // Fill JobContext with snapshot information needed by flush and compaction.
+ void GetSnapshotContext(JobContext* job_context,
+ std::vector<SequenceNumber>* snapshot_seqs,
+ SequenceNumber* earliest_write_conflict_snapshot,
+ SnapshotChecker** snapshot_checker);
+
+ // Not thread-safe.
+ void SetRecoverableStatePreReleaseCallback(PreReleaseCallback* callback);
+
+ InstrumentedMutex* mutex() const { return &mutex_; }
+
+ // Initialize a brand new DB. The DB directory is expected to be empty before
+ // calling it.
+ Status NewDB();
+
+ // This is to be used only by internal rocksdb classes.
+ static Status Open(const DBOptions& db_options, const std::string& name,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+ const bool seq_per_batch, const bool batch_per_txn);
+
+ static Status CreateAndNewDirectory(Env* env, const std::string& dirname,
+ std::unique_ptr<Directory>* directory);
+
+ // find stats map from stats_history_ with smallest timestamp in
+ // the range of [start_time, end_time)
+ bool FindStatsByTime(uint64_t start_time, uint64_t end_time,
+ uint64_t* new_time,
+ std::map<std::string, uint64_t>* stats_map);
+
+ // Print information of all tombstones of all iterators to the std::string.
+ // This is only used by ldb. The output might be capped. Tombstones
+ // printed out are not guaranteed to be in any order.
+ Status TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family,
+ int max_entries_to_print,
+ std::string* out_str);
+
+#ifndef NDEBUG
+ // Compact any files in the named level that overlap [*begin, *end]
+ Status TEST_CompactRange(int level, const Slice* begin, const Slice* end,
+ ColumnFamilyHandle* column_family = nullptr,
+ bool disallow_trivial_move = false);
+
+ void TEST_SwitchWAL();
+
+ bool TEST_UnableToReleaseOldestLog() { return unable_to_release_oldest_log_; }
+
+ bool TEST_IsLogGettingFlushed() {
+ return alive_log_files_.begin()->getting_flushed;
+ }
+
+ Status TEST_SwitchMemtable(ColumnFamilyData* cfd = nullptr);
+
+ // Force current memtable contents to be flushed.
+ Status TEST_FlushMemTable(bool wait = true, bool allow_write_stall = false,
+ ColumnFamilyHandle* cfh = nullptr);
+
+ Status TEST_FlushMemTable(ColumnFamilyData* cfd,
+ const FlushOptions& flush_opts);
+
+ // Flush (multiple) ColumnFamilyData without using ColumnFamilyHandle. This
+ // is because in certain cases, we can flush column families, wait for the
+ // flush to complete, but delete the column family handle before the wait
+ // finishes. For example in CompactRange.
+ Status TEST_AtomicFlushMemTables(const autovector<ColumnFamilyData*>& cfds,
+ const FlushOptions& flush_opts);
+
+ // Wait for memtable compaction
+ Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr);
+
+ // Wait for any compaction
+ // We add a bool parameter to wait for unscheduledCompactions_ == 0, but this
+ // is only for the special test of CancelledCompactions
+ Status TEST_WaitForCompact(bool waitUnscheduled = false);
+
+ // Return the maximum overlapping data (in bytes) at next level for any
+ // file at a level >= 1.
+ int64_t TEST_MaxNextLevelOverlappingBytes(
+ ColumnFamilyHandle* column_family = nullptr);
+
+ // Return the current manifest file no.
+ uint64_t TEST_Current_Manifest_FileNo();
+
+ // Returns the number that'll be assigned to the next file that's created.
+ uint64_t TEST_Current_Next_FileNo();
+
+ // get total level0 file size. Only for testing.
+ uint64_t TEST_GetLevel0TotalSize();
+
+ void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family,
+ std::vector<std::vector<FileMetaData>>* metadata);
+
+ void TEST_LockMutex();
+
+ void TEST_UnlockMutex();
+
+ // REQUIRES: mutex locked
+ void* TEST_BeginWrite();
+
+ // REQUIRES: mutex locked
+ // pass the pointer that you got from TEST_BeginWrite()
+ void TEST_EndWrite(void* w);
+
+ uint64_t TEST_MaxTotalInMemoryState() const {
+ return max_total_in_memory_state_;
+ }
+
+ size_t TEST_LogsToFreeSize();
+
+ uint64_t TEST_LogfileNumber();
+
+ uint64_t TEST_total_log_size() const { return total_log_size_; }
+
+ // Returns column family name to ImmutableCFOptions map.
+ Status TEST_GetAllImmutableCFOptions(
+ std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map);
+
+ // Return the latest MutableCFOptions of a column family
+ Status TEST_GetLatestMutableCFOptions(ColumnFamilyHandle* column_family,
+ MutableCFOptions* mutable_cf_options);
+
+ Cache* TEST_table_cache() { return table_cache_.get(); }
+
+ WriteController& TEST_write_controler() { return write_controller_; }
+
+ uint64_t TEST_FindMinLogContainingOutstandingPrep();
+ uint64_t TEST_FindMinPrepLogReferencedByMemTable();
+ size_t TEST_PreparedSectionCompletedSize();
+ size_t TEST_LogsWithPrepSize();
+
+ int TEST_BGCompactionsAllowed() const;
+ int TEST_BGFlushesAllowed() const;
+ size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const;
+ void TEST_WaitForDumpStatsRun(std::function<void()> callback) const;
+ void TEST_WaitForPersistStatsRun(std::function<void()> callback) const;
+ bool TEST_IsPersistentStatsEnabled() const;
+ size_t TEST_EstimateInMemoryStatsHistorySize() const;
+#endif // NDEBUG
+
+ protected:
+ const std::string dbname_;
+ std::string db_id_;
+ std::unique_ptr<VersionSet> versions_;
+ // Flag to check whether we allocated and own the info log file
+ bool own_info_log_;
+ const DBOptions initial_db_options_;
+ Env* const env_;
+ std::shared_ptr<FileSystem> fs_;
+ const ImmutableDBOptions immutable_db_options_;
+ MutableDBOptions mutable_db_options_;
+ Statistics* stats_;
+ std::unordered_map<std::string, RecoveredTransaction*>
+ recovered_transactions_;
+ std::unique_ptr<Tracer> tracer_;
+ InstrumentedMutex trace_mutex_;
+ BlockCacheTracer block_cache_tracer_;
+
+ // State below is protected by mutex_
+ // With two_write_queues enabled, some of the variables that accessed during
+ // WriteToWAL need different synchronization: log_empty_, alive_log_files_,
+ // logs_, logfile_number_. Refer to the definition of each variable below for
+ // more description.
+ mutable InstrumentedMutex mutex_;
+
+ ColumnFamilyHandleImpl* default_cf_handle_;
+ InternalStats* default_cf_internal_stats_;
+
+ // only used for dynamically adjusting max_total_wal_size. It is a sum of
+ // [write_buffer_size * max_write_buffer_number] over all column families
+ uint64_t max_total_in_memory_state_;
+ // If true, we have only one (default) column family. We use this to optimize
+ // some code-paths
+ bool single_column_family_mode_;
+
+ // The options to access storage files
+ const FileOptions file_options_;
+
+ // Additional options for compaction and flush
+ FileOptions file_options_for_compaction_;
+
+ std::unique_ptr<ColumnFamilyMemTablesImpl> column_family_memtables_;
+
+ // Increase the sequence number after writing each batch, whether memtable is
+ // disabled for that or not. Otherwise the sequence number is increased after
+ // writing each key into memtable. This implies that when disable_memtable is
+ // set, the seq is not increased at all.
+ //
+ // Default: false
+ const bool seq_per_batch_;
+ // This determines during recovery whether we expect one writebatch per
+ // recovered transaction, or potentially multiple writebatches per
+ // transaction. For WriteUnprepared, this is set to false, since multiple
+ // batches can exist per transaction.
+ //
+ // Default: true
+ const bool batch_per_txn_;
+
+ // Persist options to the options file. Except in DB::Open(), callers must
+ // satisfy the following preconditions:
+ // If need_mutex_lock = false, the DB mutex must already be held by the
+ // caller (the method will not lock it itself).
+ // If need_enter_write_thread = false, the caller must already have entered
+ // the write thread (the method will not enter it itself).
+ Status WriteOptionsFile(bool need_mutex_lock, bool need_enter_write_thread);
+
+ // The following two functions can only be called when:
+ // 1. WriteThread::Writer::EnterUnbatched() is used.
+ // 2. db_mutex is NOT held
+ Status RenameTempFileToOptionsFile(const std::string& file_name);
+ Status DeleteObsoleteOptionsFiles();
+
+ void NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta,
+ const MutableCFOptions& mutable_cf_options,
+ int job_id);
+
+ void NotifyOnFlushCompleted(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ std::list<std::unique_ptr<FlushJobInfo>>* flush_jobs_info);
+
+ void NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c,
+ const Status& st,
+ const CompactionJobStats& job_stats, int job_id);
+
+ void NotifyOnCompactionCompleted(ColumnFamilyData* cfd, Compaction* c,
+ const Status& st,
+ const CompactionJobStats& job_stats,
+ int job_id);
+ void NotifyOnMemTableSealed(ColumnFamilyData* cfd,
+ const MemTableInfo& mem_table_info);
+
+#ifndef ROCKSDB_LITE
+ void NotifyOnExternalFileIngested(
+ ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job);
+#endif // !ROCKSDB_LITE
+
+ void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const;
+
+ void EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const;
+
+ void EraseThreadStatusDbInfo() const;
+
+ // If disable_memtable is set the application logic must guarantee that the
+ // batch will still be skipped from memtable during the recovery. An exception
+ // to this is seq_per_batch_ mode, in which since each batch already takes one
+ // seq, it is ok for the batch to write to memtable during recovery as long as
+ // it only takes one sequence number: i.e., no duplicate keys.
+ // In WriteCommitted it is guaranteed since disable_memtable is used for
+ // prepare batch which will be written to memtable later during the commit,
+ // and in WritePrepared it is guaranteed since it will be used only for WAL
+ // markers which will never be written to memtable. If the commit marker is
+ // accompanied with CommitTimeWriteBatch that is not written to memtable as
+ // long as it has no duplicate keys, it does not violate the one-seq-per-batch
+ // policy.
+ // batch_cnt is expected to be non-zero in seq_per_batch mode and
+ // indicates the number of sub-batches. A sub-batch is a subset of the write
+ // batch that does not have duplicate keys.
+ Status WriteImpl(const WriteOptions& options, WriteBatch* updates,
+ WriteCallback* callback = nullptr,
+ uint64_t* log_used = nullptr, uint64_t log_ref = 0,
+ bool disable_memtable = false, uint64_t* seq_used = nullptr,
+ size_t batch_cnt = 0,
+ PreReleaseCallback* pre_release_callback = nullptr);
+
+ Status PipelinedWriteImpl(const WriteOptions& options, WriteBatch* updates,
+ WriteCallback* callback = nullptr,
+ uint64_t* log_used = nullptr, uint64_t log_ref = 0,
+ bool disable_memtable = false,
+ uint64_t* seq_used = nullptr);
+
+ // Write only to memtables without joining any write queue
+ Status UnorderedWriteMemtable(const WriteOptions& write_options,
+ WriteBatch* my_batch, WriteCallback* callback,
+ uint64_t log_ref, SequenceNumber seq,
+ const size_t sub_batch_cnt);
+
+ // Whether the batch requires to be assigned with an order
+ enum AssignOrder : bool { kDontAssignOrder, kDoAssignOrder };
+ // Whether it requires publishing last sequence or not
+ enum PublishLastSeq : bool { kDontPublishLastSeq, kDoPublishLastSeq };
+
+ // Join the write_thread to write the batch only to the WAL. It is the
+ // responsibility of the caller to also write the write batch to the memtable
+ // if it is required.
+ //
+ // sub_batch_cnt is expected to be non-zero when assign_order = kDoAssignOrder
+ // indicating the number of sub-batches in my_batch. A sub-batch is a subset
+ // of the write batch that does not have duplicate keys. When seq_per_batch is
+ // not set, each key is a separate sub_batch. Otherwise each duplicate key
+ // marks start of a new sub-batch.
+ Status WriteImplWALOnly(
+ WriteThread* write_thread, const WriteOptions& options,
+ WriteBatch* updates, WriteCallback* callback, uint64_t* log_used,
+ const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt,
+ PreReleaseCallback* pre_release_callback, const AssignOrder assign_order,
+ const PublishLastSeq publish_last_seq, const bool disable_memtable);
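+
+ // Illustrative example (not part of the original comments): when
+ // seq_per_batch is set, a write batch {Put(k1), Put(k2), Put(k1)} contains
+ // two sub-batches, {Put(k1), Put(k2)} and {Put(k1)}, because the second
+ // Put(k1) repeats a key already seen in the current sub-batch, so
+ // sub_batch_cnt would be 2.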
+
+ // write cached_recoverable_state_ to memtable if it is not empty
+ // The writer must be the leader in write_thread_ and must hold mutex_
+ Status WriteRecoverableState();
+
+ // Actual implementation of Close()
+ Status CloseImpl();
+
+ // Recover the descriptor from persistent storage. May do a significant
+ // amount of work to recover recently logged updates. Any changes to
+ // be made to the descriptor are added to *edit.
+ // recovered_seq is set to less than kMaxSequenceNumber if the log's tail is
+ // skipped.
+ virtual Status Recover(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ bool read_only = false, bool error_if_log_file_exist = false,
+ bool error_if_data_exists_in_logs = false,
+ uint64_t* recovered_seq = nullptr);
+
+ virtual bool OwnTablesAndLogs() const { return true; }
+
+ private:
+ friend class DB;
+ friend class ErrorHandler;
+ friend class InternalStats;
+ friend class PessimisticTransaction;
+ friend class TransactionBaseImpl;
+ friend class WriteCommittedTxn;
+ friend class WritePreparedTxn;
+ friend class WritePreparedTxnDB;
+ friend class WriteBatchWithIndex;
+ friend class WriteUnpreparedTxnDB;
+ friend class WriteUnpreparedTxn;
+
+#ifndef ROCKSDB_LITE
+ friend class ForwardIterator;
+#endif
+ friend struct SuperVersion;
+ friend class CompactedDBImpl;
+ friend class DBTest_ConcurrentFlushWAL_Test;
+ friend class DBTest_MixedSlowdownOptionsStop_Test;
+ friend class DBCompactionTest_CompactBottomLevelFilesWithDeletions_Test;
+ friend class DBCompactionTest_CompactionDuringShutdown_Test;
+ friend class StatsHistoryTest_PersistentStatsCreateColumnFamilies_Test;
+#ifndef NDEBUG
+ friend class DBTest2_ReadCallbackTest_Test;
+ friend class WriteCallbackTest_WriteWithCallbackTest_Test;
+ friend class XFTransactionWriteHandler;
+ friend class DBBlobIndexTest;
+ friend class WriteUnpreparedTransactionTest_RecoveryTest_Test;
+#endif
+
+ struct CompactionState;
+ struct PrepickedCompaction;
+ struct PurgeFileInfo;
+
+ struct WriteContext {
+ SuperVersionContext superversion_context;
+ autovector<MemTable*> memtables_to_free_;
+
+ explicit WriteContext(bool create_superversion = false)
+ : superversion_context(create_superversion) {}
+
+ ~WriteContext() {
+ superversion_context.Clean();
+ for (auto& m : memtables_to_free_) {
+ delete m;
+ }
+ }
+ };
+
+ struct LogFileNumberSize {
+ explicit LogFileNumberSize(uint64_t _number) : number(_number) {}
+ void AddSize(uint64_t new_size) { size += new_size; }
+ uint64_t number;
+ uint64_t size = 0;
+ bool getting_flushed = false;
+ };
+
+ struct LogWriterNumber {
+ // pass ownership of _writer
+ LogWriterNumber(uint64_t _number, log::Writer* _writer)
+ : number(_number), writer(_writer) {}
+
+ log::Writer* ReleaseWriter() {
+ auto* w = writer;
+ writer = nullptr;
+ return w;
+ }
+ Status ClearWriter() {
+ Status s = writer->WriteBuffer();
+ delete writer;
+ writer = nullptr;
+ return s;
+ }
+
+ uint64_t number;
+ // Visual Studio doesn't allow the deque's element type to be noncopyable,
+ // which it would be with a std::unique_ptr member, so a raw owning pointer
+ // is used instead.
+ log::Writer* writer; // own
+ // true for some prefix of logs_
+ bool getting_synced = false;
+ };
+
+ // PurgeFileInfo is a structure to hold information of files to be deleted in
+ // purge_files_
+ struct PurgeFileInfo {
+ std::string fname;
+ std::string dir_to_sync;
+ FileType type;
+ uint64_t number;
+ int job_id;
+ PurgeFileInfo(std::string fn, std::string d, FileType t, uint64_t num,
+ int jid)
+ : fname(fn), dir_to_sync(d), type(t), number(num), job_id(jid) {}
+ };
+
+ // Argument required by background flush thread.
+ struct BGFlushArg {
+ BGFlushArg()
+ : cfd_(nullptr), max_memtable_id_(0), superversion_context_(nullptr) {}
+ BGFlushArg(ColumnFamilyData* cfd, uint64_t max_memtable_id,
+ SuperVersionContext* superversion_context)
+ : cfd_(cfd),
+ max_memtable_id_(max_memtable_id),
+ superversion_context_(superversion_context) {}
+
+ // Column family to flush.
+ ColumnFamilyData* cfd_;
+ // Maximum ID of memtable to flush. In this column family, memtables with
+ // IDs smaller than this value must be flushed before this flush completes.
+ uint64_t max_memtable_id_;
+ // Pointer to a SuperVersionContext object. After flush completes, RocksDB
+ // installs a new superversion for the column family. This operation
+ // requires a SuperVersionContext object (currently embedded in JobContext).
+ SuperVersionContext* superversion_context_;
+ };
+
+ // Argument passed to flush thread.
+ struct FlushThreadArg {
+ DBImpl* db_;
+
+ Env::Priority thread_pri_;
+ };
+
+ // Information for a manual compaction
+ struct ManualCompactionState {
+ ColumnFamilyData* cfd;
+ int input_level;
+ int output_level;
+ uint32_t output_path_id;
+ Status status;
+ bool done;
+ bool in_progress; // compaction request being processed?
+ bool incomplete; // only part of requested range compacted
+ bool exclusive; // current behavior of only one manual
+ bool disallow_trivial_move; // Force actual compaction to run
+ const InternalKey* begin; // nullptr means beginning of key range
+ const InternalKey* end; // nullptr means end of key range
+ InternalKey* manual_end; // how far we are compacting
+ InternalKey tmp_storage; // Used to keep track of compaction progress
+ InternalKey tmp_storage1; // Used to keep track of compaction progress
+ };
+ struct PrepickedCompaction {
+ // background compaction takes ownership of `compaction`.
+ Compaction* compaction;
+ // caller retains ownership of `manual_compaction_state` as it is reused
+ // across background compactions.
+ ManualCompactionState* manual_compaction_state; // nullptr if non-manual
+ // task limiter token is requested during compaction picking.
+ std::unique_ptr<TaskLimiterToken> task_token;
+ };
+
+ struct CompactionArg {
+ // caller retains ownership of `db`.
+ DBImpl* db;
+ // background compaction takes ownership of `prepicked_compaction`.
+ PrepickedCompaction* prepicked_compaction;
+ };
+
+ // Initialize the built-in column family for persistent stats. Depending on
+ // whether on-disk persistent stats have been enabled before, it may either
+ // create a new column family and column family handle or just a column family
+ // handle.
+ // Required: DB mutex held
+ Status InitPersistStatsColumnFamily();
+
+ // Persistent Stats column family has two format version keys which are used
+ // for compatibility check. Write format version if it's created for the
+ // first time, read format version and check compatibility if recovering
+ // from disk. This function requires DB mutex held at entrance but may
+ // release and re-acquire DB mutex in the process.
+ // Required: DB mutex held
+ Status PersistentStatsProcessFormatVersion();
+
+ Status ResumeImpl();
+
+ void MaybeIgnoreError(Status* s) const;
+
+ const Status CreateArchivalDirectory();
+
+ Status CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options,
+ const std::string& cf_name,
+ ColumnFamilyHandle** handle);
+
+ Status DropColumnFamilyImpl(ColumnFamilyHandle* column_family);
+
+ // Delete any unneeded files and stale in-memory entries.
+ void DeleteObsoleteFiles();
+ // Delete obsolete files and log status and information of file deletion
+ void DeleteObsoleteFileImpl(int job_id, const std::string& fname,
+ const std::string& path_to_sync, FileType type,
+ uint64_t number);
+
+ // Background process needs to call
+ // auto x = CaptureCurrentFileNumberInPendingOutputs()
+ // auto file_num = versions_->NewFileNumber();
+ // <do something>
+ // ReleaseFileNumberFromPendingOutputs(x)
+ // This will protect any file with number `file_num` or greater from being
+ // deleted while <do something> is running.
+ // -----------
+ // This function will capture current file number and append it to
+ // pending_outputs_. This will prevent any background process from deleting any
+ // file created after this point.
+ std::list<uint64_t>::iterator CaptureCurrentFileNumberInPendingOutputs();
+ // This function should be called with the result of
+ // CaptureCurrentFileNumberInPendingOutputs(). It then marks that any file
+ // created between the calls CaptureCurrentFileNumberInPendingOutputs() and
+ // ReleaseFileNumberFromPendingOutputs() can now be deleted (if it's not live
+ // and not blocked by any other pending_outputs_ entries)
+ void ReleaseFileNumberFromPendingOutputs(
+ std::unique_ptr<std::list<uint64_t>::iterator>& v);
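+
+ // A minimal sketch (hypothetical caller, mirroring the pattern described
+ // above) of how these two calls pair up:
+ //
+ //   std::unique_ptr<std::list<uint64_t>::iterator> pending_output_elem(
+ //       new std::list<uint64_t>::iterator(
+ //           CaptureCurrentFileNumberInPendingOutputs()));
+ //   uint64_t file_num = versions_->NewFileNumber();
+ //   // ... create the file with number `file_num` ...
+ //   ReleaseFileNumberFromPendingOutputs(pending_output_elem);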
+
+ Status SyncClosedLogs(JobContext* job_context);
+
+ // Flush the in-memory write buffer to storage. Switches to a new
+ // log-file/memtable and writes a new descriptor iff successful. Then
+ // installs a new super version for the column family.
+ Status FlushMemTableToOutputFile(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ bool* madeProgress, JobContext* job_context,
+ SuperVersionContext* superversion_context,
+ std::vector<SequenceNumber>& snapshot_seqs,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
+ Env::Priority thread_pri);
+
+ // Flush the memtables of (multiple) column families to multiple files on
+ // persistent storage.
+ Status FlushMemTablesToOutputFiles(
+ const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+ JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri);
+
+ Status AtomicFlushMemTablesToOutputFiles(
+ const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+ JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri);
+
+ // REQUIRES: log_numbers are sorted in ascending order
+ // corrupted_log_found is set to true if we recover from a corrupted log file.
+ Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
+ SequenceNumber* next_sequence, bool read_only,
+ bool* corrupted_log_found);
+
+ // The following two methods are used to flush a memtable to
+ // storage. The first one is used at database recovery time (when the
+ // database is opened) and is heavyweight because it holds the mutex
+ // for the entire period. The second method, WriteLevel0Table, supports
+ // concurrently flushing memtables to storage.
+ Status WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
+ MemTable* mem, VersionEdit* edit);
+
+ // Restore alive_log_files_ and total_log_size_ after recovery.
+ // It needs to run only when there's no flush during recovery
+ // (e.g. avoid_flush_during_recovery=true). May also trigger flush
+ // in case total_log_size > max_total_wal_size.
+ Status RestoreAliveLogFiles(const std::vector<uint64_t>& log_numbers);
+
+ // num_bytes: for slowdown case, delay time is calculated based on
+ // `num_bytes` going through.
+ Status DelayWrite(uint64_t num_bytes, const WriteOptions& write_options);
+
+ Status ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options,
+ WriteBatch* my_batch);
+
+ // REQUIRES: mutex locked and in write thread.
+ Status ScheduleFlushes(WriteContext* context);
+
+ void MaybeFlushStatsCF(autovector<ColumnFamilyData*>* cfds);
+
+ Status TrimMemtableHistory(WriteContext* context);
+
+ Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context);
+
+ void SelectColumnFamiliesForAtomicFlush(autovector<ColumnFamilyData*>* cfds);
+
+ // Force current memtable contents to be flushed.
+ Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options,
+ FlushReason flush_reason, bool writes_stopped = false);
+
+ Status AtomicFlushMemTables(
+ const autovector<ColumnFamilyData*>& column_family_datas,
+ const FlushOptions& options, FlushReason flush_reason,
+ bool writes_stopped = false);
+
+ // Wait until flushing this column family won't stall writes
+ Status WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
+ bool* flush_needed);
+
+ // Wait for memtable flushed.
+ // If flush_memtable_id is non-null, wait until the memtable with the ID
+ // gets flushed. Otherwise, wait until the column family doesn't have any
+ // memtable pending flush.
+ // resuming_from_bg_err indicates whether the caller is attempting to resume
+ // from background error.
+ Status WaitForFlushMemTable(ColumnFamilyData* cfd,
+ const uint64_t* flush_memtable_id = nullptr,
+ bool resuming_from_bg_err = false) {
+ return WaitForFlushMemTables({cfd}, {flush_memtable_id},
+ resuming_from_bg_err);
+ }
+ // Wait for memtables to be flushed for multiple column families.
+ Status WaitForFlushMemTables(
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const uint64_t*>& flush_memtable_ids,
+ bool resuming_from_bg_err);
+
+ inline void WaitForPendingWrites() {
+ mutex_.AssertHeld();
+ TEST_SYNC_POINT("DBImpl::WaitForPendingWrites:BeforeBlock");
+ // If pipelined write is enabled, wait for all pending memtable
+ // writers.
+ if (immutable_db_options_.enable_pipelined_write) {
+ // Memtable writers may call DB::Get in case max_successive_merges > 0,
+ // which may lock mutex. Unlocking mutex here to avoid deadlock.
+ mutex_.Unlock();
+ write_thread_.WaitForMemTableWriters();
+ mutex_.Lock();
+ }
+
+ if (!immutable_db_options_.unordered_write) {
+ // Then the writes are finished before the next write group starts
+ return;
+ }
+
+ // Wait for the ones who already wrote to the WAL to finish their
+ // memtable write.
+ if (pending_memtable_writes_.load() != 0) {
+ std::unique_lock<std::mutex> guard(switch_mutex_);
+ switch_cv_.wait(guard,
+ [&] { return pending_memtable_writes_.load() == 0; });
+ }
+ }
+
+ // REQUIRES: mutex locked and in write thread.
+ void AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds);
+
+ // REQUIRES: mutex locked and in write thread.
+ Status SwitchWAL(WriteContext* write_context);
+
+ // REQUIRES: mutex locked and in write thread.
+ Status HandleWriteBufferFull(WriteContext* write_context);
+
+ // REQUIRES: mutex locked
+ Status PreprocessWrite(const WriteOptions& write_options, bool* need_log_sync,
+ WriteContext* write_context);
+
+ WriteBatch* MergeBatch(const WriteThread::WriteGroup& write_group,
+ WriteBatch* tmp_batch, size_t* write_with_wal,
+ WriteBatch** to_be_cached_state);
+
+ Status WriteToWAL(const WriteBatch& merged_batch, log::Writer* log_writer,
+ uint64_t* log_used, uint64_t* log_size);
+
+ Status WriteToWAL(const WriteThread::WriteGroup& write_group,
+ log::Writer* log_writer, uint64_t* log_used,
+ bool need_log_sync, bool need_log_dir_sync,
+ SequenceNumber sequence);
+
+ Status ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group,
+ uint64_t* log_used, SequenceNumber* last_sequence,
+ size_t seq_inc);
+
+ // Used by WriteImpl to update bg_error_ if paranoid check is enabled.
+ void WriteStatusCheck(const Status& status);
+
+ // Used by WriteImpl to update bg_error_ in case of memtable insert error.
+ void MemTableInsertStatusCheck(const Status& memtable_insert_status);
+
+#ifndef ROCKSDB_LITE
+
+ Status CompactFilesImpl(const CompactionOptions& compact_options,
+ ColumnFamilyData* cfd, Version* version,
+ const std::vector<std::string>& input_file_names,
+ std::vector<std::string>* const output_file_names,
+ const int output_level, int output_path_id,
+ JobContext* job_context, LogBuffer* log_buffer,
+ CompactionJobInfo* compaction_job_info);
+
+ // Wait for current IngestExternalFile() calls to finish.
+ // REQUIRES: mutex_ held
+ void WaitForIngestFile();
+
+#else
+ // IngestExternalFile is not supported in ROCKSDB_LITE so this function
+ // will be a no-op
+ void WaitForIngestFile() {}
+#endif // ROCKSDB_LITE
+
+ ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name);
+
+ void MaybeScheduleFlushOrCompaction();
+
+ // A flush request specifies the column families to flush as well as the
+ // largest memtable id to persist for each column family. Once all the
+ // memtables whose IDs are smaller than or equal to this per-column-family
+ // value have been flushed, this flush request is considered to have
+ // completed its work for that column family. After completing the work for
+ // all column families in this request, this flush is considered complete.
+ typedef std::vector<std::pair<ColumnFamilyData*, uint64_t>> FlushRequest;
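+ // For example (illustrative values only), a FlushRequest of
+ //   {{cfd_a, 12}, {cfd_b, 7}}
+ // asks to flush all memtables of cfd_a with IDs <= 12 and all memtables of
+ // cfd_b with IDs <= 7 before the request is considered complete.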
+
+ void GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds,
+ FlushRequest* req);
+
+ void SchedulePendingFlush(const FlushRequest& req, FlushReason flush_reason);
+
+ void SchedulePendingCompaction(ColumnFamilyData* cfd);
+ void SchedulePendingPurge(std::string fname, std::string dir_to_sync,
+ FileType type, uint64_t number, int job_id);
+ static void BGWorkCompaction(void* arg);
+ // Runs a pre-chosen universal compaction involving bottom level in a
+ // separate, bottom-pri thread pool.
+ static void BGWorkBottomCompaction(void* arg);
+ static void BGWorkFlush(void* arg);
+ static void BGWorkPurge(void* arg);
+ static void UnscheduleCompactionCallback(void* arg);
+ static void UnscheduleFlushCallback(void* arg);
+ void BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
+ Env::Priority thread_pri);
+ void BackgroundCallFlush(Env::Priority thread_pri);
+ void BackgroundCallPurge();
+ Status BackgroundCompaction(bool* madeProgress, JobContext* job_context,
+ LogBuffer* log_buffer,
+ PrepickedCompaction* prepicked_compaction,
+ Env::Priority thread_pri);
+ Status BackgroundFlush(bool* madeProgress, JobContext* job_context,
+ LogBuffer* log_buffer, FlushReason* reason,
+ Env::Priority thread_pri);
+
+ bool EnoughRoomForCompaction(ColumnFamilyData* cfd,
+ const std::vector<CompactionInputFiles>& inputs,
+ bool* sfm_bookkeeping, LogBuffer* log_buffer);
+
+ // Request a compaction task token from the compaction thread limiter.
+ // It always succeeds if force = true or the limiter is disabled.
+ bool RequestCompactionToken(ColumnFamilyData* cfd, bool force,
+ std::unique_ptr<TaskLimiterToken>* token,
+ LogBuffer* log_buffer);
+
+ // Schedule background tasks
+ void StartTimedTasks();
+
+ void PrintStatistics();
+
+ size_t EstimateInMemoryStatsHistorySize() const;
+
+ // persist stats to column family "_persistent_stats"
+ void PersistStats();
+
+ // dump rocksdb.stats to LOG
+ void DumpStats();
+
+ // Return the minimum empty level that could hold the total data in the
+ // input level. Return the input level if no such level can be found.
+ int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd,
+ const MutableCFOptions& mutable_cf_options,
+ int level);
+
+ // Move the files in the input level to the target level.
+ // If target_level < 0, automatically calculate the minimum level that could
+ // hold the data set.
+ Status ReFitLevel(ColumnFamilyData* cfd, int level, int target_level = -1);
+
+ // helper functions for adding and removing from flush & compaction queues
+ void AddToCompactionQueue(ColumnFamilyData* cfd);
+ ColumnFamilyData* PopFirstFromCompactionQueue();
+ FlushRequest PopFirstFromFlushQueue();
+
+ // Pick the first unthrottled compaction with task token from queue.
+ ColumnFamilyData* PickCompactionFromQueue(
+ std::unique_ptr<TaskLimiterToken>* token, LogBuffer* log_buffer);
+
+ // helper function to call after some of the logs_ were synced
+ void MarkLogsSynced(uint64_t up_to, bool synced_dir, const Status& status);
+
+ SnapshotImpl* GetSnapshotImpl(bool is_write_conflict_boundary,
+ bool lock = true);
+
+ uint64_t GetMaxTotalWalSize() const;
+
+ Directory* GetDataDir(ColumnFamilyData* cfd, size_t path_id) const;
+
+ Status CloseHelper();
+
+ void WaitForBackgroundWork();
+
+ // Background threads call this function, which is just a wrapper around
+ // the InstallSuperVersion() function. Background threads carry
+ // sv_context which can have new_superversion already
+ // allocated.
+ // All ColumnFamily state changes go through this function. Here we analyze
+ // the new state and we schedule background work if we detect that the new
+ // state needs flush or compaction.
+ void InstallSuperVersionAndScheduleWork(
+ ColumnFamilyData* cfd, SuperVersionContext* sv_context,
+ const MutableCFOptions& mutable_cf_options);
+
+ bool GetIntPropertyInternal(ColumnFamilyData* cfd,
+ const DBPropertyInfo& property_info,
+ bool is_locked, uint64_t* value);
+ bool GetPropertyHandleOptionsStatistics(std::string* value);
+
+ bool HasPendingManualCompaction();
+ bool HasExclusiveManualCompaction();
+ void AddManualCompaction(ManualCompactionState* m);
+ void RemoveManualCompaction(ManualCompactionState* m);
+ bool ShouldntRunManualCompaction(ManualCompactionState* m);
+ bool HaveManualCompaction(ColumnFamilyData* cfd);
+ bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1);
+#ifndef ROCKSDB_LITE
+ void BuildCompactionJobInfo(const ColumnFamilyData* cfd, Compaction* c,
+ const Status& st,
+ const CompactionJobStats& compaction_job_stats,
+ const int job_id, const Version* current,
+ CompactionJobInfo* compaction_job_info) const;
+ // Reserve the next 'num' file numbers for to-be-ingested external SST files,
+ // and return the current file_number in 'next_file_number'.
+ // Write a version edit to the MANIFEST.
+ Status ReserveFileNumbersBeforeIngestion(
+ ColumnFamilyData* cfd, uint64_t num,
+ std::unique_ptr<std::list<uint64_t>::iterator>& pending_output_elem,
+ uint64_t* next_file_number);
+#endif //! ROCKSDB_LITE
+
+ bool ShouldPurge(uint64_t file_number) const;
+ void MarkAsGrabbedForPurge(uint64_t file_number);
+
+ size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const;
+ Env::WriteLifeTimeHint CalculateWALWriteHint() { return Env::WLTH_SHORT; }
+
+ Status CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number,
+ size_t preallocate_block_size, log::Writer** new_log);
+
+ // Validate self-consistency of DB options
+ static Status ValidateOptions(const DBOptions& db_options);
+ // Validate self-consistency of DB options and its consistency with cf options
+ static Status ValidateOptions(
+ const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& column_families);
+
+ // Utility function to do some debug validation and sort the given vector
+ // of MultiGet keys
+ void PrepareMultiGetKeys(
+ const size_t num_keys, bool sorted,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* key_ptrs);
+
+ // A structure to hold the information required to process MultiGet of keys
+ // belonging to one column family. For a multi column family MultiGet, there
+ // will be a container of these objects.
+ struct MultiGetColumnFamilyData {
+ ColumnFamilyHandle* cf;
+ ColumnFamilyData* cfd;
+
+ // For the batched MultiGet which relies on sorted keys, start specifies
+ // the index of first key belonging to this column family in the sorted
+ // list.
+ size_t start;
+
+ // For the batched MultiGet case, num_keys specifies the number of keys
+ // belonging to this column family in the sorted list
+ size_t num_keys;
+
+ // SuperVersion for the column family obtained in a manner that ensures a
+ // consistent view across all column families in the DB
+ SuperVersion* super_version;
+ MultiGetColumnFamilyData(ColumnFamilyHandle* column_family,
+ SuperVersion* sv)
+ : cf(column_family),
+ cfd(static_cast<ColumnFamilyHandleImpl*>(cf)->cfd()),
+ start(0),
+ num_keys(0),
+ super_version(sv) {}
+
+ MultiGetColumnFamilyData(ColumnFamilyHandle* column_family, size_t first,
+ size_t count, SuperVersion* sv)
+ : cf(column_family),
+ cfd(static_cast<ColumnFamilyHandleImpl*>(cf)->cfd()),
+ start(first),
+ num_keys(count),
+ super_version(sv) {}
+
+ MultiGetColumnFamilyData() = default;
+ };
+
+ // A common function to obtain a consistent snapshot, which can be implicit
+ // if the user doesn't specify a snapshot in read_options, across
+ // multiple column families for MultiGet. It will attempt to get an implicit
+ // snapshot without acquiring the db_mutex, but will give up after a few
+ // tries and acquire the mutex if a memtable flush happens. The template
+ // allows both the batched and non-batched MultiGet to call this with
+ // either an std::unordered_map or autovector of column families.
+ //
+ // If callback is non-null, the callback is refreshed with the snapshot
+ // sequence number
+ //
+ // A return value of true indicates that the SuperVersions were obtained
+ // from the ColumnFamilyData, whereas false indicates they are thread
+ // local
+ template <class T>
+ bool MultiCFSnapshot(
+ const ReadOptions& read_options, ReadCallback* callback,
+ std::function<MultiGetColumnFamilyData*(typename T::iterator&)>&
+ iter_deref_func,
+ T* cf_list, SequenceNumber* snapshot);
+
+ // The actual implementation of the batching MultiGet. The caller is expected
+ // to have acquired the SuperVersion and pass in a snapshot sequence number
+ // in order to construct the LookupKeys. The start_key and num_keys specify
+ // the range of keys in the sorted_keys vector for a single column family.
+ void MultiGetImpl(
+ const ReadOptions& read_options, size_t start_key, size_t num_keys,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys,
+ SuperVersion* sv, SequenceNumber snap_seqnum, ReadCallback* callback,
+ bool* is_blob_index);
+
+ // table_cache_ provides its own synchronization
+ std::shared_ptr<Cache> table_cache_;
+
+ // Lock over the persistent DB state. Non-nullptr iff successfully acquired.
+ FileLock* db_lock_;
+
+ // In addition to mutex_, stats_history_mutex_ protects writes to stats_history_
+ InstrumentedMutex stats_history_mutex_;
+ // In addition to mutex_, log_write_mutex_ protects writes to logs_ and
+ // logfile_number_. With two_write_queues it also protects alive_log_files_,
+ // and log_empty_. Refer to the definition of each variable below for more
+ // details.
+ // Note: to avoid deadlock, if both log_write_mutex_ and mutex_ need to be
+ // acquired, mutex_ should be acquired first and then log_write_mutex_.
+ InstrumentedMutex log_write_mutex_;
+
+ std::atomic<bool> shutting_down_;
+ std::atomic<bool> manual_compaction_paused_;
+ // This condition variable is signaled on these conditions:
+ // * whenever bg_compaction_scheduled_ goes down to 0
+ // * if AnyManualCompaction, whenever a compaction finishes, even if it hasn't
+ // made any progress
+ // * whenever a compaction made any progress
+ // * whenever bg_flush_scheduled_ or bg_purge_scheduled_ value decreases
+ // (i.e. whenever a flush is done, even if it didn't make any progress)
+ // * whenever there is an error in background purge, flush or compaction
+ // * whenever num_running_ingest_file_ goes to 0.
+ // * whenever pending_purge_obsolete_files_ goes to 0.
+ // * whenever disable_delete_obsolete_files_ goes to 0.
+ // * whenever SetOptions successfully updates options.
+ // * whenever a column family is dropped.
+ InstrumentedCondVar bg_cv_;
+ // Writes are protected by locking both mutex_ and log_write_mutex_, and reads
+ // must be under either mutex_ or log_write_mutex_. Since after ::Open,
+ // logfile_number_ is currently updated only in write_thread_, it can be read
+ // from the same write_thread_ without any locks.
+ uint64_t logfile_number_;
+ std::deque<uint64_t>
+ log_recycle_files_; // a list of log files that we can recycle
+ bool log_dir_synced_;
+ // Without two_write_queues, read and writes to log_empty_ are protected by
+ // mutex_. Since it is currently updated/read only in write_thread_, it can be
+ // accessed from the same write_thread_ without any locks. With
+ // two_write_queues writes, where it can be updated in different threads,
+ // read and writes are protected by log_write_mutex_ instead. This is to avoid
+ // expensive mutex_ lock during WAL write, which updates log_empty_.
+ bool log_empty_;
+
+ ColumnFamilyHandleImpl* persist_stats_cf_handle_;
+
+ bool persistent_stats_cfd_exists_ = true;
+
+ // Without two_write_queues, read and writes to alive_log_files_ are
+ // protected by mutex_. However since back() is never popped, and push_back()
+ // is done only from write_thread_, the same thread can access the item
+ // referred to by back() without mutex_. With two_write_queues_, writes
+ // are protected by locking both mutex_ and log_write_mutex_, and reads must
+ // be under either mutex_ or log_write_mutex_.
+ std::deque<LogFileNumberSize> alive_log_files_;
+ // Log files that aren't fully synced, and the current log file.
+ // Synchronization:
+ // - push_back() is done from write_thread_ with locked mutex_ and
+ // log_write_mutex_
+ // - pop_front() is done from any thread with locked mutex_ and
+ // log_write_mutex_
+ // - reads are done with either locked mutex_ or log_write_mutex_
+ // - back() and items with getting_synced=true are not popped,
+ // - The same thread that sets getting_synced=true will reset it.
+ // - it follows that the object referred by back() can be safely read from
+ // the write_thread_ without using mutex
+ // - it follows that the items with getting_synced=true can be safely read
+ // from the same thread that has set getting_synced=true
+ std::deque<LogWriterNumber> logs_;
+ // Signaled when getting_synced becomes false for some of the logs_.
+ InstrumentedCondVar log_sync_cv_;
+ // This is the app-level state that is written to the WAL but will be used
+ // only during recovery. Using this feature enables not writing the state to
+ // memtable on normal writes and hence improving the throughput. Each new
+ // write of the state will replace the previous state entirely even if the
+ // keys in the two consecutive states do not overlap.
+ // It is protected by log_write_mutex_ when two_write_queues_ is enabled.
+ // Otherwise only the head of write_thread_ can access it.
+ WriteBatch cached_recoverable_state_;
+ std::atomic<bool> cached_recoverable_state_empty_ = {true};
+ std::atomic<uint64_t> total_log_size_;
+
+ // If this is non-empty, we need to delete these log files in background
+ // threads. Protected by db mutex.
+ autovector<log::Writer*> logs_to_free_;
+
+ bool is_snapshot_supported_;
+
+ std::map<uint64_t, std::map<std::string, uint64_t>> stats_history_;
+
+ std::map<std::string, uint64_t> stats_slice_;
+
+ bool stats_slice_initialized_ = false;
+
+ Directories directories_;
+
+ WriteBufferManager* write_buffer_manager_;
+
+ WriteThread write_thread_;
+ WriteBatch tmp_batch_;
+ // The write thread used when writers have no memtable write. This is used
+ // in 2PC to batch the prepares separately from the serial commit.
+ WriteThread nonmem_write_thread_;
+
+ WriteController write_controller_;
+
+ // Size of the last batch group. In slowdown mode, next write needs to
+ // sleep if it uses up the quota.
+ // Note: This is to protect memtable and compaction. If the batch only writes
+ // to the WAL, its size need not be included in this.
+ uint64_t last_batch_group_size_;
+
+ FlushScheduler flush_scheduler_;
+
+ TrimHistoryScheduler trim_history_scheduler_;
+
+ SnapshotList snapshots_;
+
+ // For each background job, pending_outputs_ keeps the current file number at
+ // the time that background job started.
+ // FindObsoleteFiles()/PurgeObsoleteFiles() never deletes any file that has
+ // number bigger than any of the file number in pending_outputs_. Since file
+ // numbers grow monotonically, this also means that pending_outputs_ is always
+ // sorted. After a background job is done executing, its file number is
+ // deleted from pending_outputs_, which allows PurgeObsoleteFiles() to clean
+ // it up.
+ // State is protected with db mutex.
+ std::list<uint64_t> pending_outputs_;
+
+ // flush_queue_ and compaction_queue_ hold column families that we need to
+ // flush and compact, respectively.
+ // A column family is inserted into flush_queue_ when it satisfies condition
+ // cfd->imm()->IsFlushPending()
+ // A column family is inserted into compaction_queue_ when it satisfies
+ // condition cfd->NeedsCompaction()
+ // Column families in this list are all Ref()-erenced
+ // TODO(icanadi) Provide some kind of ReferencedColumnFamily class that will
+ // do RAII on ColumnFamilyData
+ // Column families are in this queue when they need to be flushed or
+ // compacted. Consumers of these queues are flush and compaction threads. When
+ // column family is put on this queue, we increase unscheduled_flushes_ and
+ // unscheduled_compactions_. When these variables are bigger than zero, that
+ // means we need to schedule background threads for flush and compaction.
+ // Once the background threads are scheduled, we decrease unscheduled_flushes_
+ // and unscheduled_compactions_. That way we keep track of number of
+ // compaction and flush threads we need to schedule. This scheduling is done
+ // in MaybeScheduleFlushOrCompaction()
+ // invariant(column family present in flush_queue_ <==>
+ // ColumnFamilyData::pending_flush_ == true)
+ std::deque<FlushRequest> flush_queue_;
+ // invariant(column family present in compaction_queue_ <==>
+ // ColumnFamilyData::pending_compaction_ == true)
+ std::deque<ColumnFamilyData*> compaction_queue_;
+
+ // A map to store file numbers and filenames of the files to be purged
+ std::unordered_map<uint64_t, PurgeFileInfo> purge_files_;
+
+ // A set to store the file numbers that have been assigned to certain
+ // JobContext. Current implementation tracks ssts only.
+ std::unordered_set<uint64_t> files_grabbed_for_purge_;
+
+ // A queue to store log writers to close
+ std::deque<log::Writer*> logs_to_free_queue_;
+ std::deque<SuperVersion*> superversions_to_free_queue_;
+ int unscheduled_flushes_;
+ int unscheduled_compactions_;
+
+ // count how many background compactions are running or have been scheduled in
+ // the BOTTOM pool
+ int bg_bottom_compaction_scheduled_;
+
+ // count how many background compactions are running or have been scheduled
+ int bg_compaction_scheduled_;
+
+ // stores the number of compactions that are currently running
+ int num_running_compactions_;
+
+ // number of background memtable flush jobs, submitted to the HIGH pool
+ int bg_flush_scheduled_;
+
+ // stores the number of flushes that are currently running
+ int num_running_flushes_;
+
+ // number of background obsolete file purge jobs, submitted to the HIGH pool
+ int bg_purge_scheduled_;
+
+ std::deque<ManualCompactionState*> manual_compaction_dequeue_;
+
+ // Counter that controls deletion of obsolete files.
+ // If 0, deletion is enabled.
+ // If non-zero, files will not be deleted.
+ // Using a counter allows two different threads to call
+ // EnableFileDeletions() and DisableFileDeletions()
+ // without any synchronization.
+ int disable_delete_obsolete_files_;
+
+ // Number of times FindObsoleteFiles has found deletable files and the
+ // corresponding call to PurgeObsoleteFiles has not yet finished.
+ int pending_purge_obsolete_files_;
+
+ // last time when DeleteObsoleteFiles with full scan was executed. Originally
+ // initialized with startup time.
+ uint64_t delete_obsolete_files_last_run_;
+
+ // last time stats were dumped to LOG
+ std::atomic<uint64_t> last_stats_dump_time_microsec_;
+
+ // The thread that wants to switch memtable can wait on this cv until the
+ // pending writes to memtable finish.
+ std::condition_variable switch_cv_;
+ // The mutex used by switch_cv_. mutex_ should be acquired beforehand.
+ std::mutex switch_mutex_;
+ // Number of threads intending to write to memtable
+ std::atomic<size_t> pending_memtable_writes_ = {};
+
+ // Each flush or compaction gets its own job id. This counter makes sure
+ // they're unique
+ std::atomic<int> next_job_id_;
+
+ // A flag indicating whether the current rocksdb database has any
+ // data that is not yet persisted into either WAL or SST file.
+ // Used when disableWAL is true.
+ std::atomic<bool> has_unpersisted_data_;
+
+ // Set if an attempt was made to flush all column families that
+ // the oldest log depends on but uncommitted data in the oldest
+ // log prevents the log from being released.
+ // We must attempt to free the dependent memtables again
+ // at a later time after the transaction in the oldest
+ // log is fully committed.
+ bool unable_to_release_oldest_log_;
+
+ static const int KEEP_LOG_FILE_NUM = 1000;
+ // MSVC version 1800 still does not have constexpr for ::max()
+ static const uint64_t kNoTimeOut = port::kMaxUint64;
+
+ std::string db_absolute_path_;
+
+ // Number of running IngestExternalFile() or CreateColumnFamilyWithImport()
+ // calls.
+ // REQUIRES: mutex held
+ int num_running_ingest_file_;
+
+#ifndef ROCKSDB_LITE
+ WalManager wal_manager_;
+#endif // ROCKSDB_LITE
+
+ // Unified interface for logging events
+ EventLogger event_logger_;
+
+ // A value of > 0 temporarily disables scheduling of background work
+ int bg_work_paused_;
+
+ // A value of > 0 temporarily disables scheduling of background compaction
+ int bg_compaction_paused_;
+
+ // Guard against multiple concurrent refitting
+ bool refitting_level_;
+
+ // Indicate DB was opened successfully
+ bool opened_successfully_;
+
+ // The min threshold to trigger bottommost compaction for removing
+ // garbage, among all column families.
+ SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+
+ LogsWithPrepTracker logs_with_prep_tracker_;
+
+ // Callback for compaction to check if a key is visible to a snapshot.
+ // REQUIRES: mutex held
+ std::unique_ptr<SnapshotChecker> snapshot_checker_;
+
+ // Callback for when the cached_recoverable_state_ is written to memtable
+ // Only to be set during initialization
+ std::unique_ptr<PreReleaseCallback> recoverable_state_pre_release_callback_;
+
+ // handle for scheduling stats dumping at fixed intervals
+ // REQUIRES: mutex locked
+ std::unique_ptr<ROCKSDB_NAMESPACE::RepeatableThread> thread_dump_stats_;
+
+ // handle for scheduling stats snapshotting at fixed intervals
+ // REQUIRES: mutex locked
+ std::unique_ptr<ROCKSDB_NAMESPACE::RepeatableThread> thread_persist_stats_;
+
+ // When set, we use a separate queue for writes that don't write to memtable.
+ // In 2PC these are the writes at Prepare phase.
+ const bool two_write_queues_;
+ const bool manual_wal_flush_;
+
+ // If set, LastSequence also indicates the last published sequence visible
+ // to the readers. Otherwise LastPublishedSequence should be used.
+ const bool last_seq_same_as_publish_seq_;
+ // It indicates that a customized gc algorithm must be used for
+ // flush/compaction and if it is not provided via SnapshotChecker, we should
+ // disable gc to be safe.
+ const bool use_custom_gc_;
+ // Flag to indicate that the DB instance shutdown has been initiated. This
+ // is different from the shutting_down_ atomic in that it is set at the beginning
+ // of shutdown sequence, specifically in order to prevent any background
+ // error recovery from going on in parallel. The latter, shutting_down_,
+ // is set a little later during the shutdown after scheduling memtable
+ // flushes
+ std::atomic<bool> shutdown_initiated_;
+ // Flag to indicate whether sst_file_manager object was allocated in
+ // DB::Open() or passed to us
+ bool own_sfm_;
+
+ // Clients must periodically call SetPreserveDeletesSequenceNumber()
+ // to advance this seqnum. Default value is 0 which means ALL deletes are
+ // preserved. Note that this has no effect if DBOptions.preserve_deletes
+ // is set to false.
+ std::atomic<SequenceNumber> preserve_deletes_seqnum_;
+ const bool preserve_deletes_;
+
+ // Flag to check whether Close() has been called on this DB
+ bool closed_;
+
+ ErrorHandler error_handler_;
+
+ // Conditional variable to coordinate installation of atomic flush results.
+ // With atomic flush, each bg thread installs the result of flushing multiple
+ // column families, and different threads can flush different column
+ // families. It's difficult to rely on one thread to perform batch
+ // installation for all threads. This is different from the non-atomic flush
+ // case.
+ // atomic_flush_install_cv_ makes sure that threads install atomic flush
+ // results sequentially. Flush results of memtables with lower IDs get
+ // installed to MANIFEST first.
+ InstrumentedCondVar atomic_flush_install_cv_;
+
+ bool wal_in_db_path_;
+};
+
+extern Options SanitizeOptions(const std::string& db, const Options& src);
+
+extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src);
+
+extern CompressionType GetCompressionFlush(
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options);
+
+// Return the earliest log file to keep after the memtable flush is
+// finalized.
+// `cfd_to_flush` is the column family whose memtable (specified in
+// `memtables_to_flush`) will be flushed and thus will not depend on any WAL
+// file.
+// The function is only applicable to 2pc mode.
+extern uint64_t PrecomputeMinLogNumberToKeep(
+ VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
+ autovector<VersionEdit*> edit_list,
+ const autovector<MemTable*>& memtables_to_flush,
+ LogsWithPrepTracker* prep_tracker);
+
+// `cfd_to_flush` is the column family whose memtable will be flushed and thus
+// will not depend on any WAL file. nullptr means no memtable is being flushed.
+// The function is only applicable to 2pc mode.
+extern uint64_t FindMinPrepLogReferencedByMemTable(
+ VersionSet* vset, const ColumnFamilyData* cfd_to_flush,
+ const autovector<MemTable*>& memtables_to_flush);
+
+// Fix user-supplied options to be reasonable
+template <class T, class V>
+static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
+ if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
+ if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
+}
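+
+// A quick illustration (hypothetical values, not from the source): clamp a
+// user-supplied option into the range [20, 1000000].
+//
+//   int max_open_files = 10;
+//   ClipToRange(&max_open_files, 20, 1000000);  // max_open_files becomes 20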
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_compaction_flush.cc b/src/rocksdb/db/db_impl/db_impl_compaction_flush.cc
new file mode 100644
index 000000000..c7b3510c3
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_compaction_flush.cc
@@ -0,0 +1,3116 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_impl/db_impl.h"
+
+#include <cinttypes>
+
+#include "db/builder.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "file/sst_file_manager_impl.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/thread_status_updater.h"
+#include "monitoring/thread_status_util.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/concurrent_task_limiter_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool DBImpl::EnoughRoomForCompaction(
+ ColumnFamilyData* cfd, const std::vector<CompactionInputFiles>& inputs,
+ bool* sfm_reserved_compact_space, LogBuffer* log_buffer) {
+ // Check if we have enough room to do the compaction
+ bool enough_room = true;
+#ifndef ROCKSDB_LITE
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ if (sfm) {
+ // Pass the current bg_error_ to SFM so it can decide what checks to
+ // perform. If this DB instance hasn't seen any error yet, the SFM can be
+ // optimistic and not do disk space checks
+ enough_room =
+ sfm->EnoughRoomForCompaction(cfd, inputs, error_handler_.GetBGError());
+ if (enough_room) {
+ *sfm_reserved_compact_space = true;
+ }
+ }
+#else
+ (void)cfd;
+ (void)inputs;
+ (void)sfm_reserved_compact_space;
+#endif // ROCKSDB_LITE
+ if (!enough_room) {
+ // Just in case tests want to change the value of enough_room
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::BackgroundCompaction():CancelledCompaction", &enough_room);
+ ROCKS_LOG_BUFFER(log_buffer,
+ "Cancelled compaction because not enough room");
+ RecordTick(stats_, COMPACTION_CANCELLED, 1);
+ }
+ return enough_room;
+}
+
+bool DBImpl::RequestCompactionToken(ColumnFamilyData* cfd, bool force,
+ std::unique_ptr<TaskLimiterToken>* token,
+ LogBuffer* log_buffer) {
+ assert(*token == nullptr);
+ auto limiter = static_cast<ConcurrentTaskLimiterImpl*>(
+ cfd->ioptions()->compaction_thread_limiter.get());
+ if (limiter == nullptr) {
+ return true;
+ }
+ *token = limiter->GetToken(force);
+ if (*token != nullptr) {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "Thread limiter [%s] increase [%s] compaction task, "
+ "force: %s, tasks after: %d",
+ limiter->GetName().c_str(), cfd->GetName().c_str(),
+ force ? "true" : "false", limiter->GetOutstandingTask());
+ return true;
+ }
+ return false;
+}
+
+Status DBImpl::SyncClosedLogs(JobContext* job_context) {
+ TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Start");
+ mutex_.AssertHeld();
+ autovector<log::Writer*, 1> logs_to_sync;
+ uint64_t current_log_number = logfile_number_;
+ while (logs_.front().number < current_log_number &&
+ logs_.front().getting_synced) {
+ log_sync_cv_.Wait();
+ }
+ for (auto it = logs_.begin();
+ it != logs_.end() && it->number < current_log_number; ++it) {
+ auto& log = *it;
+ assert(!log.getting_synced);
+ log.getting_synced = true;
+ logs_to_sync.push_back(log.writer);
+ }
+
+ Status s;
+ if (!logs_to_sync.empty()) {
+ mutex_.Unlock();
+
+ for (log::Writer* log : logs_to_sync) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[JOB %d] Syncing log #%" PRIu64, job_context->job_id,
+ log->get_log_number());
+ s = log->file()->Sync(immutable_db_options_.use_fsync);
+ if (!s.ok()) {
+ break;
+ }
+
+ if (immutable_db_options_.recycle_log_file_num > 0) {
+ s = log->Close();
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ if (s.ok()) {
+ s = directories_.GetWalDir()->Fsync();
+ }
+
+ mutex_.Lock();
+
+ // "number <= current_log_number - 1" is equivalent to
+ // "number < current_log_number".
+ MarkLogsSynced(current_log_number - 1, true, s);
+ if (!s.ok()) {
+ error_handler_.SetBGError(s, BackgroundErrorReason::kFlush);
+ TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Failed");
+ return s;
+ }
+ }
+ return s;
+}
+
+Status DBImpl::FlushMemTableToOutputFile(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ bool* made_progress, JobContext* job_context,
+ SuperVersionContext* superversion_context,
+ std::vector<SequenceNumber>& snapshot_seqs,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
+ Env::Priority thread_pri) {
+ mutex_.AssertHeld();
+ assert(cfd->imm()->NumNotFlushed() != 0);
+ assert(cfd->imm()->IsFlushPending());
+
+ FlushJob flush_job(
+ dbname_, cfd, immutable_db_options_, mutable_cf_options,
+ nullptr /* memtable_id */, file_options_for_compaction_, versions_.get(),
+ &mutex_, &shutting_down_, snapshot_seqs, earliest_write_conflict_snapshot,
+ snapshot_checker, job_context, log_buffer, directories_.GetDbDir(),
+ GetDataDir(cfd, 0U),
+ GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_,
+ &event_logger_, mutable_cf_options.report_bg_io_stats,
+ true /* sync_output_directory */, true /* write_manifest */, thread_pri);
+
+ FileMetaData file_meta;
+
+ TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:BeforePickMemtables");
+ flush_job.PickMemTable();
+ TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:AfterPickMemtables");
+
+#ifndef ROCKSDB_LITE
+ // may temporarily unlock and lock the mutex.
+ NotifyOnFlushBegin(cfd, &file_meta, mutable_cf_options, job_context->job_id);
+#endif // ROCKSDB_LITE
+
+ Status s;
+ if (logfile_number_ > 0 &&
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies() > 1) {
+ // If there is more than one column family, we need to make sure that
+ // all the log files except the most recent one are synced. Otherwise if
+ // the host crashes after flushing and before WAL is persistent, the
+ // flushed SST may contain data from write batches whose updates to
+ // other column families are missing.
+ // SyncClosedLogs() may unlock and re-lock the db_mutex.
+ s = SyncClosedLogs(job_context);
+ } else {
+ TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Skip");
+ }
+
+ // Within flush_job.Run, rocksdb may call event listener to notify
+ // file creation and deletion.
+ //
+ // Note that flush_job.Run will unlock and lock the db_mutex,
+ // and EventListener callback will be called when the db_mutex
+ // is unlocked by the current thread.
+ if (s.ok()) {
+ s = flush_job.Run(&logs_with_prep_tracker_, &file_meta);
+ } else {
+ flush_job.Cancel();
+ }
+
+ if (s.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd, superversion_context,
+ mutable_cf_options);
+ if (made_progress) {
+ *made_progress = true;
+ }
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n",
+ cfd->GetName().c_str(),
+ cfd->current()->storage_info()->LevelSummary(&tmp));
+ }
+
+ if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) {
+ Status new_bg_error = s;
+ error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
+ }
+ if (s.ok()) {
+#ifndef ROCKSDB_LITE
+ // may temporarily unlock and lock the mutex.
+ NotifyOnFlushCompleted(cfd, mutable_cf_options,
+ flush_job.GetCommittedFlushJobsInfo());
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ if (sfm) {
+ // Notify sst_file_manager that a new file was added
+ std::string file_path = MakeTableFileName(
+ cfd->ioptions()->cf_paths[0].path, file_meta.fd.GetNumber());
+ sfm->OnAddFile(file_path);
+ if (sfm->IsMaxAllowedSpaceReached()) {
+ Status new_bg_error =
+ Status::SpaceLimit("Max allowed space was reached");
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached",
+ &new_bg_error);
+ error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
+ }
+ }
+#endif // ROCKSDB_LITE
+ }
+ TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:Finish");
+ return s;
+}
+
+Status DBImpl::FlushMemTablesToOutputFiles(
+ const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+ JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri) {
+ if (immutable_db_options_.atomic_flush) {
+ return AtomicFlushMemTablesToOutputFiles(
+ bg_flush_args, made_progress, job_context, log_buffer, thread_pri);
+ }
+ std::vector<SequenceNumber> snapshot_seqs;
+ SequenceNumber earliest_write_conflict_snapshot;
+ SnapshotChecker* snapshot_checker;
+ GetSnapshotContext(job_context, &snapshot_seqs,
+ &earliest_write_conflict_snapshot, &snapshot_checker);
+ Status status;
+ for (auto& arg : bg_flush_args) {
+ ColumnFamilyData* cfd = arg.cfd_;
+ MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+ SuperVersionContext* superversion_context = arg.superversion_context_;
+ Status s = FlushMemTableToOutputFile(
+ cfd, mutable_cf_options, made_progress, job_context,
+ superversion_context, snapshot_seqs, earliest_write_conflict_snapshot,
+ snapshot_checker, log_buffer, thread_pri);
+ if (!s.ok()) {
+ status = s;
+ if (!s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) {
+ // At this point, DB is not shutting down, nor is cfd dropped.
+ // Something is wrong, thus we break out of the loop.
+ break;
+ }
+ }
+ }
+ return status;
+}
+
+/*
+ * Atomically flushes multiple column families.
+ *
+ * For each column family, all memtables with ID smaller than or equal to the
+ * ID specified in bg_flush_args will be flushed. Only after all column
+ * families finish flush will this function commit to MANIFEST. If any of the
+ * column families are not flushed successfully, this function does not have
+ * any side-effect on the state of the database.
+ */
+Status DBImpl::AtomicFlushMemTablesToOutputFiles(
+ const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+ JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri) {
+ mutex_.AssertHeld();
+
+ autovector<ColumnFamilyData*> cfds;
+ for (const auto& arg : bg_flush_args) {
+ cfds.emplace_back(arg.cfd_);
+ }
+
+#ifndef NDEBUG
+ for (const auto cfd : cfds) {
+ assert(cfd->imm()->NumNotFlushed() != 0);
+ assert(cfd->imm()->IsFlushPending());
+ }
+#endif /* !NDEBUG */
+
+ std::vector<SequenceNumber> snapshot_seqs;
+ SequenceNumber earliest_write_conflict_snapshot;
+ SnapshotChecker* snapshot_checker;
+ GetSnapshotContext(job_context, &snapshot_seqs,
+ &earliest_write_conflict_snapshot, &snapshot_checker);
+
+ autovector<Directory*> distinct_output_dirs;
+ autovector<std::string> distinct_output_dir_paths;
+ std::vector<std::unique_ptr<FlushJob>> jobs;
+ std::vector<MutableCFOptions> all_mutable_cf_options;
+ int num_cfs = static_cast<int>(cfds.size());
+ all_mutable_cf_options.reserve(num_cfs);
+ for (int i = 0; i < num_cfs; ++i) {
+ auto cfd = cfds[i];
+ Directory* data_dir = GetDataDir(cfd, 0U);
+ const std::string& curr_path = cfd->ioptions()->cf_paths[0].path;
+
+ // Add to distinct output directories if eligible. Use linear search. Since
+ // the number of elements in the vector is not large, performance should be
+ // tolerable.
+ bool found = false;
+ for (const auto& path : distinct_output_dir_paths) {
+ if (path == curr_path) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ distinct_output_dir_paths.emplace_back(curr_path);
+ distinct_output_dirs.emplace_back(data_dir);
+ }
+
+ all_mutable_cf_options.emplace_back(*cfd->GetLatestMutableCFOptions());
+ const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.back();
+ const uint64_t* max_memtable_id = &(bg_flush_args[i].max_memtable_id_);
+ jobs.emplace_back(new FlushJob(
+ dbname_, cfd, immutable_db_options_, mutable_cf_options,
+ max_memtable_id, file_options_for_compaction_, versions_.get(), &mutex_,
+ &shutting_down_, snapshot_seqs, earliest_write_conflict_snapshot,
+ snapshot_checker, job_context, log_buffer, directories_.GetDbDir(),
+ data_dir, GetCompressionFlush(*cfd->ioptions(), mutable_cf_options),
+ stats_, &event_logger_, mutable_cf_options.report_bg_io_stats,
+ false /* sync_output_directory */, false /* write_manifest */,
+ thread_pri));
+ jobs.back()->PickMemTable();
+ }
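+ // Note that each FlushJob above is created with sync_output_directory ==
+ // false and write_manifest == false: the output directories are synced once
+ // for all jobs further below, and the MANIFEST is committed for all column
+ // families together via InstallMemtableAtomicFlushResults().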
+
+ std::vector<FileMetaData> file_meta(num_cfs);
+ Status s;
+ assert(num_cfs == static_cast<int>(jobs.size()));
+
+#ifndef ROCKSDB_LITE
+ for (int i = 0; i != num_cfs; ++i) {
+ const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.at(i);
+ // may temporarily unlock and lock the mutex.
+ NotifyOnFlushBegin(cfds[i], &file_meta[i], mutable_cf_options,
+ job_context->job_id);
+ }
+#endif /* !ROCKSDB_LITE */
+
+ if (logfile_number_ > 0) {
+ // TODO (yanqin) investigate whether we should sync the closed logs for
+ // single column family case.
+ s = SyncClosedLogs(job_context);
+ }
+
+ // exec_status stores the execution status of flush_jobs as
+ // <bool /* executed */, Status /* status code */>
+ autovector<std::pair<bool, Status>> exec_status;
+ for (int i = 0; i != num_cfs; ++i) {
+ // Initially all jobs are not executed, with status OK.
+ exec_status.emplace_back(false, Status::OK());
+ }
+
+ if (s.ok()) {
+ // TODO (yanqin): parallelize jobs with threads.
+ for (int i = 1; i != num_cfs; ++i) {
+ exec_status[i].second =
+ jobs[i]->Run(&logs_with_prep_tracker_, &file_meta[i]);
+ exec_status[i].first = true;
+ }
+ if (num_cfs > 1) {
+ TEST_SYNC_POINT(
+ "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:1");
+ TEST_SYNC_POINT(
+ "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:2");
+ }
+ assert(exec_status.size() > 0);
+ assert(!file_meta.empty());
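+ // With the serial implementation above (see the TODO), all jobs run on this
+ // thread; job 0 is run after the sync points so that tests can observe a
+ // state in which some, but not all, flush jobs of the atomic flush have
+ // completed.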
+ exec_status[0].second =
+ jobs[0]->Run(&logs_with_prep_tracker_, &file_meta[0]);
+ exec_status[0].first = true;
+
+ Status error_status;
+ for (const auto& e : exec_status) {
+ if (!e.second.ok()) {
+ s = e.second;
+ if (!e.second.IsShutdownInProgress() &&
+ !e.second.IsColumnFamilyDropped()) {
+ // If a flush job did not return OK, and the CF is not dropped, and
+ // the DB is not shutting down, then we have to return this result to
+ // the caller later.
+ error_status = e.second;
+ }
+ }
+ }
+
+ s = error_status.ok() ? s : error_status;
+ }
+
+ if (s.IsColumnFamilyDropped()) {
+ s = Status::OK();
+ }
+
+ if (s.ok() || s.IsShutdownInProgress()) {
+ // Sync on all distinct output directories.
+ for (auto dir : distinct_output_dirs) {
+ if (dir != nullptr) {
+ Status error_status = dir->Fsync();
+ if (!error_status.ok()) {
+ s = error_status;
+ break;
+ }
+ }
+ }
+ } else {
+ // Need to undo the atomic flush if something went wrong, i.e. s is not OK
+ // and it is not because of a CF drop.
+ // The flush jobs that have NOT executed must be cancelled so that the
+ // versions they picked are unreferenced.
+ for (int i = 0; i != num_cfs; ++i) {
+ if (!exec_status[i].first) {
+ jobs[i]->Cancel();
+ }
+ }
+ for (int i = 0; i != num_cfs; ++i) {
+ if (exec_status[i].first && exec_status[i].second.ok()) {
+ auto& mems = jobs[i]->GetMemTables();
+ cfds[i]->imm()->RollbackMemtableFlush(mems,
+ file_meta[i].fd.GetNumber());
+ }
+ }
+ }
+
+ if (s.ok()) {
+ auto wait_to_install_func = [&]() {
+ bool ready = true;
+ for (size_t i = 0; i != cfds.size(); ++i) {
+ const auto& mems = jobs[i]->GetMemTables();
+ if (cfds[i]->IsDropped()) {
+ // If the column family is dropped, then do not wait.
+ continue;
+ } else if (!mems.empty() &&
+ cfds[i]->imm()->GetEarliestMemTableID() < mems[0]->GetID()) {
+ // If a flush job needs to install the flush result for mems and
+ // mems[0] is not the earliest memtable, then another thread must be
+ // installing flush results for the same column family, so the
+ // current thread needs to wait.
+ ready = false;
+ break;
+ } else if (mems.empty() && cfds[i]->imm()->GetEarliestMemTableID() <=
+ bg_flush_args[i].max_memtable_id_) {
+ // If a flush job does not need to install flush results, then it has
+ // to wait until all memtables up to max_memtable_id_ (inclusive) are
+ // installed.
+ ready = false;
+ break;
+ }
+ }
+ return ready;
+ };
+
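+ // If the DB was already stopped by a background error at this point, this
+ // atomic flush is part of error recovery: the wait below then continues only
+ // while the recovery error is OK, and the recovery error (rather than the
+ // already-set background error) is reported to the caller.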
+ bool resuming_from_bg_err = error_handler_.IsDBStopped();
+ while ((!error_handler_.IsDBStopped() ||
+ error_handler_.GetRecoveryError().ok()) &&
+ !wait_to_install_func()) {
+ atomic_flush_install_cv_.Wait();
+ }
+
+ s = resuming_from_bg_err ? error_handler_.GetRecoveryError()
+ : error_handler_.GetBGError();
+ }
+
+ if (s.ok()) {
+ autovector<ColumnFamilyData*> tmp_cfds;
+ autovector<const autovector<MemTable*>*> mems_list;
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ autovector<FileMetaData*> tmp_file_meta;
+ for (int i = 0; i != num_cfs; ++i) {
+ const auto& mems = jobs[i]->GetMemTables();
+ if (!cfds[i]->IsDropped() && !mems.empty()) {
+ tmp_cfds.emplace_back(cfds[i]);
+ mems_list.emplace_back(&mems);
+ mutable_cf_options_list.emplace_back(&all_mutable_cf_options[i]);
+ tmp_file_meta.emplace_back(&file_meta[i]);
+ }
+ }
+
+ s = InstallMemtableAtomicFlushResults(
+ nullptr /* imm_lists */, tmp_cfds, mutable_cf_options_list, mems_list,
+ versions_.get(), &mutex_, tmp_file_meta,
+ &job_context->memtables_to_free, directories_.GetDbDir(), log_buffer);
+ }
+
+ if (s.ok()) {
+ assert(num_cfs ==
+ static_cast<int>(job_context->superversion_contexts.size()));
+ for (int i = 0; i != num_cfs; ++i) {
+ if (cfds[i]->IsDropped()) {
+ continue;
+ }
+ InstallSuperVersionAndScheduleWork(cfds[i],
+ &job_context->superversion_contexts[i],
+ all_mutable_cf_options[i]);
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n",
+ cfds[i]->GetName().c_str(),
+ cfds[i]->current()->storage_info()->LevelSummary(&tmp));
+ }
+ if (made_progress) {
+ *made_progress = true;
+ }
+#ifndef ROCKSDB_LITE
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ assert(all_mutable_cf_options.size() == static_cast<size_t>(num_cfs));
+ for (int i = 0; i != num_cfs; ++i) {
+ if (cfds[i]->IsDropped()) {
+ continue;
+ }
+ NotifyOnFlushCompleted(cfds[i], all_mutable_cf_options[i],
+ jobs[i]->GetCommittedFlushJobsInfo());
+ if (sfm) {
+ std::string file_path = MakeTableFileName(
+ cfds[i]->ioptions()->cf_paths[0].path, file_meta[i].fd.GetNumber());
+ sfm->OnAddFile(file_path);
+ if (sfm->IsMaxAllowedSpaceReached() &&
+ error_handler_.GetBGError().ok()) {
+ Status new_bg_error =
+ Status::SpaceLimit("Max allowed space was reached");
+ error_handler_.SetBGError(new_bg_error,
+ BackgroundErrorReason::kFlush);
+ }
+ }
+ }
+#endif // ROCKSDB_LITE
+ }
+
+ if (!s.ok() && !s.IsShutdownInProgress()) {
+ Status new_bg_error = s;
+ error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
+ }
+
+ return s;
+}
+
+void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta,
+ const MutableCFOptions& mutable_cf_options,
+ int job_id) {
+#ifndef ROCKSDB_LITE
+ if (immutable_db_options_.listeners.size() == 0U) {
+ return;
+ }
+ mutex_.AssertHeld();
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return;
+ }
+ bool triggered_writes_slowdown =
+ (cfd->current()->storage_info()->NumLevelFiles(0) >=
+ mutable_cf_options.level0_slowdown_writes_trigger);
+ bool triggered_writes_stop =
+ (cfd->current()->storage_info()->NumLevelFiles(0) >=
+ mutable_cf_options.level0_stop_writes_trigger);
+ // release lock while notifying events
+ mutex_.Unlock();
+ {
+ FlushJobInfo info{};
+ info.cf_id = cfd->GetID();
+ info.cf_name = cfd->GetName();
+ // TODO(yhchiang): make db_paths dynamic in case flush does not
+ // go to L0 in the future.
+ const uint64_t file_number = file_meta->fd.GetNumber();
+ info.file_path =
+ MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_number);
+ info.file_number = file_number;
+ info.thread_id = env_->GetThreadID();
+ info.job_id = job_id;
+ info.triggered_writes_slowdown = triggered_writes_slowdown;
+ info.triggered_writes_stop = triggered_writes_stop;
+ info.smallest_seqno = file_meta->fd.smallest_seqno;
+ info.largest_seqno = file_meta->fd.largest_seqno;
+ info.flush_reason = cfd->GetFlushReason();
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnFlushBegin(this, info);
+ }
+ }
+ mutex_.Lock();
+ // no need to signal bg_cv_ as it will be signaled at the end of the
+ // flush process.
+#else
+ (void)cfd;
+ (void)file_meta;
+ (void)mutable_cf_options;
+ (void)job_id;
+#endif // ROCKSDB_LITE
+}
+
+void DBImpl::NotifyOnFlushCompleted(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ std::list<std::unique_ptr<FlushJobInfo>>* flush_jobs_info) {
+#ifndef ROCKSDB_LITE
+ assert(flush_jobs_info != nullptr);
+ if (immutable_db_options_.listeners.size() == 0U) {
+ return;
+ }
+ mutex_.AssertHeld();
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return;
+ }
+ bool triggered_writes_slowdown =
+ (cfd->current()->storage_info()->NumLevelFiles(0) >=
+ mutable_cf_options.level0_slowdown_writes_trigger);
+ bool triggered_writes_stop =
+ (cfd->current()->storage_info()->NumLevelFiles(0) >=
+ mutable_cf_options.level0_stop_writes_trigger);
+ // release lock while notifying events
+ mutex_.Unlock();
+ {
+ for (auto& info : *flush_jobs_info) {
+ info->triggered_writes_slowdown = triggered_writes_slowdown;
+ info->triggered_writes_stop = triggered_writes_stop;
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnFlushCompleted(this, *info);
+ }
+ }
+ flush_jobs_info->clear();
+ }
+ mutex_.Lock();
+ // no need to signal bg_cv_ as it will be signaled at the end of the
+ // flush process.
+#else
+ (void)cfd;
+ (void)mutable_cf_options;
+ (void)flush_jobs_info;
+#endif // ROCKSDB_LITE
+}
+
+Status DBImpl::CompactRange(const CompactRangeOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+
+ if (options.target_path_id >= cfd->ioptions()->cf_paths.size()) {
+ return Status::InvalidArgument("Invalid target path ID");
+ }
+
+ bool exclusive = options.exclusive_manual_compaction;
+
+ bool flush_needed = true;
+ if (begin != nullptr && end != nullptr) {
+ // TODO(ajkr): We could also optimize away the flush in certain cases where
+ // one/both sides of the interval are unbounded. But it requires more
+ // changes to RangesOverlapWithMemtables.
+ Range range(*begin, *end);
+ SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+ cfd->RangesOverlapWithMemtables({range}, super_version, &flush_needed);
+ CleanupSuperVersion(super_version);
+ }
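+ // Note that flush_needed remains true when either end of the range is
+ // unbounded, so such a CompactRange always flushes the memtable first.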
+
+ Status s;
+ if (flush_needed) {
+ FlushOptions fo;
+ fo.allow_write_stall = options.allow_write_stall;
+ if (immutable_db_options_.atomic_flush) {
+ autovector<ColumnFamilyData*> cfds;
+ mutex_.Lock();
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ mutex_.Unlock();
+ s = AtomicFlushMemTables(cfds, fo, FlushReason::kManualCompaction,
+ false /* writes_stopped */);
+ } else {
+ s = FlushMemTable(cfd, fo, FlushReason::kManualCompaction,
+ false /* writes_stopped*/);
+ }
+ if (!s.ok()) {
+ LogFlush(immutable_db_options_.info_log);
+ return s;
+ }
+ }
+
+ int max_level_with_files = 0;
+ // max_file_num_to_ignore can be used to filter out newly created SST files,
+ // useful for bottom level compaction in a manual compaction
+ uint64_t max_file_num_to_ignore = port::kMaxUint64;
+ uint64_t next_file_number = port::kMaxUint64;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ Version* base = cfd->current();
+ for (int level = 1; level < base->storage_info()->num_non_empty_levels();
+ level++) {
+ if (base->storage_info()->OverlapInLevel(level, begin, end)) {
+ max_level_with_files = level;
+ }
+ }
+ next_file_number = versions_->current_next_file_number();
+ }
+
+ int final_output_level = 0;
+
+ if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal &&
+ cfd->NumberLevels() > 1) {
+ // Always compact all files together.
+ final_output_level = cfd->NumberLevels() - 1;
+ // if bottom most level is reserved
+ if (immutable_db_options_.allow_ingest_behind) {
+ final_output_level--;
+ }
+ s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels,
+ final_output_level, options, begin, end, exclusive,
+ false, max_file_num_to_ignore);
+ } else {
+ for (int level = 0; level <= max_level_with_files; level++) {
+ int output_level;
+ // In case the compaction is universal or if we're compacting the
+ // bottom-most level, the output level will be the same as the input one.
+ // Level 0 can never be the bottommost level (i.e. if all files are in
+ // level 0, we will compact to level 1).
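+ // Example (leveled compaction without dynamic level bytes): with
+ // max_level_with_files == 3, this loop issues L0->L1, L1->L2 and L2->L3
+ // compactions, followed by an L3->L3 pass for the bottommost level unless
+ // that pass is skipped by the bottommost_level_compaction checks below.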
+ if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+ cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+ output_level = level;
+ } else if (level == max_level_with_files && level > 0) {
+ if (options.bottommost_level_compaction ==
+ BottommostLevelCompaction::kSkip) {
+ // Skip bottommost level compaction
+ continue;
+ } else if (options.bottommost_level_compaction ==
+ BottommostLevelCompaction::kIfHaveCompactionFilter &&
+ cfd->ioptions()->compaction_filter == nullptr &&
+ cfd->ioptions()->compaction_filter_factory == nullptr) {
+ // Skip bottommost level compaction since we don't have a compaction
+ // filter
+ continue;
+ }
+ output_level = level;
+ // update max_file_num_to_ignore only for bottom level compaction
+ // because data in newly compacted files in middle levels may still need
+ // to be pushed down
+ max_file_num_to_ignore = next_file_number;
+ } else {
+ output_level = level + 1;
+ if (cfd->ioptions()->compaction_style == kCompactionStyleLevel &&
+ cfd->ioptions()->level_compaction_dynamic_level_bytes &&
+ level == 0) {
+ output_level = ColumnFamilyData::kCompactToBaseLevel;
+ }
+ }
+ s = RunManualCompaction(cfd, level, output_level, options, begin, end,
+ exclusive, false, max_file_num_to_ignore);
+ if (!s.ok()) {
+ break;
+ }
+ if (output_level == ColumnFamilyData::kCompactToBaseLevel) {
+ final_output_level = cfd->NumberLevels() - 1;
+ } else if (output_level > final_output_level) {
+ final_output_level = output_level;
+ }
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction()::1");
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction()::2");
+ }
+ }
+ if (!s.ok()) {
+ LogFlush(immutable_db_options_.info_log);
+ return s;
+ }
+
+ if (options.change_level) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[RefitLevel] waiting for background threads to stop");
+ s = PauseBackgroundWork();
+ if (s.ok()) {
+ s = ReFitLevel(cfd, final_output_level, options.target_level);
+ }
+ ContinueBackgroundWork();
+ }
+ LogFlush(immutable_db_options_.info_log);
+
+ {
+ InstrumentedMutexLock l(&mutex_);
+ // an automatic compaction that has been scheduled might have been
+ // preempted by the manual compactions. Need to schedule it back.
+ MaybeScheduleFlushOrCompaction();
+ }
+
+ return s;
+}
+
+Status DBImpl::CompactFiles(const CompactionOptions& compact_options,
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& input_file_names,
+ const int output_level, const int output_path_id,
+ std::vector<std::string>* const output_file_names,
+ CompactionJobInfo* compaction_job_info) {
+#ifdef ROCKSDB_LITE
+ (void)compact_options;
+ (void)column_family;
+ (void)input_file_names;
+ (void)output_level;
+ (void)output_path_id;
+ (void)output_file_names;
+ (void)compaction_job_info;
+ // not supported in lite version
+ return Status::NotSupported("Not supported in ROCKSDB LITE");
+#else
+ if (column_family == nullptr) {
+ return Status::InvalidArgument("ColumnFamilyHandle must be non-null.");
+ }
+
+ auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+ assert(cfd);
+
+ Status s;
+ JobContext job_context(0, true);
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+ immutable_db_options_.info_log.get());
+
+ // Perform CompactFiles
+ TEST_SYNC_POINT("TestCompactFiles::IngestExternalFile2");
+ {
+ InstrumentedMutexLock l(&mutex_);
+
+ // This call will unlock/lock the mutex to wait for current running
+ // IngestExternalFile() calls to finish.
+ WaitForIngestFile();
+
+ // We need to get current after `WaitForIngestFile`, because
+ // `IngestExternalFile` may add files that overlap with `input_file_names`
+ auto* current = cfd->current();
+ current->Ref();
+
+ s = CompactFilesImpl(compact_options, cfd, current, input_file_names,
+ output_file_names, output_level, output_path_id,
+ &job_context, &log_buffer, compaction_job_info);
+
+ current->Unref();
+ }
+
+ // Find and delete obsolete files
+ {
+ InstrumentedMutexLock l(&mutex_);
+ // If !s.ok(), this means that Compaction failed. In that case, we want
+ // to delete all obsolete files we might have created and we force
+ // FindObsoleteFiles(). This is because job_context does not
+ // catch all created files if compaction failed.
+ FindObsoleteFiles(&job_context, !s.ok());
+ } // release the mutex
+
+ // delete unnecessary files if any, this is done outside the mutex
+ if (job_context.HaveSomethingToClean() ||
+ job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+ // Have to flush the info logs before bg_compaction_scheduled_--
+ // because if bg_compaction_scheduled_ becomes 0 and the lock is
+ // released, the destructor of DB can kick in and destroy all the
+ // state of DB, so info_log might not be available after that point.
+ // The same applies to accessing any other state that DB owns.
+ log_buffer.FlushBufferToLog();
+ if (job_context.HaveSomethingToDelete()) {
+ // no mutex is locked here. No need to Unlock() and Lock() here.
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+ }
+
+ return s;
+#endif // ROCKSDB_LITE
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::CompactFilesImpl(
+ const CompactionOptions& compact_options, ColumnFamilyData* cfd,
+ Version* version, const std::vector<std::string>& input_file_names,
+ std::vector<std::string>* const output_file_names, const int output_level,
+ int output_path_id, JobContext* job_context, LogBuffer* log_buffer,
+ CompactionJobInfo* compaction_job_info) {
+ mutex_.AssertHeld();
+
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return Status::ShutdownInProgress();
+ }
+ if (manual_compaction_paused_.load(std::memory_order_acquire)) {
+ return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+
+ std::unordered_set<uint64_t> input_set;
+ for (const auto& file_name : input_file_names) {
+ input_set.insert(TableFileNameToNumber(file_name));
+ }
+
+ ColumnFamilyMetaData cf_meta;
+ // TODO(yhchiang): can directly use version here if none of the
+ // following functions call is pluggable to external developers.
+ version->GetColumnFamilyMetaData(&cf_meta);
+
+ if (output_path_id < 0) {
+ if (cfd->ioptions()->cf_paths.size() == 1U) {
+ output_path_id = 0;
+ } else {
+ return Status::NotSupported(
+ "Automatic output path selection is not "
+ "yet supported in CompactFiles()");
+ }
+ }
+
+ Status s = cfd->compaction_picker()->SanitizeCompactionInputFiles(
+ &input_set, cf_meta, output_level);
+ if (!s.ok()) {
+ return s;
+ }
+
+ std::vector<CompactionInputFiles> input_files;
+ s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, version->storage_info(), compact_options);
+ if (!s.ok()) {
+ return s;
+ }
+
+ for (const auto& inputs : input_files) {
+ if (cfd->compaction_picker()->AreFilesInCompaction(inputs.files)) {
+ return Status::Aborted(
+ "Some of the necessary compaction input "
+ "files are already being compacted");
+ }
+ }
+ bool sfm_reserved_compact_space = false;
+ // First check if we have enough room to do the compaction
+ bool enough_room = EnoughRoomForCompaction(
+ cfd, input_files, &sfm_reserved_compact_space, log_buffer);
+
+ if (!enough_room) {
+ // m's vars will get set properly at the end of this function,
+ // as long as status == CompactionTooLarge
+ return Status::CompactionTooLarge();
+ }
+
+ // At this point, CompactFiles will be run.
+ bg_compaction_scheduled_++;
+
+ std::unique_ptr<Compaction> c;
+ assert(cfd->compaction_picker());
+ c.reset(cfd->compaction_picker()->CompactFiles(
+ compact_options, input_files, output_level, version->storage_info(),
+ *cfd->GetLatestMutableCFOptions(), output_path_id));
+ // we already sanitized the set of input files and checked for conflicts
+ // without releasing the lock, so we're guaranteed a compaction can be formed.
+ assert(c != nullptr);
+
+ c->SetInputVersion(version);
+ // deletion compaction currently not allowed in CompactFiles.
+ assert(!c->deletion_compaction());
+
+ std::vector<SequenceNumber> snapshot_seqs;
+ SequenceNumber earliest_write_conflict_snapshot;
+ SnapshotChecker* snapshot_checker;
+ GetSnapshotContext(job_context, &snapshot_seqs,
+ &earliest_write_conflict_snapshot, &snapshot_checker);
+
+ std::unique_ptr<std::list<uint64_t>::iterator> pending_outputs_inserted_elem(
+ new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
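+ // Capturing the current file number here keeps FindObsoleteFiles() from
+ // treating the files this compaction is about to create as obsolete while
+ // the job is still running; it is released further below.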
+
+ assert(is_snapshot_supported_ || snapshots_.empty());
+ CompactionJobStats compaction_job_stats;
+ CompactionJob compaction_job(
+ job_context->job_id, c.get(), immutable_db_options_,
+ file_options_for_compaction_, versions_.get(), &shutting_down_,
+ preserve_deletes_seqnum_.load(), log_buffer, directories_.GetDbDir(),
+ GetDataDir(c->column_family_data(), c->output_path_id()), stats_, &mutex_,
+ &error_handler_, snapshot_seqs, earliest_write_conflict_snapshot,
+ snapshot_checker, table_cache_, &event_logger_,
+ c->mutable_cf_options()->paranoid_file_checks,
+ c->mutable_cf_options()->report_bg_io_stats, dbname_,
+ &compaction_job_stats, Env::Priority::USER, &manual_compaction_paused_);
+
+ // Creating a compaction influences the compaction score because the score
+ // takes running compactions into account (by skipping files that are already
+ // being compacted). Since we just changed compaction score, we recalculate it
+ // here.
+ version->storage_info()->ComputeCompactionScore(*cfd->ioptions(),
+ *c->mutable_cf_options());
+
+ compaction_job.Prepare();
+
+ mutex_.Unlock();
+ TEST_SYNC_POINT("CompactFilesImpl:0");
+ TEST_SYNC_POINT("CompactFilesImpl:1");
+ compaction_job.Run();
+ TEST_SYNC_POINT("CompactFilesImpl:2");
+ TEST_SYNC_POINT("CompactFilesImpl:3");
+ mutex_.Lock();
+
+ Status status = compaction_job.Install(*c->mutable_cf_options());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(c->column_family_data(),
+ &job_context->superversion_contexts[0],
+ *c->mutable_cf_options());
+ }
+ c->ReleaseCompactionFiles(s);
+#ifndef ROCKSDB_LITE
+ // Need to make sure SstFileManager does its bookkeeping
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ if (sfm && sfm_reserved_compact_space) {
+ sfm->OnCompactionCompletion(c.get());
+ }
+#endif // ROCKSDB_LITE
+
+ ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+ if (compaction_job_info != nullptr) {
+ BuildCompactionJobInfo(cfd, c.get(), s, compaction_job_stats,
+ job_context->job_id, version, compaction_job_info);
+ }
+
+ if (status.ok()) {
+ // Done
+ } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) {
+ // Ignore compaction errors found during shutting down
+ } else if (status.IsManualCompactionPaused()) {
+ // Don't report stopping manual compaction as error
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[%s] [JOB %d] Stopping manual compaction",
+ c->column_family_data()->GetName().c_str(),
+ job_context->job_id);
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "[%s] [JOB %d] Compaction error: %s",
+ c->column_family_data()->GetName().c_str(),
+ job_context->job_id, status.ToString().c_str());
+ error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction);
+ }
+
+ if (output_file_names != nullptr) {
+ for (const auto newf : c->edit()->GetNewFiles()) {
+ (*output_file_names)
+ .push_back(TableFileName(c->immutable_cf_options()->cf_paths,
+ newf.second.fd.GetNumber(),
+ newf.second.fd.GetPathId()));
+ }
+ }
+
+ c.reset();
+
+ bg_compaction_scheduled_--;
+ if (bg_compaction_scheduled_ == 0) {
+ bg_cv_.SignalAll();
+ }
+ MaybeScheduleFlushOrCompaction();
+ TEST_SYNC_POINT("CompactFilesImpl:End");
+
+ return status;
+}
+#endif // ROCKSDB_LITE
+
+Status DBImpl::PauseBackgroundWork() {
+ InstrumentedMutexLock guard_lock(&mutex_);
+ bg_compaction_paused_++;
+ while (bg_bottom_compaction_scheduled_ > 0 || bg_compaction_scheduled_ > 0 ||
+ bg_flush_scheduled_ > 0) {
+ bg_cv_.Wait();
+ }
+ bg_work_paused_++;
+ return Status::OK();
+}
+
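+ // PauseBackgroundWork() and ContinueBackgroundWork() are reference counted:
+ // every pause must be matched by a continue, and background work is only
+ // rescheduled once the pause count drops back to zero.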
+Status DBImpl::ContinueBackgroundWork() {
+ InstrumentedMutexLock guard_lock(&mutex_);
+ if (bg_work_paused_ == 0) {
+ return Status::InvalidArgument();
+ }
+ assert(bg_work_paused_ > 0);
+ assert(bg_compaction_paused_ > 0);
+ bg_compaction_paused_--;
+ bg_work_paused_--;
+ // It's sufficient to check just bg_work_paused_ here since
+ // bg_work_paused_ is always no greater than bg_compaction_paused_
+ if (bg_work_paused_ == 0) {
+ MaybeScheduleFlushOrCompaction();
+ }
+ return Status::OK();
+}
+
+void DBImpl::NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c,
+ const Status& st,
+ const CompactionJobStats& job_stats,
+ int job_id) {
+#ifndef ROCKSDB_LITE
+ if (immutable_db_options_.listeners.empty()) {
+ return;
+ }
+ mutex_.AssertHeld();
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return;
+ }
+ if (c->is_manual_compaction() &&
+ manual_compaction_paused_.load(std::memory_order_acquire)) {
+ return;
+ }
+ Version* current = cfd->current();
+ current->Ref();
+ // release lock while notifying events
+ mutex_.Unlock();
+ TEST_SYNC_POINT("DBImpl::NotifyOnCompactionBegin::UnlockMutex");
+ {
+ CompactionJobInfo info{};
+ info.cf_name = cfd->GetName();
+ info.status = st;
+ info.thread_id = env_->GetThreadID();
+ info.job_id = job_id;
+ info.base_input_level = c->start_level();
+ info.output_level = c->output_level();
+ info.stats = job_stats;
+ info.table_properties = c->GetOutputTableProperties();
+ info.compaction_reason = c->compaction_reason();
+ info.compression = c->output_compression();
+ for (size_t i = 0; i < c->num_input_levels(); ++i) {
+ for (const auto fmd : *c->inputs(i)) {
+ const FileDescriptor& desc = fmd->fd;
+ const uint64_t file_number = desc.GetNumber();
+ auto fn = TableFileName(c->immutable_cf_options()->cf_paths,
+ file_number, desc.GetPathId());
+ info.input_files.push_back(fn);
+ info.input_file_infos.push_back(CompactionFileInfo{
+ static_cast<int>(i), file_number, fmd->oldest_blob_file_number});
+ if (info.table_properties.count(fn) == 0) {
+ std::shared_ptr<const TableProperties> tp;
+ auto s = current->GetTableProperties(&tp, fmd, &fn);
+ if (s.ok()) {
+ info.table_properties[fn] = tp;
+ }
+ }
+ }
+ }
+ for (const auto newf : c->edit()->GetNewFiles()) {
+ const FileMetaData& meta = newf.second;
+ const FileDescriptor& desc = meta.fd;
+ const uint64_t file_number = desc.GetNumber();
+ info.output_files.push_back(TableFileName(
+ c->immutable_cf_options()->cf_paths, file_number, desc.GetPathId()));
+ info.output_file_infos.push_back(CompactionFileInfo{
+ newf.first, file_number, meta.oldest_blob_file_number});
+ }
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnCompactionBegin(this, info);
+ }
+ }
+ mutex_.Lock();
+ current->Unref();
+#else
+ (void)cfd;
+ (void)c;
+ (void)st;
+ (void)job_stats;
+ (void)job_id;
+#endif // ROCKSDB_LITE
+}
+
+void DBImpl::NotifyOnCompactionCompleted(
+ ColumnFamilyData* cfd, Compaction* c, const Status& st,
+ const CompactionJobStats& compaction_job_stats, const int job_id) {
+#ifndef ROCKSDB_LITE
+ if (immutable_db_options_.listeners.size() == 0U) {
+ return;
+ }
+ mutex_.AssertHeld();
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return;
+ }
+ if (c->is_manual_compaction() &&
+ manual_compaction_paused_.load(std::memory_order_acquire)) {
+ return;
+ }
+ Version* current = cfd->current();
+ current->Ref();
+ // release lock while notifying events
+ mutex_.Unlock();
+ TEST_SYNC_POINT("DBImpl::NotifyOnCompactionCompleted::UnlockMutex");
+ {
+ CompactionJobInfo info{};
+ BuildCompactionJobInfo(cfd, c, st, compaction_job_stats, job_id, current,
+ &info);
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnCompactionCompleted(this, info);
+ }
+ }
+ mutex_.Lock();
+ current->Unref();
+ // no need to signal bg_cv_ as it will be signaled at the end of the
+ // compaction process.
+#else
+ (void)cfd;
+ (void)c;
+ (void)st;
+ (void)compaction_job_stats;
+ (void)job_id;
+#endif // ROCKSDB_LITE
+}
+
+// REQUIREMENT: block all background work by calling PauseBackgroundWork()
+// before calling this function
+Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
+ assert(level < cfd->NumberLevels());
+ if (target_level >= cfd->NumberLevels()) {
+ return Status::InvalidArgument("Target level exceeds number of levels");
+ }
+
+ SuperVersionContext sv_context(/* create_superversion */ true);
+
+ Status status;
+
+ InstrumentedMutexLock guard_lock(&mutex_);
+
+ // only allow one thread refitting
+ if (refitting_level_) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[ReFitLevel] another thread is refitting");
+ return Status::NotSupported("another thread is refitting");
+ }
+ refitting_level_ = true;
+
+ const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+ // move to a smaller level
+ int to_level = target_level;
+ if (target_level < 0) {
+ to_level = FindMinimumEmptyLevelFitting(cfd, mutable_cf_options, level);
+ }
+
+ auto* vstorage = cfd->current()->storage_info();
+ if (to_level > level) {
+ if (level == 0) {
+ return Status::NotSupported(
+ "Cannot change from level 0 to other levels.");
+ }
+ // Check levels are empty for a trivial move
+ for (int l = level + 1; l <= to_level; l++) {
+ if (vstorage->NumLevelFiles(l) > 0) {
+ return Status::NotSupported(
+ "Levels between source and target are not empty for a move.");
+ }
+ }
+ }
+ if (to_level != level) {
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] Before refitting:\n%s", cfd->GetName().c_str(),
+ cfd->current()->DebugString().data());
+
+ VersionEdit edit;
+ edit.SetColumnFamily(cfd->GetID());
+ for (const auto& f : vstorage->LevelFiles(level)) {
+ edit.DeleteFile(level, f->fd.GetNumber());
+ edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetPathId(),
+ f->fd.GetFileSize(), f->smallest, f->largest,
+ f->fd.smallest_seqno, f->fd.largest_seqno,
+ f->marked_for_compaction, f->oldest_blob_file_number,
+ f->oldest_ancester_time, f->file_creation_time,
+ f->file_checksum, f->file_checksum_func_name);
+ }
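+ // This is a metadata-only move: the same physical files (same file numbers)
+ // are deleted from 'level' and re-added at 'to_level' in a single
+ // VersionEdit, so no file contents are rewritten.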
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] Apply version edit:\n%s", cfd->GetName().c_str(),
+ edit.DebugString().data());
+
+ status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, &mutex_,
+ directories_.GetDbDir());
+ InstallSuperVersionAndScheduleWork(cfd, &sv_context, mutable_cf_options);
+
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] LogAndApply: %s\n",
+ cfd->GetName().c_str(), status.ToString().data());
+
+ if (status.ok()) {
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] After refitting:\n%s", cfd->GetName().c_str(),
+ cfd->current()->DebugString().data());
+ }
+ }
+
+ sv_context.Clean();
+ refitting_level_ = false;
+
+ return status;
+}
+
+int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ return cfh->cfd()->NumberLevels();
+}
+
+int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* /*column_family*/) {
+ return 0;
+}
+
+int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ InstrumentedMutexLock l(&mutex_);
+ return cfh->cfd()
+ ->GetSuperVersion()
+ ->mutable_cf_options.level0_stop_writes_trigger;
+}
+
+Status DBImpl::Flush(const FlushOptions& flush_options,
+ ColumnFamilyHandle* column_family) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "[%s] Manual flush start.",
+ cfh->GetName().c_str());
+ Status s;
+ if (immutable_db_options_.atomic_flush) {
+ s = AtomicFlushMemTables({cfh->cfd()}, flush_options,
+ FlushReason::kManualFlush);
+ } else {
+ s = FlushMemTable(cfh->cfd(), flush_options, FlushReason::kManualFlush);
+ }
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[%s] Manual flush finished, status: %s\n",
+ cfh->GetName().c_str(), s.ToString().c_str());
+ return s;
+}
+
+Status DBImpl::Flush(const FlushOptions& flush_options,
+ const std::vector<ColumnFamilyHandle*>& column_families) {
+ Status s;
+ if (!immutable_db_options_.atomic_flush) {
+ for (auto cfh : column_families) {
+ s = Flush(flush_options, cfh);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ } else {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Manual atomic flush start.\n"
+ "=====Column families:=====");
+ for (auto cfh : column_families) {
+ auto cfhi = static_cast<ColumnFamilyHandleImpl*>(cfh);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s",
+ cfhi->GetName().c_str());
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "=====End of column families list=====");
+ autovector<ColumnFamilyData*> cfds;
+ std::for_each(column_families.begin(), column_families.end(),
+ [&cfds](ColumnFamilyHandle* elem) {
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(elem);
+ cfds.emplace_back(cfh->cfd());
+ });
+ s = AtomicFlushMemTables(cfds, flush_options, FlushReason::kManualFlush);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Manual atomic flush finished, status: %s\n"
+ "=====Column families:=====",
+ s.ToString().c_str());
+ for (auto cfh : column_families) {
+ auto cfhi = static_cast<ColumnFamilyHandleImpl*>(cfh);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s",
+ cfhi->GetName().c_str());
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "=====End of column families list=====");
+ }
+ return s;
+}
+
+Status DBImpl::RunManualCompaction(
+ ColumnFamilyData* cfd, int input_level, int output_level,
+ const CompactRangeOptions& compact_range_options, const Slice* begin,
+ const Slice* end, bool exclusive, bool disallow_trivial_move,
+ uint64_t max_file_num_to_ignore) {
+ assert(input_level == ColumnFamilyData::kCompactAllLevels ||
+ input_level >= 0);
+
+ InternalKey begin_storage, end_storage;
+ CompactionArg* ca;
+
+ bool scheduled = false;
+ bool manual_conflict = false;
+ ManualCompactionState manual;
+ manual.cfd = cfd;
+ manual.input_level = input_level;
+ manual.output_level = output_level;
+ manual.output_path_id = compact_range_options.target_path_id;
+ manual.done = false;
+ manual.in_progress = false;
+ manual.incomplete = false;
+ manual.exclusive = exclusive;
+ manual.disallow_trivial_move = disallow_trivial_move;
+ // For universal compaction, we enforce every manual compaction to compact
+ // all files.
+ if (begin == nullptr ||
+ cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+ cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+ manual.begin = nullptr;
+ } else {
+ begin_storage.SetMinPossibleForUserKey(*begin);
+ manual.begin = &begin_storage;
+ }
+ if (end == nullptr ||
+ cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+ cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+ manual.end = nullptr;
+ } else {
+ end_storage.SetMaxPossibleForUserKey(*end);
+ manual.end = &end_storage;
+ }
+
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction:0");
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction:1");
+ InstrumentedMutexLock l(&mutex_);
+
+ // When a manual compaction arrives, temporarily disable scheduling of
+ // non-manual compactions and wait until the number of scheduled compaction
+ // jobs drops to zero. This is needed to ensure that this manual compaction
+ // can compact any range of keys/files.
+ //
+ // HasPendingManualCompaction() is true when at least one thread is inside
+ // RunManualCompaction(), i.e. during that time no other compaction will
+ // get scheduled (see MaybeScheduleFlushOrCompaction).
+ //
+ // Note that the following loop doesn't stop more than one thread calling
+ // RunManualCompaction() from getting to the second while loop below.
+ // However, only one of them will actually schedule compaction, while
+ // others will wait on a condition variable until it completes.
+
+ AddManualCompaction(&manual);
+ TEST_SYNC_POINT_CALLBACK("DBImpl::RunManualCompaction:NotScheduled", &mutex_);
+ if (exclusive) {
+ while (bg_bottom_compaction_scheduled_ > 0 ||
+ bg_compaction_scheduled_ > 0) {
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction:WaitScheduled");
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "[%s] Manual compaction waiting for all other scheduled background "
+ "compactions to finish",
+ cfd->GetName().c_str());
+ bg_cv_.Wait();
+ }
+ }
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[%s] Manual compaction starting", cfd->GetName().c_str());
+
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+ immutable_db_options_.info_log.get());
+ // We don't check bg_error_ here, because if we get the error in compaction,
+ // the compaction will set manual.status to bg_error_ and set manual.done to
+ // true.
+ while (!manual.done) {
+ assert(HasPendingManualCompaction());
+ manual_conflict = false;
+ Compaction* compaction = nullptr;
+ if (ShouldntRunManualCompaction(&manual) || (manual.in_progress == true) ||
+ scheduled ||
+ (((manual.manual_end = &manual.tmp_storage1) != nullptr) &&
+ ((compaction = manual.cfd->CompactRange(
+ *manual.cfd->GetLatestMutableCFOptions(), manual.input_level,
+ manual.output_level, compact_range_options, manual.begin,
+ manual.end, &manual.manual_end, &manual_conflict,
+ max_file_num_to_ignore)) == nullptr &&
+ manual_conflict))) {
+ // exclusive manual compactions should not see a conflict during
+ // CompactRange
+ assert(!exclusive || !manual_conflict);
+ // Running either this or some other manual compaction
+ bg_cv_.Wait();
+ if (scheduled && manual.incomplete == true) {
+ assert(!manual.in_progress);
+ scheduled = false;
+ manual.incomplete = false;
+ }
+ } else if (!scheduled) {
+ if (compaction == nullptr) {
+ manual.done = true;
+ bg_cv_.SignalAll();
+ continue;
+ }
+ ca = new CompactionArg;
+ ca->db = this;
+ ca->prepicked_compaction = new PrepickedCompaction;
+ ca->prepicked_compaction->manual_compaction_state = &manual;
+ ca->prepicked_compaction->compaction = compaction;
+ if (!RequestCompactionToken(
+ cfd, true, &ca->prepicked_compaction->task_token, &log_buffer)) {
+ // Don't throttle manual compaction, only count outstanding tasks.
+ assert(false);
+ }
+ manual.incomplete = false;
+ bg_compaction_scheduled_++;
+ env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this,
+ &DBImpl::UnscheduleCompactionCallback);
+ scheduled = true;
+ }
+ }
+
+ log_buffer.FlushBufferToLog();
+ assert(!manual.in_progress);
+ assert(HasPendingManualCompaction());
+ RemoveManualCompaction(&manual);
+ bg_cv_.SignalAll();
+ return manual.status;
+}
+
+void DBImpl::GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds,
+ FlushRequest* req) {
+ assert(req != nullptr);
+ req->reserve(cfds.size());
+ for (const auto cfd : cfds) {
+ if (nullptr == cfd) {
+ // cfd may be null, see DBImpl::ScheduleFlushes
+ continue;
+ }
+ uint64_t max_memtable_id = cfd->imm()->GetLatestMemTableID();
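+ // Record the newest immutable memtable ID at this point; the scheduled
+ // flush is expected to cover all immutable memtables with IDs up to and
+ // including this one (see AtomicFlushMemTablesToOutputFiles above).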
+ req->emplace_back(cfd, max_memtable_id);
+ }
+}
+
+Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
+ const FlushOptions& flush_options,
+ FlushReason flush_reason, bool writes_stopped) {
+ Status s;
+ uint64_t flush_memtable_id = 0;
+ if (!flush_options.allow_write_stall) {
+ bool flush_needed = true;
+ s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed);
+ TEST_SYNC_POINT("DBImpl::FlushMemTable:StallWaitDone");
+ if (!s.ok() || !flush_needed) {
+ return s;
+ }
+ }
+ FlushRequest flush_req;
+ {
+ WriteContext context;
+ InstrumentedMutexLock guard_lock(&mutex_);
+
+ WriteThread::Writer w;
+ WriteThread::Writer nonmem_w;
+ if (!writes_stopped) {
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+ }
+ WaitForPendingWrites();
+
+ if (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) {
+ s = SwitchMemtable(cfd, &context);
+ }
+ if (s.ok()) {
+ if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
+ !cached_recoverable_state_empty_.load()) {
+ flush_memtable_id = cfd->imm()->GetLatestMemTableID();
+ flush_req.emplace_back(cfd, flush_memtable_id);
+ }
+ if (immutable_db_options_.persist_stats_to_disk) {
+ ColumnFamilyData* cfd_stats =
+ versions_->GetColumnFamilySet()->GetColumnFamily(
+ kPersistentStatsColumnFamilyName);
+ if (cfd_stats != nullptr && cfd_stats != cfd &&
+ !cfd_stats->mem()->IsEmpty()) {
+ // only force flush stats CF when it will be the only CF lagging
+ // behind after the current flush
+ bool stats_cf_flush_needed = true;
+ for (auto* loop_cfd : *versions_->GetColumnFamilySet()) {
+ if (loop_cfd == cfd_stats || loop_cfd == cfd) {
+ continue;
+ }
+ if (loop_cfd->GetLogNumber() <= cfd_stats->GetLogNumber()) {
+ stats_cf_flush_needed = false;
+ }
+ }
+ if (stats_cf_flush_needed) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Force flushing stats CF with manual flush of %s "
+ "to avoid holding old logs",
+ cfd->GetName().c_str());
+ s = SwitchMemtable(cfd_stats, &context);
+ flush_memtable_id = cfd_stats->imm()->GetLatestMemTableID();
+ flush_req.emplace_back(cfd_stats, flush_memtable_id);
+ }
+ }
+ }
+ }
+
+ if (s.ok() && !flush_req.empty()) {
+ for (auto& elem : flush_req) {
+ ColumnFamilyData* loop_cfd = elem.first;
+ loop_cfd->imm()->FlushRequested();
+ }
+ // If the caller wants to wait for this flush to complete, it indicates
+ // that the caller expects the ColumnFamilyData not to be freed by
+ // other threads which may drop the column family concurrently.
+ // Therefore, we increase the cfd's ref count.
+ if (flush_options.wait) {
+ for (auto& elem : flush_req) {
+ ColumnFamilyData* loop_cfd = elem.first;
+ loop_cfd->Ref();
+ }
+ }
+ SchedulePendingFlush(flush_req, flush_reason);
+ MaybeScheduleFlushOrCompaction();
+ }
+
+ if (!writes_stopped) {
+ write_thread_.ExitUnbatched(&w);
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::FlushMemTable:AfterScheduleFlush");
+ TEST_SYNC_POINT("DBImpl::FlushMemTable:BeforeWaitForBgFlush");
+ if (s.ok() && flush_options.wait) {
+ autovector<ColumnFamilyData*> cfds;
+ autovector<const uint64_t*> flush_memtable_ids;
+ for (auto& iter : flush_req) {
+ cfds.push_back(iter.first);
+ flush_memtable_ids.push_back(&(iter.second));
+ }
+ s = WaitForFlushMemTables(cfds, flush_memtable_ids,
+ (flush_reason == FlushReason::kErrorRecovery));
+ InstrumentedMutexLock lock_guard(&mutex_);
+ for (auto* tmp_cfd : cfds) {
+ tmp_cfd->UnrefAndTryDelete();
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::FlushMemTable:FlushMemTableFinished");
+ return s;
+}
+
+// Flush all elements in 'column_family_datas'
+// and atomically record the result to the MANIFEST.
+Status DBImpl::AtomicFlushMemTables(
+ const autovector<ColumnFamilyData*>& column_family_datas,
+ const FlushOptions& flush_options, FlushReason flush_reason,
+ bool writes_stopped) {
+ Status s;
+ if (!flush_options.allow_write_stall) {
+ int num_cfs_to_flush = 0;
+ for (auto cfd : column_family_datas) {
+ bool flush_needed = true;
+ s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed);
+ if (!s.ok()) {
+ return s;
+ } else if (flush_needed) {
+ ++num_cfs_to_flush;
+ }
+ }
+ if (0 == num_cfs_to_flush) {
+ return s;
+ }
+ }
+ FlushRequest flush_req;
+ autovector<ColumnFamilyData*> cfds;
+ {
+ WriteContext context;
+ InstrumentedMutexLock guard_lock(&mutex_);
+
+ WriteThread::Writer w;
+ WriteThread::Writer nonmem_w;
+ if (!writes_stopped) {
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+ }
+ WaitForPendingWrites();
+
+ for (auto cfd : column_family_datas) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
+ !cached_recoverable_state_empty_.load()) {
+ cfds.emplace_back(cfd);
+ }
+ }
+ for (auto cfd : cfds) {
+ if (cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load()) {
+ continue;
+ }
+ cfd->Ref();
+ s = SwitchMemtable(cfd, &context);
+ cfd->UnrefAndTryDelete();
+ if (!s.ok()) {
+ break;
+ }
+ }
+ if (s.ok()) {
+ AssignAtomicFlushSeq(cfds);
+ for (auto cfd : cfds) {
+ cfd->imm()->FlushRequested();
+ }
+ // If the caller wants to wait for this flush to complete, it indicates
+ // that the caller expects the ColumnFamilyData not to be freed by
+ // other threads which may drop the column family concurrently.
+ // Therefore, we increase the cfd's ref count.
+ if (flush_options.wait) {
+ for (auto cfd : cfds) {
+ cfd->Ref();
+ }
+ }
+ GenerateFlushRequest(cfds, &flush_req);
+ SchedulePendingFlush(flush_req, flush_reason);
+ MaybeScheduleFlushOrCompaction();
+ }
+
+ if (!writes_stopped) {
+ write_thread_.ExitUnbatched(&w);
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:AfterScheduleFlush");
+ TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:BeforeWaitForBgFlush");
+ if (s.ok() && flush_options.wait) {
+ autovector<const uint64_t*> flush_memtable_ids;
+ for (auto& iter : flush_req) {
+ flush_memtable_ids.push_back(&(iter.second));
+ }
+ s = WaitForFlushMemTables(cfds, flush_memtable_ids,
+ (flush_reason == FlushReason::kErrorRecovery));
+ InstrumentedMutexLock lock_guard(&mutex_);
+ for (auto* cfd : cfds) {
+ cfd->UnrefAndTryDelete();
+ }
+ }
+ return s;
+}
+
+// Calling FlushMemTable(), whether from DB::Flush() or from Backup Engine, can
+// cause a write stall, for example if one memtable is being flushed already.
+// This method tries to avoid a write stall (similar to CompactRange()
+// behavior): it emulates how the SuperVersion / LSM would change if the flush
+// happened, checks it against various constraints and delays the flush if it
+// would cause a write stall.
+// The caller should check status and flush_needed to see if the flush already
+// happened.
+Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
+ bool* flush_needed) {
+ {
+ *flush_needed = true;
+ InstrumentedMutexLock l(&mutex_);
+ uint64_t orig_active_memtable_id = cfd->mem()->GetID();
+ WriteStallCondition write_stall_condition = WriteStallCondition::kNormal;
+ do {
+ if (write_stall_condition != WriteStallCondition::kNormal) {
+ // Same error handling as user writes: Don't wait if there's a
+ // background error, even if it's a soft error. We might wait here
+ // indefinitely as the pending flushes/compactions may never finish
+ // successfully, resulting in the stall condition lasting indefinitely
+ if (error_handler_.IsBGWorkStopped()) {
+ return error_handler_.GetBGError();
+ }
+
+ TEST_SYNC_POINT("DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait");
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[%s] WaitUntilFlushWouldNotStallWrites"
+ " waiting on stall conditions to clear",
+ cfd->GetName().c_str());
+ bg_cv_.Wait();
+ }
+ if (cfd->IsDropped()) {
+ return Status::ColumnFamilyDropped();
+ }
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return Status::ShutdownInProgress();
+ }
+
+ uint64_t earliest_memtable_id =
+ std::min(cfd->mem()->GetID(), cfd->imm()->GetEarliestMemTableID());
+ if (earliest_memtable_id > orig_active_memtable_id) {
+ // We waited so long that the memtable we were originally waiting on was
+ // flushed.
+ *flush_needed = false;
+ return Status::OK();
+ }
+
+ const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+ const auto* vstorage = cfd->current()->storage_info();
+
+ // Skip stalling check if we're below auto-flush and auto-compaction
+ // triggers. If it stalled in these conditions, that'd mean the stall
+ // triggers are so low that stalling is needed for any background work. In
+ // that case we shouldn't wait since background work won't be scheduled.
+ if (cfd->imm()->NumNotFlushed() <
+ cfd->ioptions()->min_write_buffer_number_to_merge &&
+ vstorage->l0_delay_trigger_count() <
+ mutable_cf_options.level0_file_num_compaction_trigger) {
+ break;
+ }
+
+ // check whether one extra immutable memtable or an extra L0 file would
+ // cause write stalling mode to be entered. It could still enter stall
+ // mode due to pending compaction bytes, but that's less common
+ write_stall_condition =
+ ColumnFamilyData::GetWriteStallConditionAndCause(
+ cfd->imm()->NumNotFlushed() + 1,
+ vstorage->l0_delay_trigger_count() + 1,
+ vstorage->estimated_compaction_needed_bytes(), mutable_cf_options)
+ .first;
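+ // Illustrative example with hypothetical numbers: if
+ // level0_slowdown_writes_trigger is 20 and the current L0 delay trigger
+ // count is 19, the hypothetical extra L0 file reaches the slowdown
+ // threshold, the condition is not kNormal, and this loop keeps waiting.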
+ } while (write_stall_condition != WriteStallCondition::kNormal);
+ }
+ return Status::OK();
+}
+
+// Wait for memtables to be flushed for multiple column families.
+// let N = cfds.size()
+// for i in [0, N),
+// 1) if flush_memtable_ids[i] is not null, then the memtables with lower IDs
+// have to be flushed for THIS column family;
+// 2) if flush_memtable_ids[i] is null, then all memtables in THIS column
+// family have to be flushed.
+// Finish waiting when ALL column families finish flushing memtables.
+// resuming_from_bg_err indicates whether the caller is trying to resume from
+// background error or in normal processing.
+Status DBImpl::WaitForFlushMemTables(
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const uint64_t*>& flush_memtable_ids,
+ bool resuming_from_bg_err) {
+ int num = static_cast<int>(cfds.size());
+ // Wait until the flushes complete
+ InstrumentedMutexLock l(&mutex_);
+ // If the caller is trying to resume from bg error, then
+ // error_handler_.IsDBStopped() is true.
+ while (resuming_from_bg_err || !error_handler_.IsDBStopped()) {
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return Status::ShutdownInProgress();
+ }
+ // If an error has occurred during resumption, then no need to wait.
+ if (!error_handler_.GetRecoveryError().ok()) {
+ break;
+ }
+ // Number of column families that have been dropped.
+ int num_dropped = 0;
+ // Number of column families that have finished flush.
+ int num_finished = 0;
+ for (int i = 0; i < num; ++i) {
+ if (cfds[i]->IsDropped()) {
+ ++num_dropped;
+ } else if (cfds[i]->imm()->NumNotFlushed() == 0 ||
+ (flush_memtable_ids[i] != nullptr &&
+ cfds[i]->imm()->GetEarliestMemTableID() >
+ *flush_memtable_ids[i])) {
+ ++num_finished;
+ }
+ }
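+ // Example: if flush_memtable_ids[i] points to 7 and the earliest
+ // not-yet-flushed immutable memtable of cfds[i] now has ID 8, then every
+ // memtable with ID <= 7 has been flushed, so this column family counts as
+ // finished.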
+ if (1 == num_dropped && 1 == num) {
+ return Status::InvalidArgument("Cannot flush a dropped CF");
+ }
+ // Column families involved in this flush request have either been dropped
+ // or finished flush. Then it's time to finish waiting.
+ if (num_dropped + num_finished == num) {
+ break;
+ }
+ bg_cv_.Wait();
+ }
+ Status s;
+ // If not resuming from bg error, and an error has caused the DB to stop,
+ // then report the bg error to caller.
+ if (!resuming_from_bg_err && error_handler_.IsDBStopped()) {
+ s = error_handler_.GetBGError();
+ }
+ return s;
+}
+
+Status DBImpl::EnableAutoCompaction(
+ const std::vector<ColumnFamilyHandle*>& column_family_handles) {
+ Status s;
+ for (auto cf_ptr : column_family_handles) {
+ Status status =
+ this->SetOptions(cf_ptr, {{"disable_auto_compactions", "false"}});
+ if (!status.ok()) {
+ s = status;
+ }
+ }
+
+ return s;
+}
+
+void DBImpl::DisableManualCompaction() {
+ manual_compaction_paused_.store(true, std::memory_order_release);
+}
+
+void DBImpl::EnableManualCompaction() {
+ manual_compaction_paused_.store(false, std::memory_order_release);
+}
+
+void DBImpl::MaybeScheduleFlushOrCompaction() {
+ mutex_.AssertHeld();
+ if (!opened_successfully_) {
+ // Compaction may introduce a data race with DB open
+ return;
+ }
+ if (bg_work_paused_ > 0) {
+ // we paused the background work
+ return;
+ } else if (error_handler_.IsBGWorkStopped() &&
+ !error_handler_.IsRecoveryInProgress()) {
+ // There has been a hard error and this call is not part of the recovery
+ // sequence. Bail out here so we don't get into an endless loop of
+ // scheduling BG work which will again call this function
+ return;
+ } else if (shutting_down_.load(std::memory_order_acquire)) {
+ // DB is being deleted; no more background compactions
+ return;
+ }
+ auto bg_job_limits = GetBGJobLimits();
+ bool is_flush_pool_empty =
+ env_->GetBackgroundThreads(Env::Priority::HIGH) == 0;
+ while (!is_flush_pool_empty && unscheduled_flushes_ > 0 &&
+ bg_flush_scheduled_ < bg_job_limits.max_flushes) {
+ bg_flush_scheduled_++;
+ FlushThreadArg* fta = new FlushThreadArg;
+ fta->db_ = this;
+ fta->thread_pri_ = Env::Priority::HIGH;
+ env_->Schedule(&DBImpl::BGWorkFlush, fta, Env::Priority::HIGH, this,
+ &DBImpl::UnscheduleFlushCallback);
+ --unscheduled_flushes_;
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::MaybeScheduleFlushOrCompaction:AfterSchedule:0",
+ &unscheduled_flushes_);
+ }
+
+ // special case -- if high-pri (flush) thread pool is empty, then schedule
+ // flushes in low-pri (compaction) thread pool.
+ if (is_flush_pool_empty) {
+ while (unscheduled_flushes_ > 0 &&
+ bg_flush_scheduled_ + bg_compaction_scheduled_ <
+ bg_job_limits.max_flushes) {
+ bg_flush_scheduled_++;
+ FlushThreadArg* fta = new FlushThreadArg;
+ fta->db_ = this;
+ fta->thread_pri_ = Env::Priority::LOW;
+ env_->Schedule(&DBImpl::BGWorkFlush, fta, Env::Priority::LOW, this,
+ &DBImpl::UnscheduleFlushCallback);
+ --unscheduled_flushes_;
+ }
+ }
+
+ if (bg_compaction_paused_ > 0) {
+ // we paused the background compaction
+ return;
+ } else if (error_handler_.IsBGWorkStopped()) {
+ // Compaction is not part of the recovery sequence from a hard error. We
+ // might get here because recovery might do a flush and install a new
+ // super version, which will try to schedule pending compactions. Bail
+ // out here and let the higher level recovery handle compactions
+ return;
+ }
+
+ if (HasExclusiveManualCompaction()) {
+ // only manual compactions are allowed to run. don't schedule automatic
+ // compactions
+ TEST_SYNC_POINT("DBImpl::MaybeScheduleFlushOrCompaction:Conflict");
+ return;
+ }
+
+ while (bg_compaction_scheduled_ < bg_job_limits.max_compactions &&
+ unscheduled_compactions_ > 0) {
+ CompactionArg* ca = new CompactionArg;
+ ca->db = this;
+ ca->prepicked_compaction = nullptr;
+ bg_compaction_scheduled_++;
+ unscheduled_compactions_--;
+ env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this,
+ &DBImpl::UnscheduleCompactionCallback);
+ }
+}
+
+DBImpl::BGJobLimits DBImpl::GetBGJobLimits() const {
+ mutex_.AssertHeld();
+ return GetBGJobLimits(immutable_db_options_.max_background_flushes,
+ mutable_db_options_.max_background_compactions,
+ mutable_db_options_.max_background_jobs,
+ write_controller_.NeedSpeedupCompaction());
+}
+
+DBImpl::BGJobLimits DBImpl::GetBGJobLimits(int max_background_flushes,
+ int max_background_compactions,
+ int max_background_jobs,
+ bool parallelize_compactions) {
+ BGJobLimits res;
+ if (max_background_flushes == -1 && max_background_compactions == -1) {
+ // for our first stab implementing max_background_jobs, simply allocate a
+ // quarter of the threads to flushes.
+ res.max_flushes = std::max(1, max_background_jobs / 4);
+ res.max_compactions = std::max(1, max_background_jobs - res.max_flushes);
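+ // For example, max_background_jobs == 8 yields max_flushes == 2 and
+ // max_compactions == 6 (before the parallelize_compactions adjustment
+ // below).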
+ } else {
+ // compatibility code in case users haven't migrated to max_background_jobs,
+ // which automatically computes flush/compaction limits
+ res.max_flushes = std::max(1, max_background_flushes);
+ res.max_compactions = std::max(1, max_background_compactions);
+ }
+ if (!parallelize_compactions) {
+ // throttle background compactions until we deem it necessary
+ res.max_compactions = 1;
+ }
+ return res;
+}
+
+void DBImpl::AddToCompactionQueue(ColumnFamilyData* cfd) {
+ assert(!cfd->queued_for_compaction());
+ cfd->Ref();
+ compaction_queue_.push_back(cfd);
+ cfd->set_queued_for_compaction(true);
+}
+
+ColumnFamilyData* DBImpl::PopFirstFromCompactionQueue() {
+ assert(!compaction_queue_.empty());
+ auto cfd = *compaction_queue_.begin();
+ compaction_queue_.pop_front();
+ assert(cfd->queued_for_compaction());
+ cfd->set_queued_for_compaction(false);
+ return cfd;
+}
+
+DBImpl::FlushRequest DBImpl::PopFirstFromFlushQueue() {
+ assert(!flush_queue_.empty());
+ FlushRequest flush_req = flush_queue_.front();
+ flush_queue_.pop_front();
+ // TODO: need to unset flush reason?
+ return flush_req;
+}
+
+ColumnFamilyData* DBImpl::PickCompactionFromQueue(
+ std::unique_ptr<TaskLimiterToken>* token, LogBuffer* log_buffer) {
+ assert(!compaction_queue_.empty());
+ assert(*token == nullptr);
+ autovector<ColumnFamilyData*> throttled_candidates;
+ ColumnFamilyData* cfd = nullptr;
+ while (!compaction_queue_.empty()) {
+ auto first_cfd = *compaction_queue_.begin();
+ compaction_queue_.pop_front();
+ assert(first_cfd->queued_for_compaction());
+ if (!RequestCompactionToken(first_cfd, false, token, log_buffer)) {
+ throttled_candidates.push_back(first_cfd);
+ continue;
+ }
+ cfd = first_cfd;
+ cfd->set_queued_for_compaction(false);
+ break;
+ }
+ // Add throttled compaction candidates back to queue in the original order.
+ for (auto iter = throttled_candidates.rbegin();
+ iter != throttled_candidates.rend(); ++iter) {
+ compaction_queue_.push_front(*iter);
+ }
+ return cfd;
+}
+
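+// A FlushRequest may cover several column families (e.g. atomic flush). Each
+// CFD is referenced here and unreferenced again in BackgroundFlush() once the
+// request has been picked up, whether or not it actually gets flushed.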
+void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req,
+ FlushReason flush_reason) {
+ if (flush_req.empty()) {
+ return;
+ }
+ for (auto& iter : flush_req) {
+ ColumnFamilyData* cfd = iter.first;
+ cfd->Ref();
+ cfd->SetFlushReason(flush_reason);
+ }
+ ++unscheduled_flushes_;
+ flush_queue_.push_back(flush_req);
+}
+
+void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) {
+ if (!cfd->queued_for_compaction() && cfd->NeedsCompaction()) {
+ AddToCompactionQueue(cfd);
+ ++unscheduled_compactions_;
+ }
+}
+
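+// Queues a file for deferred deletion by the purge thread, keyed by file
+// number so that ShouldPurge() can skip it during a concurrent full scan in
+// FindObsoleteFiles().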
+void DBImpl::SchedulePendingPurge(std::string fname, std::string dir_to_sync,
+ FileType type, uint64_t number, int job_id) {
+ mutex_.AssertHeld();
+ PurgeFileInfo file_info(fname, dir_to_sync, type, number, job_id);
+ purge_files_.insert({{number, std::move(file_info)}});
+}
+
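+// The BGWork*/Unschedule* pairs below share the same ownership contract: the
+// argument is heap-allocated at scheduling time, and exactly one of the work
+// function or the unschedule callback runs and deletes it.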
+void DBImpl::BGWorkFlush(void* arg) {
+ FlushThreadArg fta = *(reinterpret_cast<FlushThreadArg*>(arg));
+ delete reinterpret_cast<FlushThreadArg*>(arg);
+
+ IOSTATS_SET_THREAD_POOL_ID(fta.thread_pri_);
+ TEST_SYNC_POINT("DBImpl::BGWorkFlush");
+ static_cast_with_check<DBImpl, DB>(fta.db_)->BackgroundCallFlush(
+ fta.thread_pri_);
+ TEST_SYNC_POINT("DBImpl::BGWorkFlush:done");
+}
+
+void DBImpl::BGWorkCompaction(void* arg) {
+ CompactionArg ca = *(reinterpret_cast<CompactionArg*>(arg));
+ delete reinterpret_cast<CompactionArg*>(arg);
+ IOSTATS_SET_THREAD_POOL_ID(Env::Priority::LOW);
+ TEST_SYNC_POINT("DBImpl::BGWorkCompaction");
+ auto prepicked_compaction =
+ static_cast<PrepickedCompaction*>(ca.prepicked_compaction);
+ static_cast_with_check<DBImpl, DB>(ca.db)->BackgroundCallCompaction(
+ prepicked_compaction, Env::Priority::LOW);
+ delete prepicked_compaction;
+}
+
+void DBImpl::BGWorkBottomCompaction(void* arg) {
+ CompactionArg ca = *(static_cast<CompactionArg*>(arg));
+ delete static_cast<CompactionArg*>(arg);
+ IOSTATS_SET_THREAD_POOL_ID(Env::Priority::BOTTOM);
+ TEST_SYNC_POINT("DBImpl::BGWorkBottomCompaction");
+ auto* prepicked_compaction = ca.prepicked_compaction;
+ assert(prepicked_compaction && prepicked_compaction->compaction &&
+ !prepicked_compaction->manual_compaction_state);
+ ca.db->BackgroundCallCompaction(prepicked_compaction, Env::Priority::BOTTOM);
+ delete prepicked_compaction;
+}
+
+void DBImpl::BGWorkPurge(void* db) {
+ IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);
+ TEST_SYNC_POINT("DBImpl::BGWorkPurge:start");
+ reinterpret_cast<DBImpl*>(db)->BackgroundCallPurge();
+ TEST_SYNC_POINT("DBImpl::BGWorkPurge:end");
+}
+
+void DBImpl::UnscheduleCompactionCallback(void* arg) {
+ CompactionArg ca = *(reinterpret_cast<CompactionArg*>(arg));
+ delete reinterpret_cast<CompactionArg*>(arg);
+ if (ca.prepicked_compaction != nullptr) {
+ if (ca.prepicked_compaction->compaction != nullptr) {
+ delete ca.prepicked_compaction->compaction;
+ }
+ delete ca.prepicked_compaction;
+ }
+ TEST_SYNC_POINT("DBImpl::UnscheduleCompactionCallback");
+}
+
+void DBImpl::UnscheduleFlushCallback(void* arg) {
+ delete reinterpret_cast<FlushThreadArg*>(arg);
+ TEST_SYNC_POINT("DBImpl::UnscheduleFlushCallback");
+}
+
+Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
+ LogBuffer* log_buffer, FlushReason* reason,
+ Env::Priority thread_pri) {
+ mutex_.AssertHeld();
+
+ Status status;
+ *reason = FlushReason::kOthers;
+ // If BG work is stopped due to an error, but a recovery is in progress,
+ // that means this flush is part of the recovery. So allow it to go through
+ if (!error_handler_.IsBGWorkStopped()) {
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ status = Status::ShutdownInProgress();
+ }
+ } else if (!error_handler_.IsRecoveryInProgress()) {
+ status = error_handler_.GetBGError();
+ }
+
+ if (!status.ok()) {
+ return status;
+ }
+
+ autovector<BGFlushArg> bg_flush_args;
+ std::vector<SuperVersionContext>& superversion_contexts =
+ job_context->superversion_contexts;
+ autovector<ColumnFamilyData*> column_families_not_to_flush;
+ while (!flush_queue_.empty()) {
+ // This cfd is already referenced
+ const FlushRequest& flush_req = PopFirstFromFlushQueue();
+ superversion_contexts.clear();
+ superversion_contexts.reserve(flush_req.size());
+
+ for (const auto& iter : flush_req) {
+ ColumnFamilyData* cfd = iter.first;
+ if (cfd->IsDropped() || !cfd->imm()->IsFlushPending()) {
+ // can't flush this CF, try next one
+ column_families_not_to_flush.push_back(cfd);
+ continue;
+ }
+ superversion_contexts.emplace_back(SuperVersionContext(true));
+ bg_flush_args.emplace_back(cfd, iter.second,
+ &(superversion_contexts.back()));
+ }
+ if (!bg_flush_args.empty()) {
+ break;
+ }
+ }
+
+ if (!bg_flush_args.empty()) {
+ auto bg_job_limits = GetBGJobLimits();
+ for (const auto& arg : bg_flush_args) {
+ ColumnFamilyData* cfd = arg.cfd_;
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "Calling FlushMemTableToOutputFile with column "
+ "family [%s], flush slots available %d, compaction slots available "
+ "%d, "
+ "flush slots scheduled %d, compaction slots scheduled %d",
+ cfd->GetName().c_str(), bg_job_limits.max_flushes,
+ bg_job_limits.max_compactions, bg_flush_scheduled_,
+ bg_compaction_scheduled_);
+ }
+ status = FlushMemTablesToOutputFiles(bg_flush_args, made_progress,
+ job_context, log_buffer, thread_pri);
+ TEST_SYNC_POINT("DBImpl::BackgroundFlush:BeforeFlush");
+ // All the CFDs in the FlushReq must have the same flush reason, so just
+ // grab the first one
+ *reason = bg_flush_args[0].cfd_->GetFlushReason();
+ for (auto& arg : bg_flush_args) {
+ ColumnFamilyData* cfd = arg.cfd_;
+ if (cfd->UnrefAndTryDelete()) {
+ arg.cfd_ = nullptr;
+ }
+ }
+ }
+ for (auto cfd : column_families_not_to_flush) {
+ cfd->UnrefAndTryDelete();
+ }
+ return status;
+}
+
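+// Thread-pool entry point for one flush attempt. On a failure that is not a
+// shutdown, a dropped column family, or error-recovery work, it signals
+// waiters, logs the error, and sleeps for one second so that a persistent
+// environmental problem does not turn into a hot retry loop.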
+void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) {
+ bool made_progress = false;
+ JobContext job_context(next_job_id_.fetch_add(1), true);
+
+ TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:start");
+
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+ immutable_db_options_.info_log.get());
+ {
+ InstrumentedMutexLock l(&mutex_);
+ assert(bg_flush_scheduled_);
+ num_running_flushes_++;
+
+ std::unique_ptr<std::list<uint64_t>::iterator>
+ pending_outputs_inserted_elem(new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
+ FlushReason reason;
+
+ Status s = BackgroundFlush(&made_progress, &job_context, &log_buffer,
+ &reason, thread_pri);
+ if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped() &&
+ reason != FlushReason::kErrorRecovery) {
+ // Wait a little bit before retrying background flush in
+ // case this is an environmental problem and we do not want to
+ // chew up resources for failed flushes for the duration of
+ // the problem.
+ uint64_t error_cnt =
+ default_cf_internal_stats_->BumpAndGetBackgroundErrorCount();
+ bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
+ mutex_.Unlock();
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                    "Waiting after background flush error: %s, "
+ "Accumulated background error counts: %" PRIu64,
+ s.ToString().c_str(), error_cnt);
+ log_buffer.FlushBufferToLog();
+ LogFlush(immutable_db_options_.info_log);
+ env_->SleepForMicroseconds(1000000);
+ mutex_.Lock();
+ }
+
+ TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FlushFinish:0");
+ ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+ // If flush failed, we want to delete all temporary files that we might have
+ // created. Thus, we force full scan in FindObsoleteFiles()
+ FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() &&
+ !s.IsColumnFamilyDropped());
+ // delete unnecessary files if any, this is done outside the mutex
+ if (job_context.HaveSomethingToClean() ||
+ job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+ mutex_.Unlock();
+ TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FilesFound");
+ // Have to flush the info logs before bg_flush_scheduled_--
+ // because if bg_flush_scheduled_ becomes 0 and the lock is
+      // released, the destructor of DB can kick in and destroy all the
+      // state of DB, so info_log might not be available after that point.
+      // The same applies to accessing any other state that DB owns.
+ log_buffer.FlushBufferToLog();
+ if (job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+ mutex_.Lock();
+ }
+ TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:ContextCleanedUp");
+
+ assert(num_running_flushes_ > 0);
+ num_running_flushes_--;
+ bg_flush_scheduled_--;
+ // See if there's more work to be done
+ MaybeScheduleFlushOrCompaction();
+ atomic_flush_install_cv_.SignalAll();
+ bg_cv_.SignalAll();
+ // IMPORTANT: there should be no code after calling SignalAll. This call may
+ // signal the DB destructor that it's OK to proceed with destruction. In
+    // that case, all DB variables will be deallocated and referencing them
+ // will cause trouble.
+ }
+}
+
+void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
+ Env::Priority bg_thread_pri) {
+ bool made_progress = false;
+ JobContext job_context(next_job_id_.fetch_add(1), true);
+ TEST_SYNC_POINT("BackgroundCallCompaction:0");
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+ immutable_db_options_.info_log.get());
+ {
+ InstrumentedMutexLock l(&mutex_);
+
+ // This call will unlock/lock the mutex to wait for current running
+ // IngestExternalFile() calls to finish.
+ WaitForIngestFile();
+
+ num_running_compactions_++;
+
+ std::unique_ptr<std::list<uint64_t>::iterator>
+ pending_outputs_inserted_elem(new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
+
+ assert((bg_thread_pri == Env::Priority::BOTTOM &&
+ bg_bottom_compaction_scheduled_) ||
+ (bg_thread_pri == Env::Priority::LOW && bg_compaction_scheduled_));
+ Status s = BackgroundCompaction(&made_progress, &job_context, &log_buffer,
+ prepicked_compaction, bg_thread_pri);
+ TEST_SYNC_POINT("BackgroundCallCompaction:1");
+ if (s.IsBusy()) {
+ bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
+ mutex_.Unlock();
+ env_->SleepForMicroseconds(10000); // prevent hot loop
+ mutex_.Lock();
+ } else if (!s.ok() && !s.IsShutdownInProgress() &&
+ !s.IsManualCompactionPaused() && !s.IsColumnFamilyDropped()) {
+ // Wait a little bit before retrying background compaction in
+ // case this is an environmental problem and we do not want to
+ // chew up resources for failed compactions for the duration of
+ // the problem.
+ uint64_t error_cnt =
+ default_cf_internal_stats_->BumpAndGetBackgroundErrorCount();
+ bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
+ mutex_.Unlock();
+ log_buffer.FlushBufferToLog();
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Waiting after background compaction error: %s, "
+ "Accumulated background error counts: %" PRIu64,
+ s.ToString().c_str(), error_cnt);
+ LogFlush(immutable_db_options_.info_log);
+ env_->SleepForMicroseconds(1000000);
+ mutex_.Lock();
+ } else if (s.IsManualCompactionPaused()) {
+ ManualCompactionState* m = prepicked_compaction->manual_compaction_state;
+ assert(m);
+ ROCKS_LOG_BUFFER(&log_buffer, "[%s] [JOB %d] Manual compaction paused",
+ m->cfd->GetName().c_str(), job_context.job_id);
+ }
+
+ ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+ // If compaction failed, we want to delete all temporary files that we might
+ // have created (they might not be all recorded in job_context in case of a
+ // failure). Thus, we force full scan in FindObsoleteFiles()
+ FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() &&
+ !s.IsManualCompactionPaused() &&
+ !s.IsColumnFamilyDropped());
+ TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:FoundObsoleteFiles");
+
+ // delete unnecessary files if any, this is done outside the mutex
+ if (job_context.HaveSomethingToClean() ||
+ job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+ mutex_.Unlock();
+ // Have to flush the info logs before bg_compaction_scheduled_--
+      // because if bg_compaction_scheduled_ becomes 0 and the lock is
+      // released, the destructor of DB can kick in and destroy all the
+      // state of DB, so info_log might not be available after that point.
+      // The same applies to accessing any other state that DB owns.
+ log_buffer.FlushBufferToLog();
+ if (job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(job_context);
+ TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles");
+ }
+ job_context.Clean();
+ mutex_.Lock();
+ }
+
+ assert(num_running_compactions_ > 0);
+ num_running_compactions_--;
+ if (bg_thread_pri == Env::Priority::LOW) {
+ bg_compaction_scheduled_--;
+ } else {
+ assert(bg_thread_pri == Env::Priority::BOTTOM);
+ bg_bottom_compaction_scheduled_--;
+ }
+
+ versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
+
+ // See if there's more work to be done
+ MaybeScheduleFlushOrCompaction();
+ if (made_progress ||
+ (bg_compaction_scheduled_ == 0 &&
+ bg_bottom_compaction_scheduled_ == 0) ||
+ HasPendingManualCompaction() || unscheduled_compactions_ == 0) {
+ // signal if
+ // * made_progress -- need to wakeup DelayWrite
+ // * bg_{bottom,}_compaction_scheduled_ == 0 -- need to wakeup ~DBImpl
+ // * HasPendingManualCompaction -- need to wakeup RunManualCompaction
+ // If none of this is true, there is no need to signal since nobody is
+ // waiting for it
+ bg_cv_.SignalAll();
+ }
+ // IMPORTANT: there should be no code after calling SignalAll. This call may
+ // signal the DB destructor that it's OK to proceed with destruction. In
+    // that case, all DB variables will be deallocated and referencing them
+ // will cause trouble.
+ }
+}
+
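+// Runs a single compaction, either handed in via prepicked_compaction (manual
+// compactions and jobs forwarded to the bottom-priority pool) or picked here
+// from compaction_queue_. FIFO deletion compactions and trivial moves are
+// applied directly through a VersionEdit; everything else goes through a full
+// CompactionJob.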
+Status DBImpl::BackgroundCompaction(bool* made_progress,
+ JobContext* job_context,
+ LogBuffer* log_buffer,
+ PrepickedCompaction* prepicked_compaction,
+ Env::Priority thread_pri) {
+ ManualCompactionState* manual_compaction =
+ prepicked_compaction == nullptr
+ ? nullptr
+ : prepicked_compaction->manual_compaction_state;
+ *made_progress = false;
+ mutex_.AssertHeld();
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Start");
+
+ bool is_manual = (manual_compaction != nullptr);
+ std::unique_ptr<Compaction> c;
+ if (prepicked_compaction != nullptr &&
+ prepicked_compaction->compaction != nullptr) {
+ c.reset(prepicked_compaction->compaction);
+ }
+ bool is_prepicked = is_manual || c;
+
+ // (manual_compaction->in_progress == false);
+ bool trivial_move_disallowed =
+ is_manual && manual_compaction->disallow_trivial_move;
+
+ CompactionJobStats compaction_job_stats;
+ Status status;
+ if (!error_handler_.IsBGWorkStopped()) {
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ status = Status::ShutdownInProgress();
+ } else if (is_manual &&
+ manual_compaction_paused_.load(std::memory_order_acquire)) {
+ status = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+ } else {
+ status = error_handler_.GetBGError();
+ // If we get here, it means a hard error happened after this compaction
+ // was scheduled by MaybeScheduleFlushOrCompaction(), but before it got
+ // a chance to execute. Since we didn't pop a cfd from the compaction
+ // queue, increment unscheduled_compactions_
+ unscheduled_compactions_++;
+ }
+
+ if (!status.ok()) {
+ if (is_manual) {
+ manual_compaction->status = status;
+ manual_compaction->done = true;
+ manual_compaction->in_progress = false;
+ manual_compaction = nullptr;
+ }
+ if (c) {
+ c->ReleaseCompactionFiles(status);
+ c.reset();
+ }
+ return status;
+ }
+
+ if (is_manual) {
+ // another thread cannot pick up the same work
+ manual_compaction->in_progress = true;
+ }
+
+ std::unique_ptr<TaskLimiterToken> task_token;
+
+ // InternalKey manual_end_storage;
+ // InternalKey* manual_end = &manual_end_storage;
+ bool sfm_reserved_compact_space = false;
+ if (is_manual) {
+ ManualCompactionState* m = manual_compaction;
+ assert(m->in_progress);
+ if (!c) {
+ m->done = true;
+ m->manual_end = nullptr;
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] Manual compaction from level-%d from %s .. "
+ "%s; nothing to do\n",
+ m->cfd->GetName().c_str(), m->input_level,
+ (m->begin ? m->begin->DebugString().c_str() : "(begin)"),
+ (m->end ? m->end->DebugString().c_str() : "(end)"));
+ } else {
+ // First check if we have enough room to do the compaction
+ bool enough_room = EnoughRoomForCompaction(
+ m->cfd, *(c->inputs()), &sfm_reserved_compact_space, log_buffer);
+
+ if (!enough_room) {
+ // Then don't do the compaction
+ c->ReleaseCompactionFiles(status);
+ c.reset();
+ // m's vars will get set properly at the end of this function,
+ // as long as status == CompactionTooLarge
+ status = Status::CompactionTooLarge();
+ } else {
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] Manual compaction from level-%d to level-%d from %s .. "
+ "%s; will stop at %s\n",
+ m->cfd->GetName().c_str(), m->input_level, c->output_level(),
+ (m->begin ? m->begin->DebugString().c_str() : "(begin)"),
+ (m->end ? m->end->DebugString().c_str() : "(end)"),
+ ((m->done || m->manual_end == nullptr)
+ ? "(end)"
+ : m->manual_end->DebugString().c_str()));
+ }
+ }
+ } else if (!is_prepicked && !compaction_queue_.empty()) {
+ if (HasExclusiveManualCompaction()) {
+ // Can't compact right now, but try again later
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction()::Conflict");
+
+ // Stay in the compaction queue.
+ unscheduled_compactions_++;
+
+ return Status::OK();
+ }
+
+ auto cfd = PickCompactionFromQueue(&task_token, log_buffer);
+ if (cfd == nullptr) {
+ // Can't find any executable task from the compaction queue.
+ // All tasks have been throttled by compaction thread limiter.
+ ++unscheduled_compactions_;
+ return Status::Busy();
+ }
+
+ // We unreference here because the following code will take a Ref() on
+ // this cfd if it is going to use it (Compaction class holds a
+ // reference).
+ // This will all happen under a mutex so we don't have to be afraid of
+ // somebody else deleting it.
+ if (cfd->UnrefAndTryDelete()) {
+ // This was the last reference of the column family, so no need to
+ // compact.
+ return Status::OK();
+ }
+
+ // Pick up latest mutable CF Options and use it throughout the
+ // compaction job
+    // Compaction makes a copy of the latest MutableCFOptions. It should be
+    // used throughout the compaction procedure to ensure consistency. It will
+    // eventually be installed into the SuperVersion.
+ auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+ if (!mutable_cf_options->disable_auto_compactions && !cfd->IsDropped()) {
+ // NOTE: try to avoid unnecessary copy of MutableCFOptions if
+ // compaction is not necessary. Need to make sure mutex is held
+ // until we make a copy in the following code
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction():BeforePickCompaction");
+ c.reset(cfd->PickCompaction(*mutable_cf_options, log_buffer));
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction():AfterPickCompaction");
+
+ if (c != nullptr) {
+ bool enough_room = EnoughRoomForCompaction(
+ cfd, *(c->inputs()), &sfm_reserved_compact_space, log_buffer);
+
+ if (!enough_room) {
+ // Then don't do the compaction
+ c->ReleaseCompactionFiles(status);
+ c->column_family_data()
+ ->current()
+ ->storage_info()
+ ->ComputeCompactionScore(*(c->immutable_cf_options()),
+ *(c->mutable_cf_options()));
+ AddToCompactionQueue(cfd);
+ ++unscheduled_compactions_;
+
+ c.reset();
+ // Don't need to sleep here, because BackgroundCallCompaction
+ // will sleep if !s.ok()
+ status = Status::CompactionTooLarge();
+ } else {
+ // update statistics
+ RecordInHistogram(stats_, NUM_FILES_IN_SINGLE_COMPACTION,
+ c->inputs(0)->size());
+ // There are three things that can change compaction score:
+ // 1) When flush or compaction finish. This case is covered by
+ // InstallSuperVersionAndScheduleWork
+ // 2) When MutableCFOptions changes. This case is also covered by
+ // InstallSuperVersionAndScheduleWork, because this is when the new
+ // options take effect.
+ // 3) When we Pick a new compaction, we "remove" those files being
+ // compacted from the calculation, which then influences compaction
+ // score. Here we check if we need the new compaction even without the
+ // files that are currently being compacted. If we need another
+ // compaction, we might be able to execute it in parallel, so we add
+ // it to the queue and schedule a new thread.
+ if (cfd->NeedsCompaction()) {
+ // Yes, we need more compactions!
+ AddToCompactionQueue(cfd);
+ ++unscheduled_compactions_;
+ MaybeScheduleFlushOrCompaction();
+ }
+ }
+ }
+ }
+ }
+
+ if (!c) {
+ // Nothing to do
+ ROCKS_LOG_BUFFER(log_buffer, "Compaction nothing to do");
+ } else if (c->deletion_compaction()) {
+ // TODO(icanadi) Do we want to honor snapshots here? i.e. not delete old
+ // file if there is alive snapshot pointing to it
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction",
+ c->column_family_data());
+ assert(c->num_input_files(1) == 0);
+ assert(c->level() == 0);
+ assert(c->column_family_data()->ioptions()->compaction_style ==
+ kCompactionStyleFIFO);
+
+ compaction_job_stats.num_input_files = c->num_input_files(0);
+
+ NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
+ compaction_job_stats, job_context->job_id);
+
+ for (const auto& f : *c->inputs(0)) {
+ c->edit()->DeleteFile(c->level(), f->fd.GetNumber());
+ }
+ status = versions_->LogAndApply(c->column_family_data(),
+ *c->mutable_cf_options(), c->edit(),
+ &mutex_, directories_.GetDbDir());
+ InstallSuperVersionAndScheduleWork(c->column_family_data(),
+ &job_context->superversion_contexts[0],
+ *c->mutable_cf_options());
+ ROCKS_LOG_BUFFER(log_buffer, "[%s] Deleted %d files\n",
+ c->column_family_data()->GetName().c_str(),
+ c->num_input_files(0));
+ *made_progress = true;
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
+ c->column_family_data());
+ } else if (!trivial_move_disallowed && c->IsTrivialMove()) {
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:TrivialMove");
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction",
+ c->column_family_data());
+ // Instrument for event update
+ // TODO(yhchiang): add op details for showing trivial-move.
+ ThreadStatusUtil::SetColumnFamily(
+ c->column_family_data(), c->column_family_data()->ioptions()->env,
+ immutable_db_options_.enable_thread_tracking);
+ ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+
+ compaction_job_stats.num_input_files = c->num_input_files(0);
+
+ NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
+ compaction_job_stats, job_context->job_id);
+
+ // Move files to next level
+ int32_t moved_files = 0;
+ int64_t moved_bytes = 0;
+ for (unsigned int l = 0; l < c->num_input_levels(); l++) {
+ if (c->level(l) == c->output_level()) {
+ continue;
+ }
+ for (size_t i = 0; i < c->num_input_files(l); i++) {
+ FileMetaData* f = c->input(l, i);
+ c->edit()->DeleteFile(c->level(l), f->fd.GetNumber());
+ c->edit()->AddFile(c->output_level(), f->fd.GetNumber(),
+ f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest,
+ f->largest, f->fd.smallest_seqno,
+ f->fd.largest_seqno, f->marked_for_compaction,
+ f->oldest_blob_file_number, f->oldest_ancester_time,
+ f->file_creation_time, f->file_checksum,
+ f->file_checksum_func_name);
+
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] Moving #%" PRIu64 " to level-%d %" PRIu64 " bytes\n",
+ c->column_family_data()->GetName().c_str(), f->fd.GetNumber(),
+ c->output_level(), f->fd.GetFileSize());
+ ++moved_files;
+ moved_bytes += f->fd.GetFileSize();
+ }
+ }
+
+ status = versions_->LogAndApply(c->column_family_data(),
+ *c->mutable_cf_options(), c->edit(),
+ &mutex_, directories_.GetDbDir());
+ // Use latest MutableCFOptions
+ InstallSuperVersionAndScheduleWork(c->column_family_data(),
+ &job_context->superversion_contexts[0],
+ *c->mutable_cf_options());
+
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ c->column_family_data()->internal_stats()->IncBytesMoved(c->output_level(),
+ moved_bytes);
+ {
+ event_logger_.LogToBuffer(log_buffer)
+ << "job" << job_context->job_id << "event"
+ << "trivial_move"
+ << "destination_level" << c->output_level() << "files" << moved_files
+ << "total_files_size" << moved_bytes;
+ }
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] Moved #%d files to level-%d %" PRIu64 " bytes %s: %s\n",
+ c->column_family_data()->GetName().c_str(), moved_files,
+ c->output_level(), moved_bytes, status.ToString().c_str(),
+ c->column_family_data()->current()->storage_info()->LevelSummary(&tmp));
+ *made_progress = true;
+
+ // Clear Instrument
+ ThreadStatusUtil::ResetThreadStatus();
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
+ c->column_family_data());
+ } else if (!is_prepicked && c->output_level() > 0 &&
+ c->output_level() ==
+ c->column_family_data()
+ ->current()
+ ->storage_info()
+ ->MaxOutputLevel(
+ immutable_db_options_.allow_ingest_behind) &&
+ env_->GetBackgroundThreads(Env::Priority::BOTTOM) > 0) {
+    // Forward compactions involving the last level to the bottom pool if it
+    // exists, so that compactions unlikely to contribute to write stalls can
+    // be delayed or deprioritized.
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:ForwardToBottomPriPool");
+ CompactionArg* ca = new CompactionArg;
+ ca->db = this;
+ ca->prepicked_compaction = new PrepickedCompaction;
+ ca->prepicked_compaction->compaction = c.release();
+ ca->prepicked_compaction->manual_compaction_state = nullptr;
+ // Transfer requested token, so it doesn't need to do it again.
+ ca->prepicked_compaction->task_token = std::move(task_token);
+ ++bg_bottom_compaction_scheduled_;
+ env_->Schedule(&DBImpl::BGWorkBottomCompaction, ca, Env::Priority::BOTTOM,
+ this, &DBImpl::UnscheduleCompactionCallback);
+ } else {
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction",
+ c->column_family_data());
+ int output_level __attribute__((__unused__));
+ output_level = c->output_level();
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:NonTrivial",
+ &output_level);
+ std::vector<SequenceNumber> snapshot_seqs;
+ SequenceNumber earliest_write_conflict_snapshot;
+ SnapshotChecker* snapshot_checker;
+ GetSnapshotContext(job_context, &snapshot_seqs,
+ &earliest_write_conflict_snapshot, &snapshot_checker);
+ assert(is_snapshot_supported_ || snapshots_.empty());
+ CompactionJob compaction_job(
+ job_context->job_id, c.get(), immutable_db_options_,
+ file_options_for_compaction_, versions_.get(), &shutting_down_,
+ preserve_deletes_seqnum_.load(), log_buffer, directories_.GetDbDir(),
+ GetDataDir(c->column_family_data(), c->output_path_id()), stats_,
+ &mutex_, &error_handler_, snapshot_seqs,
+ earliest_write_conflict_snapshot, snapshot_checker, table_cache_,
+ &event_logger_, c->mutable_cf_options()->paranoid_file_checks,
+ c->mutable_cf_options()->report_bg_io_stats, dbname_,
+ &compaction_job_stats, thread_pri,
+ is_manual ? &manual_compaction_paused_ : nullptr);
+ compaction_job.Prepare();
+
+ NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
+ compaction_job_stats, job_context->job_id);
+
+ mutex_.Unlock();
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", nullptr);
+ compaction_job.Run();
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun");
+ mutex_.Lock();
+
+ status = compaction_job.Install(*c->mutable_cf_options());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(c->column_family_data(),
+ &job_context->superversion_contexts[0],
+ *c->mutable_cf_options());
+ }
+ *made_progress = true;
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
+ c->column_family_data());
+ }
+ if (c != nullptr) {
+ c->ReleaseCompactionFiles(status);
+ *made_progress = true;
+
+#ifndef ROCKSDB_LITE
+ // Need to make sure SstFileManager does its bookkeeping
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ if (sfm && sfm_reserved_compact_space) {
+ sfm->OnCompactionCompletion(c.get());
+ }
+#endif // ROCKSDB_LITE
+
+ NotifyOnCompactionCompleted(c->column_family_data(), c.get(), status,
+ compaction_job_stats, job_context->job_id);
+ }
+
+ if (status.ok() || status.IsCompactionTooLarge() ||
+ status.IsManualCompactionPaused()) {
+ // Done
+ } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) {
+ // Ignore compaction errors found during shutting down
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log, "Compaction error: %s",
+ status.ToString().c_str());
+ error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction);
+ if (c != nullptr && !is_manual && !error_handler_.IsBGWorkStopped()) {
+ // Put this cfd back in the compaction queue so we can retry after some
+ // time
+ auto cfd = c->column_family_data();
+ assert(cfd != nullptr);
+ // Since this compaction failed, we need to recompute the score so it
+ // takes the original input files into account
+ c->column_family_data()
+ ->current()
+ ->storage_info()
+ ->ComputeCompactionScore(*(c->immutable_cf_options()),
+ *(c->mutable_cf_options()));
+ if (!cfd->queued_for_compaction()) {
+ AddToCompactionQueue(cfd);
+ ++unscheduled_compactions_;
+ }
+ }
+ }
+ // this will unref its input_version and column_family_data
+ c.reset();
+
+ if (is_manual) {
+ ManualCompactionState* m = manual_compaction;
+ if (!status.ok()) {
+ m->status = status;
+ m->done = true;
+ }
+    // For universal compaction:
+    // Because universal compaction always happens at level 0, one
+    // compaction will pick up all overlapping files. No files will be
+    // filtered out due to the size limit and left for a successive compaction.
+    // So we can safely conclude the current compaction.
+ //
+ // Also note that, if we don't stop here, then the current compaction
+ // writes a new file back to level 0, which will be used in successive
+ // compaction. Hence the manual compaction will never finish.
+ //
+ // Stop the compaction if manual_end points to nullptr -- this means
+ // that we compacted the whole range. manual_end should always point
+ // to nullptr in case of universal compaction
+ if (m->manual_end == nullptr) {
+ m->done = true;
+ }
+ if (!m->done) {
+ // We only compacted part of the requested range. Update *m
+ // to the range that is left to be compacted.
+ // Universal and FIFO compactions should always compact the whole range
+ assert(m->cfd->ioptions()->compaction_style !=
+ kCompactionStyleUniversal ||
+ m->cfd->ioptions()->num_levels > 1);
+ assert(m->cfd->ioptions()->compaction_style != kCompactionStyleFIFO);
+ m->tmp_storage = *m->manual_end;
+ m->begin = &m->tmp_storage;
+ m->incomplete = true;
+ }
+ m->in_progress = false; // not being processed anymore
+ }
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Finish");
+ return status;
+}
+
+bool DBImpl::HasPendingManualCompaction() {
+ return (!manual_compaction_dequeue_.empty());
+}
+
+void DBImpl::AddManualCompaction(DBImpl::ManualCompactionState* m) {
+ manual_compaction_dequeue_.push_back(m);
+}
+
+void DBImpl::RemoveManualCompaction(DBImpl::ManualCompactionState* m) {
+ // Remove from queue
+ std::deque<ManualCompactionState*>::iterator it =
+ manual_compaction_dequeue_.begin();
+ while (it != manual_compaction_dequeue_.end()) {
+ if (m == (*it)) {
+ it = manual_compaction_dequeue_.erase(it);
+ return;
+ }
+ ++it;
+ }
+ assert(false);
+ return;
+}
+
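+// Returns true if the manual compaction m has to wait: either external file
+// ingestion is in flight, m is exclusive and background compactions are
+// already running, or an overlapping manual compaction queued ahead of m has
+// not started yet.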
+bool DBImpl::ShouldntRunManualCompaction(ManualCompactionState* m) {
+ if (num_running_ingest_file_ > 0) {
+ // We need to wait for other IngestExternalFile() calls to finish
+ // before running a manual compaction.
+ return true;
+ }
+ if (m->exclusive) {
+ return (bg_bottom_compaction_scheduled_ > 0 ||
+ bg_compaction_scheduled_ > 0);
+ }
+ std::deque<ManualCompactionState*>::iterator it =
+ manual_compaction_dequeue_.begin();
+ bool seen = false;
+ while (it != manual_compaction_dequeue_.end()) {
+ if (m == (*it)) {
+ ++it;
+ seen = true;
+ continue;
+ } else if (MCOverlap(m, (*it)) && (!seen && !(*it)->in_progress)) {
+      // The other manual compaction *it conflicts with m if it overlaps
+      // with m, is ahead of m in the queue, and is not yet in progress.
+ return true;
+ }
+ ++it;
+ }
+ return false;
+}
+
+bool DBImpl::HaveManualCompaction(ColumnFamilyData* cfd) {
+  // Scan the queued manual compactions
+ std::deque<ManualCompactionState*>::iterator it =
+ manual_compaction_dequeue_.begin();
+ while (it != manual_compaction_dequeue_.end()) {
+ if ((*it)->exclusive) {
+ return true;
+ }
+ if ((cfd == (*it)->cfd) && (!((*it)->in_progress || (*it)->done))) {
+      // A manual compaction for this column family is queued but has not
+      // started yet; once it is in progress or done it no longer counts here.
+ return true;
+ }
+ ++it;
+ }
+ return false;
+}
+
+bool DBImpl::HasExclusiveManualCompaction() {
+  // Scan the queued manual compactions for an exclusive one
+ std::deque<ManualCompactionState*>::iterator it =
+ manual_compaction_dequeue_.begin();
+ while (it != manual_compaction_dequeue_.end()) {
+ if ((*it)->exclusive) {
+ return true;
+ }
+ ++it;
+ }
+ return false;
+}
+
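+// Two manual compactions are treated as overlapping if either one is
+// exclusive or both target the same column family.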
+bool DBImpl::MCOverlap(ManualCompactionState* m, ManualCompactionState* m1) {
+ if ((m->exclusive) || (m1->exclusive)) {
+ return true;
+ }
+ if (m->cfd != m1->cfd) {
+ return false;
+ }
+ return true;
+}
+
+#ifndef ROCKSDB_LITE
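+// Fills a CompactionJobInfo snapshot for listener callbacks: identifiers,
+// levels, stats, and the input/output file lists with their table properties
+// where available.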
+void DBImpl::BuildCompactionJobInfo(
+ const ColumnFamilyData* cfd, Compaction* c, const Status& st,
+ const CompactionJobStats& compaction_job_stats, const int job_id,
+ const Version* current, CompactionJobInfo* compaction_job_info) const {
+ assert(compaction_job_info != nullptr);
+ compaction_job_info->cf_id = cfd->GetID();
+ compaction_job_info->cf_name = cfd->GetName();
+ compaction_job_info->status = st;
+ compaction_job_info->thread_id = env_->GetThreadID();
+ compaction_job_info->job_id = job_id;
+ compaction_job_info->base_input_level = c->start_level();
+ compaction_job_info->output_level = c->output_level();
+ compaction_job_info->stats = compaction_job_stats;
+ compaction_job_info->table_properties = c->GetOutputTableProperties();
+ compaction_job_info->compaction_reason = c->compaction_reason();
+ compaction_job_info->compression = c->output_compression();
+ for (size_t i = 0; i < c->num_input_levels(); ++i) {
+ for (const auto fmd : *c->inputs(i)) {
+ const FileDescriptor& desc = fmd->fd;
+ const uint64_t file_number = desc.GetNumber();
+ auto fn = TableFileName(c->immutable_cf_options()->cf_paths, file_number,
+ desc.GetPathId());
+ compaction_job_info->input_files.push_back(fn);
+ compaction_job_info->input_file_infos.push_back(CompactionFileInfo{
+ static_cast<int>(i), file_number, fmd->oldest_blob_file_number});
+ if (compaction_job_info->table_properties.count(fn) == 0) {
+ std::shared_ptr<const TableProperties> tp;
+ auto s = current->GetTableProperties(&tp, fmd, &fn);
+ if (s.ok()) {
+ compaction_job_info->table_properties[fn] = tp;
+ }
+ }
+ }
+ }
+ for (const auto& newf : c->edit()->GetNewFiles()) {
+ const FileMetaData& meta = newf.second;
+ const FileDescriptor& desc = meta.fd;
+ const uint64_t file_number = desc.GetNumber();
+ compaction_job_info->output_files.push_back(TableFileName(
+ c->immutable_cf_options()->cf_paths, file_number, desc.GetPathId()));
+ compaction_job_info->output_file_infos.push_back(CompactionFileInfo{
+ newf.first, file_number, meta.oldest_blob_file_number});
+ }
+}
+#endif
+
+// SuperVersionContext gets created and destructed outside of the lock --
+// we use this conveniently to:
+// * malloc one SuperVersion() outside of the lock -- new_superversion
+// * delete SuperVersion()s outside of the lock -- superversions_to_free
+//
+// However, if InstallSuperVersionAndScheduleWork() gets called twice with the
+// same sv_context, we can't reuse the SuperVersion() that got malloced,
+// because the first call already used it. In that rare case, we take a hit
+// and create a new SuperVersion() inside of the mutex. We do a similar thing
+// for superversion_to_free.
+
+void DBImpl::InstallSuperVersionAndScheduleWork(
+ ColumnFamilyData* cfd, SuperVersionContext* sv_context,
+ const MutableCFOptions& mutable_cf_options) {
+ mutex_.AssertHeld();
+
+ // Update max_total_in_memory_state_
+ size_t old_memtable_size = 0;
+ auto* old_sv = cfd->GetSuperVersion();
+ if (old_sv) {
+ old_memtable_size = old_sv->mutable_cf_options.write_buffer_size *
+ old_sv->mutable_cf_options.max_write_buffer_number;
+ }
+
+ // this branch is unlikely to step in
+ if (UNLIKELY(sv_context->new_superversion == nullptr)) {
+ sv_context->NewSuperVersion();
+ }
+ cfd->InstallSuperVersion(sv_context, &mutex_, mutable_cf_options);
+
+  // There may be a small data race here. The snapshot triggering bottommost
+  // compaction may already be released here. But assuming newer snapshots
+  // will always be created and released frequently, the compaction will be
+  // triggered soon anyway.
+ bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+ for (auto* my_cfd : *versions_->GetColumnFamilySet()) {
+ bottommost_files_mark_threshold_ = std::min(
+ bottommost_files_mark_threshold_,
+ my_cfd->current()->storage_info()->bottommost_files_mark_threshold());
+ }
+
+ // Whenever we install new SuperVersion, we might need to issue new flushes or
+ // compactions.
+ SchedulePendingCompaction(cfd);
+ MaybeScheduleFlushOrCompaction();
+
+ // Update max_total_in_memory_state_
+ max_total_in_memory_state_ = max_total_in_memory_state_ - old_memtable_size +
+ mutable_cf_options.write_buffer_size *
+ mutable_cf_options.max_write_buffer_number;
+}
+
+// ShouldPurge is called by FindObsoleteFiles when doing a full scan,
+// and db mutex (mutex_) should already be held.
+// Actually, the current implementation of FindObsoleteFiles with
+// full_scan=true can issue I/O requests to obtain the list of files in
+// directories, e.g. env_->GetChildren(), while holding the db mutex.
+bool DBImpl::ShouldPurge(uint64_t file_number) const {
+ return files_grabbed_for_purge_.find(file_number) ==
+ files_grabbed_for_purge_.end() &&
+ purge_files_.find(file_number) == purge_files_.end();
+}
+
+// MarkAsGrabbedForPurge is called by FindObsoleteFiles, and db mutex
+// (mutex_) should already be held.
+void DBImpl::MarkAsGrabbedForPurge(uint64_t file_number) {
+ files_grabbed_for_purge_.insert(file_number);
+}
+
+void DBImpl::SetSnapshotChecker(SnapshotChecker* snapshot_checker) {
+ InstrumentedMutexLock l(&mutex_);
+  // snapshot_checker_ should only be set once. If we need to set it multiple
+  // times, we need to make sure the old one is not deleted while it is still
+  // being used by a compaction job.
+ assert(!snapshot_checker_);
+ snapshot_checker_.reset(snapshot_checker);
+}
+
+void DBImpl::GetSnapshotContext(
+ JobContext* job_context, std::vector<SequenceNumber>* snapshot_seqs,
+ SequenceNumber* earliest_write_conflict_snapshot,
+ SnapshotChecker** snapshot_checker_ptr) {
+ mutex_.AssertHeld();
+ assert(job_context != nullptr);
+ assert(snapshot_seqs != nullptr);
+ assert(earliest_write_conflict_snapshot != nullptr);
+ assert(snapshot_checker_ptr != nullptr);
+
+ *snapshot_checker_ptr = snapshot_checker_.get();
+ if (use_custom_gc_ && *snapshot_checker_ptr == nullptr) {
+ *snapshot_checker_ptr = DisableGCSnapshotChecker::Instance();
+ }
+ if (*snapshot_checker_ptr != nullptr) {
+ // If snapshot_checker is used, that means the flush/compaction may
+ // contain values not visible to snapshot taken after
+ // flush/compaction job starts. Take a snapshot and it will appear
+ // in snapshot_seqs and force compaction iterator to consider such
+ // snapshots.
+ const Snapshot* job_snapshot =
+ GetSnapshotImpl(false /*write_conflict_boundary*/, false /*lock*/);
+ job_context->job_snapshot.reset(new ManagedSnapshot(this, job_snapshot));
+ }
+ *snapshot_seqs = snapshots_.GetAll(earliest_write_conflict_snapshot);
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_debug.cc b/src/rocksdb/db/db_impl/db_impl_debug.cc
new file mode 100644
index 000000000..610b57d39
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_debug.cc
@@ -0,0 +1,294 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef NDEBUG
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "monitoring/thread_status_updater.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+uint64_t DBImpl::TEST_GetLevel0TotalSize() {
+ InstrumentedMutexLock l(&mutex_);
+ return default_cf_handle_->cfd()->current()->storage_info()->NumLevelBytes(0);
+}
+
+void DBImpl::TEST_SwitchWAL() {
+ WriteContext write_context;
+ InstrumentedMutexLock l(&mutex_);
+ void* writer = TEST_BeginWrite();
+ SwitchWAL(&write_context);
+ TEST_EndWrite(writer);
+}
+
+bool DBImpl::TEST_WALBufferIsEmpty(bool lock) {
+ if (lock) {
+ log_write_mutex_.Lock();
+ }
+ log::Writer* cur_log_writer = logs_.back().writer;
+ auto res = cur_log_writer->TEST_BufferIsEmpty();
+ if (lock) {
+ log_write_mutex_.Unlock();
+ }
+ return res;
+}
+
+int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes(
+ ColumnFamilyHandle* column_family) {
+ ColumnFamilyData* cfd;
+ if (column_family == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ cfd = cfh->cfd();
+ }
+ InstrumentedMutexLock l(&mutex_);
+ return cfd->current()->storage_info()->MaxNextLevelOverlappingBytes();
+}
+
+void DBImpl::TEST_GetFilesMetaData(
+ ColumnFamilyHandle* column_family,
+ std::vector<std::vector<FileMetaData>>* metadata) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+ InstrumentedMutexLock l(&mutex_);
+ metadata->resize(NumberLevels());
+ for (int level = 0; level < NumberLevels(); level++) {
+ const std::vector<FileMetaData*>& files =
+ cfd->current()->storage_info()->LevelFiles(level);
+
+ (*metadata)[level].clear();
+ for (const auto& f : files) {
+ (*metadata)[level].push_back(*f);
+ }
+ }
+}
+
+uint64_t DBImpl::TEST_Current_Manifest_FileNo() {
+ return versions_->manifest_file_number();
+}
+
+uint64_t DBImpl::TEST_Current_Next_FileNo() {
+ return versions_->current_next_file_number();
+}
+
+Status DBImpl::TEST_CompactRange(int level, const Slice* begin,
+ const Slice* end,
+ ColumnFamilyHandle* column_family,
+ bool disallow_trivial_move) {
+ ColumnFamilyData* cfd;
+ if (column_family == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ cfd = cfh->cfd();
+ }
+ int output_level =
+ (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+ cfd->ioptions()->compaction_style == kCompactionStyleFIFO)
+ ? level
+ : level + 1;
+ return RunManualCompaction(cfd, level, output_level, CompactRangeOptions(),
+ begin, end, true, disallow_trivial_move,
+ port::kMaxUint64 /*max_file_num_to_ignore*/);
+}
+
+Status DBImpl::TEST_SwitchMemtable(ColumnFamilyData* cfd) {
+ WriteContext write_context;
+ InstrumentedMutexLock l(&mutex_);
+ if (cfd == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ }
+
+ Status s;
+ void* writer = TEST_BeginWrite();
+ if (two_write_queues_) {
+ WriteThread::Writer nonmem_w;
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ s = SwitchMemtable(cfd, &write_context);
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ } else {
+ s = SwitchMemtable(cfd, &write_context);
+ }
+ TEST_EndWrite(writer);
+ return s;
+}
+
+Status DBImpl::TEST_FlushMemTable(bool wait, bool allow_write_stall,
+ ColumnFamilyHandle* cfh) {
+ FlushOptions fo;
+ fo.wait = wait;
+ fo.allow_write_stall = allow_write_stall;
+ ColumnFamilyData* cfd;
+ if (cfh == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfhi = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh);
+ cfd = cfhi->cfd();
+ }
+ return FlushMemTable(cfd, fo, FlushReason::kTest);
+}
+
+Status DBImpl::TEST_FlushMemTable(ColumnFamilyData* cfd,
+ const FlushOptions& flush_opts) {
+ return FlushMemTable(cfd, flush_opts, FlushReason::kTest);
+}
+
+Status DBImpl::TEST_AtomicFlushMemTables(
+ const autovector<ColumnFamilyData*>& cfds, const FlushOptions& flush_opts) {
+ return AtomicFlushMemTables(cfds, flush_opts, FlushReason::kTest);
+}
+
+Status DBImpl::TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family) {
+ ColumnFamilyData* cfd;
+ if (column_family == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ cfd = cfh->cfd();
+ }
+ return WaitForFlushMemTable(cfd, nullptr, false);
+}
+
+Status DBImpl::TEST_WaitForCompact(bool wait_unscheduled) {
+ // Wait until the compaction completes
+
+ // TODO: a bug here. This function actually does not necessarily
+ // wait for compact. It actually waits for scheduled compaction
+ // OR flush to finish.
+
+ InstrumentedMutexLock l(&mutex_);
+ while ((bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
+ bg_flush_scheduled_ ||
+ (wait_unscheduled && unscheduled_compactions_)) &&
+ (error_handler_.GetBGError() == Status::OK())) {
+ bg_cv_.Wait();
+ }
+ return error_handler_.GetBGError();
+}
+
+void DBImpl::TEST_LockMutex() { mutex_.Lock(); }
+
+void DBImpl::TEST_UnlockMutex() { mutex_.Unlock(); }
+
+void* DBImpl::TEST_BeginWrite() {
+ auto w = new WriteThread::Writer();
+ write_thread_.EnterUnbatched(w, &mutex_);
+ return reinterpret_cast<void*>(w);
+}
+
+void DBImpl::TEST_EndWrite(void* w) {
+ auto writer = reinterpret_cast<WriteThread::Writer*>(w);
+ write_thread_.ExitUnbatched(writer);
+ delete writer;
+}
+
+size_t DBImpl::TEST_LogsToFreeSize() {
+ InstrumentedMutexLock l(&mutex_);
+ return logs_to_free_.size();
+}
+
+uint64_t DBImpl::TEST_LogfileNumber() {
+ InstrumentedMutexLock l(&mutex_);
+ return logfile_number_;
+}
+
+Status DBImpl::TEST_GetAllImmutableCFOptions(
+ std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map) {
+ std::vector<std::string> cf_names;
+ std::vector<const ImmutableCFOptions*> iopts;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ cf_names.push_back(cfd->GetName());
+ iopts.push_back(cfd->ioptions());
+ }
+ }
+ iopts_map->clear();
+ for (size_t i = 0; i < cf_names.size(); ++i) {
+ iopts_map->insert({cf_names[i], iopts[i]});
+ }
+
+ return Status::OK();
+}
+
+uint64_t DBImpl::TEST_FindMinLogContainingOutstandingPrep() {
+ return logs_with_prep_tracker_.FindMinLogContainingOutstandingPrep();
+}
+
+size_t DBImpl::TEST_PreparedSectionCompletedSize() {
+ return logs_with_prep_tracker_.TEST_PreparedSectionCompletedSize();
+}
+
+size_t DBImpl::TEST_LogsWithPrepSize() {
+ return logs_with_prep_tracker_.TEST_LogsWithPrepSize();
+}
+
+uint64_t DBImpl::TEST_FindMinPrepLogReferencedByMemTable() {
+ autovector<MemTable*> empty_list;
+ return FindMinPrepLogReferencedByMemTable(versions_.get(), nullptr,
+ empty_list);
+}
+
+Status DBImpl::TEST_GetLatestMutableCFOptions(
+ ColumnFamilyHandle* column_family, MutableCFOptions* mutable_cf_options) {
+ InstrumentedMutexLock l(&mutex_);
+
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ *mutable_cf_options = *cfh->cfd()->GetLatestMutableCFOptions();
+ return Status::OK();
+}
+
+int DBImpl::TEST_BGCompactionsAllowed() const {
+ InstrumentedMutexLock l(&mutex_);
+ return GetBGJobLimits().max_compactions;
+}
+
+int DBImpl::TEST_BGFlushesAllowed() const {
+ InstrumentedMutexLock l(&mutex_);
+ return GetBGJobLimits().max_flushes;
+}
+
+SequenceNumber DBImpl::TEST_GetLastVisibleSequence() const {
+ if (last_seq_same_as_publish_seq_) {
+ return versions_->LastSequence();
+ } else {
+ return versions_->LastAllocatedSequence();
+ }
+}
+
+size_t DBImpl::TEST_GetWalPreallocateBlockSize(
+ uint64_t write_buffer_size) const {
+ InstrumentedMutexLock l(&mutex_);
+ return GetWalPreallocateBlockSize(write_buffer_size);
+}
+
+void DBImpl::TEST_WaitForDumpStatsRun(std::function<void()> callback) const {
+ if (thread_dump_stats_ != nullptr) {
+ thread_dump_stats_->TEST_WaitForRun(callback);
+ }
+}
+
+void DBImpl::TEST_WaitForPersistStatsRun(std::function<void()> callback) const {
+ if (thread_persist_stats_ != nullptr) {
+ thread_persist_stats_->TEST_WaitForRun(callback);
+ }
+}
+
+bool DBImpl::TEST_IsPersistentStatsEnabled() const {
+ return thread_persist_stats_ && thread_persist_stats_->IsRunning();
+}
+
+size_t DBImpl::TEST_EstimateInMemoryStatsHistorySize() const {
+ return EstimateInMemoryStatsHistorySize();
+}
+} // namespace ROCKSDB_NAMESPACE
+#endif // NDEBUG
diff --git a/src/rocksdb/db/db_impl/db_impl_experimental.cc b/src/rocksdb/db/db_impl/db_impl_experimental.cc
new file mode 100644
index 000000000..f0c17ce95
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_experimental.cc
@@ -0,0 +1,151 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl/db_impl.h"
+
+#include <cinttypes>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::SuggestCompactRange(ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+ InternalKey start_key, end_key;
+ if (begin != nullptr) {
+ start_key.SetMinPossibleForUserKey(*begin);
+ }
+ if (end != nullptr) {
+ end_key.SetMaxPossibleForUserKey(*end);
+ }
+ {
+ InstrumentedMutexLock l(&mutex_);
+ auto vstorage = cfd->current()->storage_info();
+ for (int level = 0; level < vstorage->num_non_empty_levels() - 1; ++level) {
+ std::vector<FileMetaData*> inputs;
+ vstorage->GetOverlappingInputs(
+ level, begin == nullptr ? nullptr : &start_key,
+ end == nullptr ? nullptr : &end_key, &inputs);
+ for (auto f : inputs) {
+ f->marked_for_compaction = true;
+ }
+ }
+ // Since we have some more files to compact, we should also recompute
+ // compaction score
+ vstorage->ComputeCompactionScore(*cfd->ioptions(),
+ *cfd->GetLatestMutableCFOptions());
+ SchedulePendingCompaction(cfd);
+ MaybeScheduleFlushOrCompaction();
+ }
+ return Status::OK();
+}
+
+Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) {
+ assert(column_family);
+
+ if (target_level < 1) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "PromoteL0 FAILED. Invalid target level %d\n", target_level);
+ return Status::InvalidArgument("Invalid target level");
+ }
+
+ Status status;
+ VersionEdit edit;
+ JobContext job_context(next_job_id_.fetch_add(1), true);
+ {
+ InstrumentedMutexLock l(&mutex_);
+ auto* cfd = static_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+ const auto* vstorage = cfd->current()->storage_info();
+
+ if (target_level >= vstorage->num_levels()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "PromoteL0 FAILED. Target level %d does not exist\n",
+ target_level);
+ job_context.Clean();
+ return Status::InvalidArgument("Target level does not exist");
+ }
+
+ // Sort L0 files by range.
+ const InternalKeyComparator* icmp = &cfd->internal_comparator();
+ auto l0_files = vstorage->LevelFiles(0);
+ std::sort(l0_files.begin(), l0_files.end(),
+ [icmp](FileMetaData* f1, FileMetaData* f2) {
+ return icmp->Compare(f1->largest, f2->largest) < 0;
+ });
+
+ // Check that no L0 file is being compacted and that they have
+ // non-overlapping ranges.
+ for (size_t i = 0; i < l0_files.size(); ++i) {
+ auto f = l0_files[i];
+ if (f->being_compacted) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "PromoteL0 FAILED. File %" PRIu64 " being compacted\n",
+ f->fd.GetNumber());
+ job_context.Clean();
+ return Status::InvalidArgument("PromoteL0 called during L0 compaction");
+ }
+
+ if (i == 0) continue;
+ auto prev_f = l0_files[i - 1];
+ if (icmp->Compare(prev_f->largest, f->smallest) >= 0) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "PromoteL0 FAILED. Files %" PRIu64 " and %" PRIu64
+ " have overlapping ranges\n",
+ prev_f->fd.GetNumber(), f->fd.GetNumber());
+ job_context.Clean();
+ return Status::InvalidArgument("L0 has overlapping files");
+ }
+ }
+
+ // Check that all levels up to target_level are empty.
+ for (int level = 1; level <= target_level; ++level) {
+ if (vstorage->NumLevelFiles(level) > 0) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "PromoteL0 FAILED. Level %d not empty\n", level);
+ job_context.Clean();
+ return Status::InvalidArgument(
+ "All levels up to target_level "
+ "must be empty");
+ }
+ }
+
+ edit.SetColumnFamily(cfd->GetID());
+ for (const auto& f : l0_files) {
+ edit.DeleteFile(0, f->fd.GetNumber());
+ edit.AddFile(target_level, f->fd.GetNumber(), f->fd.GetPathId(),
+ f->fd.GetFileSize(), f->smallest, f->largest,
+ f->fd.smallest_seqno, f->fd.largest_seqno,
+ f->marked_for_compaction, f->oldest_blob_file_number,
+ f->oldest_ancester_time, f->file_creation_time,
+ f->file_checksum, f->file_checksum_func_name);
+ }
+
+ status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+ &edit, &mutex_, directories_.GetDbDir());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd,
+ &job_context.superversion_contexts[0],
+ *cfd->GetLatestMutableCFOptions());
+ }
+ } // lock released here
+ LogFlush(immutable_db_options_.info_log);
+ job_context.Clean();
+
+ return status;
+}
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_files.cc b/src/rocksdb/db/db_impl/db_impl_files.cc
new file mode 100644
index 000000000..c5d07dd01
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_files.cc
@@ -0,0 +1,667 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_impl/db_impl.h"
+
+#include <cinttypes>
+#include <set>
+#include <unordered_set>
+#include "db/event_helpers.h"
+#include "db/memtable_list.h"
+#include "file/file_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+uint64_t DBImpl::MinLogNumberToKeep() {
+ if (allow_2pc()) {
+ return versions_->min_log_number_to_keep_2pc();
+ } else {
+ return versions_->MinLogNumberWithUnflushedData();
+ }
+}
+
+uint64_t DBImpl::MinObsoleteSstNumberToKeep() {
+ mutex_.AssertHeld();
+ if (!pending_outputs_.empty()) {
+ return *pending_outputs_.begin();
+ }
+ return std::numeric_limits<uint64_t>::max();
+}
+
+// * Returns the list of live files in 'sst_live'
+// If it's doing full scan:
+// * Returns the list of all files in the filesystem in
+// 'full_scan_candidate_files'.
+// Otherwise, gets obsolete files from VersionSet.
+// no_full_scan = true -- never do the full scan using GetChildren()
+// force = false -- don't force the full scan, except every
+// mutable_db_options_.delete_obsolete_files_period_micros
+// force = true -- force the full scan
+void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
+ bool no_full_scan) {
+ mutex_.AssertHeld();
+
+ // if deletion is disabled, do nothing
+ if (disable_delete_obsolete_files_ > 0) {
+ return;
+ }
+
+ bool doing_the_full_scan = false;
+
+ // logic for figuring out if we're doing the full scan
+ if (no_full_scan) {
+ doing_the_full_scan = false;
+ } else if (force ||
+ mutable_db_options_.delete_obsolete_files_period_micros == 0) {
+ doing_the_full_scan = true;
+ } else {
+ const uint64_t now_micros = env_->NowMicros();
+ if ((delete_obsolete_files_last_run_ +
+ mutable_db_options_.delete_obsolete_files_period_micros) <
+ now_micros) {
+ doing_the_full_scan = true;
+ delete_obsolete_files_last_run_ = now_micros;
+ }
+ }
+
+  // Don't delete files that might currently be written to by compaction
+  // threads. Since job_context->min_pending_output is set here, mutex_
+  // cannot be released until the file scan finishes. Otherwise, we might see
+  // no min_pending_output here but later find newly generated, unfinalized
+  // files while scanning.
+ if (!pending_outputs_.empty()) {
+ job_context->min_pending_output = *pending_outputs_.begin();
+ } else {
+ // delete all of them
+ job_context->min_pending_output = std::numeric_limits<uint64_t>::max();
+ }
+
+ // Get obsolete files. This function will also update the list of
+ // pending files in VersionSet().
+ versions_->GetObsoleteFiles(&job_context->sst_delete_files,
+ &job_context->manifest_delete_files,
+ job_context->min_pending_output);
+
+  // Mark the elements in job_context->sst_delete_files as grabbedForPurge
+  // so that other threads calling FindObsoleteFiles with full_scan=true
+  // will not add these files to the candidate list for purge.
+ for (const auto& sst_to_del : job_context->sst_delete_files) {
+ MarkAsGrabbedForPurge(sst_to_del.metadata->fd.GetNumber());
+ }
+
+ // store the current filenum, lognum, etc
+ job_context->manifest_file_number = versions_->manifest_file_number();
+ job_context->pending_manifest_file_number =
+ versions_->pending_manifest_file_number();
+ job_context->log_number = MinLogNumberToKeep();
+ job_context->prev_log_number = versions_->prev_log_number();
+
+ versions_->AddLiveFiles(&job_context->sst_live);
+ if (doing_the_full_scan) {
+ InfoLogPrefix info_log_prefix(!immutable_db_options_.db_log_dir.empty(),
+ dbname_);
+ std::set<std::string> paths;
+ for (size_t path_id = 0; path_id < immutable_db_options_.db_paths.size();
+ path_id++) {
+ paths.insert(immutable_db_options_.db_paths[path_id].path);
+ }
+
+ // Note that if cf_paths is not specified in the ColumnFamilyOptions
+ // of a particular column family, we use db_paths as the cf_paths
+ // setting. Hence, there can be multiple duplicates of files from db_paths
+    // in the following code. The duplicates are removed while identifying
+ // unique files in PurgeObsoleteFiles.
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ for (size_t path_id = 0; path_id < cfd->ioptions()->cf_paths.size();
+ path_id++) {
+ auto& path = cfd->ioptions()->cf_paths[path_id].path;
+
+ if (paths.find(path) == paths.end()) {
+ paths.insert(path);
+ }
+ }
+ }
+
+ for (auto& path : paths) {
+      // Set of all files in the directory. We'll exclude files that are still
+      // alive in the subsequent processing.
+ std::vector<std::string> files;
+ env_->GetChildren(path, &files); // Ignore errors
+ for (const std::string& file : files) {
+ uint64_t number;
+ FileType type;
+        // 1. If we cannot parse the file name, we skip it;
+        // 2. If the file with this file number has already been grabbed for
+        // purge by another compaction job, or has already been scheduled for
+        // purge, we also skip it while doing a full scan, in order to avoid
+        // double deletion of the same file under race conditions. See
+ // https://github.com/facebook/rocksdb/issues/3573
+ if (!ParseFileName(file, &number, info_log_prefix.prefix, &type) ||
+ !ShouldPurge(number)) {
+ continue;
+ }
+
+ // TODO(icanadi) clean up this mess to avoid having one-off "/" prefixes
+ job_context->full_scan_candidate_files.emplace_back("/" + file, path);
+ }
+ }
+
+ // Add log files in wal_dir
+ if (immutable_db_options_.wal_dir != dbname_) {
+ std::vector<std::string> log_files;
+ env_->GetChildren(immutable_db_options_.wal_dir,
+ &log_files); // Ignore errors
+ for (const std::string& log_file : log_files) {
+ job_context->full_scan_candidate_files.emplace_back(
+ log_file, immutable_db_options_.wal_dir);
+ }
+ }
+ // Add info log files in db_log_dir
+ if (!immutable_db_options_.db_log_dir.empty() &&
+ immutable_db_options_.db_log_dir != dbname_) {
+ std::vector<std::string> info_log_files;
+ // Ignore errors
+ env_->GetChildren(immutable_db_options_.db_log_dir, &info_log_files);
+ for (std::string& log_file : info_log_files) {
+ job_context->full_scan_candidate_files.emplace_back(
+ log_file, immutable_db_options_.db_log_dir);
+ }
+ }
+ }
+
+ // logs_ is empty when called during recovery, in which case there can't yet
+ // be any tracked obsolete logs
+ if (!alive_log_files_.empty() && !logs_.empty()) {
+ uint64_t min_log_number = job_context->log_number;
+ size_t num_alive_log_files = alive_log_files_.size();
+ // find newly obsoleted log files
+ while (alive_log_files_.begin()->number < min_log_number) {
+ auto& earliest = *alive_log_files_.begin();
+ if (immutable_db_options_.recycle_log_file_num >
+ log_recycle_files_.size()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "adding log %" PRIu64 " to recycle list\n",
+ earliest.number);
+ log_recycle_files_.push_back(earliest.number);
+ } else {
+ job_context->log_delete_files.push_back(earliest.number);
+ }
+ if (job_context->size_log_to_delete == 0) {
+ job_context->prev_total_log_size = total_log_size_;
+ job_context->num_alive_log_files = num_alive_log_files;
+ }
+ job_context->size_log_to_delete += earliest.size;
+ total_log_size_ -= earliest.size;
+ if (two_write_queues_) {
+ log_write_mutex_.Lock();
+ }
+ alive_log_files_.pop_front();
+ if (two_write_queues_) {
+ log_write_mutex_.Unlock();
+ }
+ // Current log should always stay alive since it can't have
+ // number < MinLogNumber().
+ assert(alive_log_files_.size());
+ }
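+    // Detach and free writers of logs that are now obsolete, waiting out any
+    // in-flight sync on a log before releasing its writer.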
+ while (!logs_.empty() && logs_.front().number < min_log_number) {
+ auto& log = logs_.front();
+ if (log.getting_synced) {
+ log_sync_cv_.Wait();
+ // logs_ could have changed while we were waiting.
+ continue;
+ }
+ logs_to_free_.push_back(log.ReleaseWriter());
+ {
+ InstrumentedMutexLock wl(&log_write_mutex_);
+ logs_.pop_front();
+ }
+ }
+ // Current log cannot be obsolete.
+ assert(!logs_.empty());
+ }
+
+ // We're just cleaning up for DB::Write().
+ assert(job_context->logs_to_free.empty());
+ job_context->logs_to_free = logs_to_free_;
+ job_context->log_recycle_files.assign(log_recycle_files_.begin(),
+ log_recycle_files_.end());
+ if (job_context->HaveSomethingToDelete()) {
+ ++pending_purge_obsolete_files_;
+ }
+ logs_to_free_.clear();
+}
+
+namespace {
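+// Orders candidate files by descending file name, breaking ties by
+// descending file path, so that std::sort followed by std::unique in
+// PurgeObsoleteFiles can collapse duplicate entries.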
+bool CompareCandidateFile(const JobContext::CandidateFileInfo& first,
+ const JobContext::CandidateFileInfo& second) {
+ if (first.file_name > second.file_name) {
+ return true;
+ } else if (first.file_name < second.file_name) {
+ return false;
+ } else {
+ return (first.file_path > second.file_path);
+ }
+}
+}  // namespace
+
+// Delete an obsolete file and log the status and information of the deletion
+void DBImpl::DeleteObsoleteFileImpl(int job_id, const std::string& fname,
+ const std::string& path_to_sync,
+ FileType type, uint64_t number) {
+ Status file_deletion_status;
+ if (type == kTableFile || type == kLogFile) {
+ file_deletion_status =
+ DeleteDBFile(&immutable_db_options_, fname, path_to_sync,
+ /*force_bg=*/false, /*force_fg=*/!wal_in_db_path_);
+ } else {
+ file_deletion_status = env_->DeleteFile(fname);
+ }
+ TEST_SYNC_POINT_CALLBACK("DBImpl::DeleteObsoleteFileImpl:AfterDeletion",
+ &file_deletion_status);
+ if (file_deletion_status.ok()) {
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[JOB %d] Delete %s type=%d #%" PRIu64 " -- %s\n", job_id,
+ fname.c_str(), type, number,
+ file_deletion_status.ToString().c_str());
+ } else if (env_->FileExists(fname).IsNotFound()) {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "[JOB %d] Tried to delete a non-existing file %s type=%d #%" PRIu64
+ " -- %s\n",
+ job_id, fname.c_str(), type, number,
+ file_deletion_status.ToString().c_str());
+ } else {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "[JOB %d] Failed to delete %s type=%d #%" PRIu64 " -- %s\n",
+ job_id, fname.c_str(), type, number,
+ file_deletion_status.ToString().c_str());
+ }
+ if (type == kTableFile) {
+ EventHelpers::LogAndNotifyTableFileDeletion(
+ &event_logger_, job_id, number, fname, file_deletion_status, GetName(),
+ immutable_db_options_.listeners);
+ }
+}
+
+// Diffs the list of candidate files against the set of live files; files
+// that do not belong to the live set may be removed. Also removes all the
+// files in sst_delete_files and log_delete_files.
+// It is not necessary to hold the mutex when invoking this method.
+void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
+ TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:Begin");
+  // we'd better have something to delete
+ assert(state.HaveSomethingToDelete());
+
+  // FindObsoleteFiles() should have populated this, so it must be nonzero
+ assert(state.manifest_file_number != 0);
+
+ // Now, convert live list to an unordered map, WITHOUT mutex held;
+ // set is slow.
+ std::unordered_map<uint64_t, const FileDescriptor*> sst_live_map;
+ for (const FileDescriptor& fd : state.sst_live) {
+ sst_live_map[fd.GetNumber()] = &fd;
+ }
+ std::unordered_set<uint64_t> log_recycle_files_set(
+ state.log_recycle_files.begin(), state.log_recycle_files.end());
+
+ auto candidate_files = state.full_scan_candidate_files;
+ candidate_files.reserve(
+ candidate_files.size() + state.sst_delete_files.size() +
+ state.log_delete_files.size() + state.manifest_delete_files.size());
+ // We may ignore the dbname when generating the file names.
+ for (auto& file : state.sst_delete_files) {
+ candidate_files.emplace_back(
+ MakeTableFileName(file.metadata->fd.GetNumber()), file.path);
+ if (file.metadata->table_reader_handle) {
+ table_cache_->Release(file.metadata->table_reader_handle);
+ }
+ file.DeleteMetadata();
+ }
+
+ for (auto file_num : state.log_delete_files) {
+ if (file_num > 0) {
+ candidate_files.emplace_back(LogFileName(file_num),
+ immutable_db_options_.wal_dir);
+ }
+ }
+ for (const auto& filename : state.manifest_delete_files) {
+ candidate_files.emplace_back(filename, dbname_);
+ }
+
+ // dedup state.candidate_files so we don't try to delete the same
+ // file twice
+ std::sort(candidate_files.begin(), candidate_files.end(),
+ CompareCandidateFile);
+ candidate_files.erase(
+ std::unique(candidate_files.begin(), candidate_files.end()),
+ candidate_files.end());
+
+ if (state.prev_total_log_size > 0) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[JOB %d] Try to delete WAL files size %" PRIu64
+ ", prev total WAL file size %" PRIu64
+ ", number of live WAL files %" ROCKSDB_PRIszt ".\n",
+ state.job_id, state.size_log_to_delete,
+ state.prev_total_log_size, state.num_alive_log_files);
+ }
+
+ std::vector<std::string> old_info_log_files;
+ InfoLogPrefix info_log_prefix(!immutable_db_options_.db_log_dir.empty(),
+ dbname_);
+
+  // File numbers of the most recent two OPTIONS files in candidate_files
+  // (found in a previous FindObsoleteFiles(full_scan=true)).
+ // At this point, there must not be any duplicate file numbers in
+ // candidate_files.
+ uint64_t optsfile_num1 = std::numeric_limits<uint64_t>::min();
+ uint64_t optsfile_num2 = std::numeric_limits<uint64_t>::min();
+ for (const auto& candidate_file : candidate_files) {
+ const std::string& fname = candidate_file.file_name;
+ uint64_t number;
+ FileType type;
+ if (!ParseFileName(fname, &number, info_log_prefix.prefix, &type) ||
+ type != kOptionsFile) {
+ continue;
+ }
+ if (number > optsfile_num1) {
+ optsfile_num2 = optsfile_num1;
+ optsfile_num1 = number;
+ } else if (number > optsfile_num2) {
+ optsfile_num2 = number;
+ }
+ }
+
+ // Close WALs before trying to delete them.
+ for (const auto w : state.logs_to_free) {
+ // TODO: maybe check the return value of Close.
+ w->Close();
+ }
+
+ bool own_files = OwnTablesAndLogs();
+ std::unordered_set<uint64_t> files_to_del;
+ for (const auto& candidate_file : candidate_files) {
+ const std::string& to_delete = candidate_file.file_name;
+ uint64_t number;
+ FileType type;
+ // Ignore file if we cannot recognize it.
+ if (!ParseFileName(to_delete, &number, info_log_prefix.prefix, &type)) {
+ continue;
+ }
+
+ bool keep = true;
+ switch (type) {
+ case kLogFile:
+ keep = ((number >= state.log_number) ||
+ (number == state.prev_log_number) ||
+ (log_recycle_files_set.find(number) !=
+ log_recycle_files_set.end()));
+ break;
+ case kDescriptorFile:
+ // Keep my manifest file, and any newer incarnations'
+ // (can happen during manifest roll)
+ keep = (number >= state.manifest_file_number);
+ break;
+ case kTableFile:
+        // If the second condition were not there, this would make
+        // DontDeletePendingOutputs fail
+ keep = (sst_live_map.find(number) != sst_live_map.end()) ||
+ number >= state.min_pending_output;
+ if (!keep) {
+ files_to_del.insert(number);
+ }
+ break;
+ case kTempFile:
+ // Any temp files that are currently being written to must
+ // be recorded in pending_outputs_, which is inserted into "live".
+ // Also, SetCurrentFile creates a temp file when writing out new
+ // manifest, which is equal to state.pending_manifest_file_number. We
+ // should not delete that file
+ //
+ // TODO(yhchiang): carefully modify the third condition to safely
+ // remove the temp options files.
+ keep = (sst_live_map.find(number) != sst_live_map.end()) ||
+ (number == state.pending_manifest_file_number) ||
+ (to_delete.find(kOptionsFileNamePrefix) != std::string::npos);
+ break;
+ case kInfoLogFile:
+ keep = true;
+ if (number != 0) {
+ old_info_log_files.push_back(to_delete);
+ }
+ break;
+ case kOptionsFile:
+ keep = (number >= optsfile_num2);
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::PurgeObsoleteFiles:CheckOptionsFiles:1",
+ reinterpret_cast<void*>(&number));
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::PurgeObsoleteFiles:CheckOptionsFiles:2",
+ reinterpret_cast<void*>(&keep));
+ break;
+ case kCurrentFile:
+ case kDBLockFile:
+ case kIdentityFile:
+ case kMetaDatabase:
+ case kBlobFile:
+ keep = true;
+ break;
+ }
+
+ if (keep) {
+ continue;
+ }
+
+ std::string fname;
+ std::string dir_to_sync;
+ if (type == kTableFile) {
+ // evict from cache
+ TableCache::Evict(table_cache_.get(), number);
+ fname = MakeTableFileName(candidate_file.file_path, number);
+ dir_to_sync = candidate_file.file_path;
+ } else {
+ dir_to_sync =
+ (type == kLogFile) ? immutable_db_options_.wal_dir : dbname_;
+ fname = dir_to_sync +
+ ((!dir_to_sync.empty() && dir_to_sync.back() == '/') ||
+ (!to_delete.empty() && to_delete.front() == '/')
+ ? ""
+ : "/") +
+ to_delete;
+ }
+
+#ifndef ROCKSDB_LITE
+ if (type == kLogFile && (immutable_db_options_.wal_ttl_seconds > 0 ||
+ immutable_db_options_.wal_size_limit_mb > 0)) {
+ wal_manager_.ArchiveWALFile(fname, number);
+ continue;
+ }
+#endif // !ROCKSDB_LITE
+
+    // If I do not own these files, e.g. secondary instance with max_open_files
+    // = -1, then there is no need to delete or schedule deletion of these
+    // files since they will be removed by their owner, e.g. the primary
+    // instance.
+ if (!own_files) {
+ continue;
+ }
+ Status file_deletion_status;
+ if (schedule_only) {
+ InstrumentedMutexLock guard_lock(&mutex_);
+ SchedulePendingPurge(fname, dir_to_sync, type, number, state.job_id);
+ } else {
+ DeleteObsoleteFileImpl(state.job_id, fname, dir_to_sync, type, number);
+ }
+ }
+
+ {
+ // After purging obsolete files, remove them from files_grabbed_for_purge_.
+ InstrumentedMutexLock guard_lock(&mutex_);
+ autovector<uint64_t> to_be_removed;
+ for (auto fn : files_grabbed_for_purge_) {
+ if (files_to_del.count(fn) != 0) {
+ to_be_removed.emplace_back(fn);
+ }
+ }
+ for (auto fn : to_be_removed) {
+ files_grabbed_for_purge_.erase(fn);
+ }
+ }
+
+ // Delete old info log files.
+ size_t old_info_log_file_count = old_info_log_files.size();
+ if (old_info_log_file_count != 0 &&
+ old_info_log_file_count >= immutable_db_options_.keep_log_file_num) {
+ std::sort(old_info_log_files.begin(), old_info_log_files.end());
+ size_t end =
+ old_info_log_file_count - immutable_db_options_.keep_log_file_num;
+ for (unsigned int i = 0; i <= end; i++) {
+ std::string& to_delete = old_info_log_files.at(i);
+ std::string full_path_to_delete =
+ (immutable_db_options_.db_log_dir.empty()
+ ? dbname_
+ : immutable_db_options_.db_log_dir) +
+ "/" + to_delete;
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[JOB %d] Delete info log file %s\n", state.job_id,
+ full_path_to_delete.c_str());
+ Status s = env_->DeleteFile(full_path_to_delete);
+ if (!s.ok()) {
+ if (env_->FileExists(full_path_to_delete).IsNotFound()) {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "[JOB %d] Tried to delete non-existing info log file %s FAILED "
+ "-- %s\n",
+ state.job_id, to_delete.c_str(), s.ToString().c_str());
+ } else {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "[JOB %d] Delete info log file %s FAILED -- %s\n",
+ state.job_id, to_delete.c_str(),
+ s.ToString().c_str());
+ }
+ }
+ }
+ }
+#ifndef ROCKSDB_LITE
+ wal_manager_.PurgeObsoleteWALFiles();
+#endif // ROCKSDB_LITE
+ LogFlush(immutable_db_options_.info_log);
+ InstrumentedMutexLock l(&mutex_);
+ --pending_purge_obsolete_files_;
+ assert(pending_purge_obsolete_files_ >= 0);
+ if (pending_purge_obsolete_files_ == 0) {
+ bg_cv_.SignalAll();
+ }
+ TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:End");
+}
+
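+// Convenience wrapper: finds obsolete files with force=true and, with the DB
+// mutex temporarily released, purges whatever was found.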
+void DBImpl::DeleteObsoleteFiles() {
+ mutex_.AssertHeld();
+ JobContext job_context(next_job_id_.fetch_add(1));
+ FindObsoleteFiles(&job_context, true);
+
+ mutex_.Unlock();
+ if (job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+ mutex_.Lock();
+}
+
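+// Scans the mutable and immutable memtables of every column family other
+// than the one being flushed and returns the smallest WAL number that still
+// contains a prepared (2PC) section referenced by them, or 0 if none.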
+uint64_t FindMinPrepLogReferencedByMemTable(
+ VersionSet* vset, const ColumnFamilyData* cfd_to_flush,
+ const autovector<MemTable*>& memtables_to_flush) {
+ uint64_t min_log = 0;
+
+ // we must look through the memtables for two phase transactions
+ // that have been committed but not yet flushed
+ for (auto loop_cfd : *vset->GetColumnFamilySet()) {
+ if (loop_cfd->IsDropped() || loop_cfd == cfd_to_flush) {
+ continue;
+ }
+
+ auto log = loop_cfd->imm()->PrecomputeMinLogContainingPrepSection(
+ memtables_to_flush);
+
+ if (log > 0 && (min_log == 0 || log < min_log)) {
+ min_log = log;
+ }
+
+ log = loop_cfd->mem()->GetMinLogContainingPrepSection();
+
+ if (log > 0 && (min_log == 0 || log < min_log)) {
+ min_log = log;
+ }
+ }
+
+ return min_log;
+}
+
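+// Computes the minimum log number that must be kept after cfd_to_flush is
+// flushed: the largest log number recorded in edit_list (falling back to the
+// CF's current log number), combined with the minimum log that still has
+// unflushed data in other CFs and with logs still referenced by outstanding
+// 2PC prepared sections or memtables.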
+uint64_t PrecomputeMinLogNumberToKeep(
+ VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
+ autovector<VersionEdit*> edit_list,
+ const autovector<MemTable*>& memtables_to_flush,
+ LogsWithPrepTracker* prep_tracker) {
+ assert(vset != nullptr);
+ assert(prep_tracker != nullptr);
+ // Calculate updated min_log_number_to_keep
+ // Since the function should only be called in 2pc mode, log number in
+ // the version edit should be sufficient.
+
+ // Precompute the min log number containing unflushed data for the column
+ // family being flushed (`cfd_to_flush`).
+ uint64_t cf_min_log_number_to_keep = 0;
+ for (auto& e : edit_list) {
+ if (e->HasLogNumber()) {
+ cf_min_log_number_to_keep =
+ std::max(cf_min_log_number_to_keep, e->GetLogNumber());
+ }
+ }
+ if (cf_min_log_number_to_keep == 0) {
+ // No version edit contains information on log number. The log number
+ // for this column family should stay the same as it is.
+ cf_min_log_number_to_keep = cfd_to_flush.GetLogNumber();
+ }
+
+ // Get min log number containing unflushed data for other column families.
+ uint64_t min_log_number_to_keep =
+ vset->PreComputeMinLogNumberWithUnflushedData(&cfd_to_flush);
+ if (cf_min_log_number_to_keep != 0) {
+ min_log_number_to_keep =
+ std::min(cf_min_log_number_to_keep, min_log_number_to_keep);
+ }
+
+  // If we are in 2PC mode, we must consider logs containing prepared
+  // sections of outstanding transactions.
+  //
+  // We must check min logs with outstanding prep before we check
+  // logs referenced by memtables because a log referenced by the
+  // first data structure could transition to the second under us.
+ //
+  // TODO: we are iterating over all column families under the db mutex;
+  // we should find a more optimal solution
+ auto min_log_in_prep_heap =
+ prep_tracker->FindMinLogContainingOutstandingPrep();
+
+ if (min_log_in_prep_heap != 0 &&
+ min_log_in_prep_heap < min_log_number_to_keep) {
+ min_log_number_to_keep = min_log_in_prep_heap;
+ }
+
+ uint64_t min_log_refed_by_mem = FindMinPrepLogReferencedByMemTable(
+ vset, &cfd_to_flush, memtables_to_flush);
+
+ if (min_log_refed_by_mem != 0 &&
+ min_log_refed_by_mem < min_log_number_to_keep) {
+ min_log_number_to_keep = min_log_refed_by_mem;
+ }
+ return min_log_number_to_keep;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_open.cc b/src/rocksdb/db/db_impl/db_impl_open.cc
new file mode 100644
index 000000000..6ae4ead54
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_open.cc
@@ -0,0 +1,1651 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_impl/db_impl.h"
+
+#include <cinttypes>
+
+#include "db/builder.h"
+#include "db/error_handler.h"
+#include "env/composite_env_wrapper.h"
+#include "file/read_write_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "file/writable_file_writer.h"
+#include "monitoring/persistent_stats_history.h"
+#include "options/options_helper.h"
+#include "rocksdb/wal_filter.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "test_util/sync_point.h"
+#include "util/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+Options SanitizeOptions(const std::string& dbname, const Options& src) {
+ auto db_options = SanitizeOptions(dbname, DBOptions(src));
+ ImmutableDBOptions immutable_db_options(db_options);
+ auto cf_options =
+ SanitizeOptions(immutable_db_options, ColumnFamilyOptions(src));
+ return Options(db_options, cf_options);
+}
+
+DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) {
+ DBOptions result(src);
+
+ if (result.file_system == nullptr) {
+ if (result.env == Env::Default()) {
+ result.file_system = FileSystem::Default();
+ } else {
+ result.file_system.reset(new LegacyFileSystemWrapper(result.env));
+ }
+ } else {
+ if (result.env == nullptr) {
+ result.env = Env::Default();
+ }
+ }
+
+  // A result.max_open_files value of -1 means "infinite" open files.
+ if (result.max_open_files != -1) {
+ int max_max_open_files = port::GetMaxOpenFiles();
+ if (max_max_open_files == -1) {
+ max_max_open_files = 0x400000;
+ }
+ ClipToRange(&result.max_open_files, 20, max_max_open_files);
+ TEST_SYNC_POINT_CALLBACK("SanitizeOptions::AfterChangeMaxOpenFiles",
+ &result.max_open_files);
+ }
+
+ if (result.info_log == nullptr) {
+ Status s = CreateLoggerFromOptions(dbname, result, &result.info_log);
+ if (!s.ok()) {
+ // No place suitable for logging
+ result.info_log = nullptr;
+ }
+ }
+
+ if (!result.write_buffer_manager) {
+ result.write_buffer_manager.reset(
+ new WriteBufferManager(result.db_write_buffer_size));
+ }
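+  // Derive the flush/compaction limits from the max_background_* settings and
+  // grow the Env's LOW and HIGH priority thread pools if they are smaller
+  // than those limits.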
+ auto bg_job_limits = DBImpl::GetBGJobLimits(
+ result.max_background_flushes, result.max_background_compactions,
+ result.max_background_jobs, true /* parallelize_compactions */);
+ result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_compactions,
+ Env::Priority::LOW);
+ result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_flushes,
+ Env::Priority::HIGH);
+
+ if (result.rate_limiter.get() != nullptr) {
+ if (result.bytes_per_sync == 0) {
+ result.bytes_per_sync = 1024 * 1024;
+ }
+ }
+
+ if (result.delayed_write_rate == 0) {
+ if (result.rate_limiter.get() != nullptr) {
+ result.delayed_write_rate = result.rate_limiter->GetBytesPerSecond();
+ }
+ if (result.delayed_write_rate == 0) {
+ result.delayed_write_rate = 16 * 1024 * 1024;
+ }
+ }
+
+ if (result.WAL_ttl_seconds > 0 || result.WAL_size_limit_MB > 0) {
+ result.recycle_log_file_num = false;
+ }
+
+ if (result.recycle_log_file_num &&
+ (result.wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery ||
+ result.wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency)) {
+ // kPointInTimeRecovery is inconsistent with recycle log file feature since
+ // we define the "end" of the log as the first corrupt record we encounter.
+ // kAbsoluteConsistency doesn't make sense because even a clean
+ // shutdown leaves old junk at the end of the log file.
+ result.recycle_log_file_num = 0;
+ }
+
+ if (result.wal_dir.empty()) {
+ // Use dbname as default
+ result.wal_dir = dbname;
+ }
+ if (result.wal_dir.back() == '/') {
+ result.wal_dir = result.wal_dir.substr(0, result.wal_dir.size() - 1);
+ }
+
+ if (result.db_paths.size() == 0) {
+ result.db_paths.emplace_back(dbname, std::numeric_limits<uint64_t>::max());
+ }
+
+ if (result.use_direct_reads && result.compaction_readahead_size == 0) {
+ TEST_SYNC_POINT_CALLBACK("SanitizeOptions:direct_io", nullptr);
+ result.compaction_readahead_size = 1024 * 1024 * 2;
+ }
+
+ if (result.compaction_readahead_size > 0 || result.use_direct_reads) {
+ result.new_table_reader_for_compaction_inputs = true;
+ }
+
+ // Force flush on DB open if 2PC is enabled, since with 2PC we have no
+ // guarantee that consecutive log files have consecutive sequence id, which
+  // makes recovery complicated.
+ if (result.allow_2pc) {
+ result.avoid_flush_during_recovery = false;
+ }
+
+#ifndef ROCKSDB_LITE
+ ImmutableDBOptions immutable_db_options(result);
+ if (!IsWalDirSameAsDBPath(&immutable_db_options)) {
+ // Either the WAL dir and db_paths[0]/db_name are not the same, or we
+ // cannot tell for sure. In either case, assume they're different and
+    // explicitly clean up the trash log files (bypassing DeleteScheduler).
+    // Do this first so that even if we end up calling
+    // DeleteScheduler::CleanupDirectory on the same dir later, it will be
+    // safe.
+ std::vector<std::string> filenames;
+ result.env->GetChildren(result.wal_dir, &filenames);
+ for (std::string& filename : filenames) {
+ if (filename.find(".log.trash", filename.length() -
+ std::string(".log.trash").length()) !=
+ std::string::npos) {
+ std::string trash_file = result.wal_dir + "/" + filename;
+ result.env->DeleteFile(trash_file);
+ }
+ }
+ }
+  // When the DB is stopped, it's possible that there are some .trash files
+  // that were not deleted yet. When we open the DB, we will find these .trash
+  // files and schedule them to be deleted (or delete them immediately if an
+  // SstFileManager was not used).
+ auto sfm = static_cast<SstFileManagerImpl*>(result.sst_file_manager.get());
+ for (size_t i = 0; i < result.db_paths.size(); i++) {
+ DeleteScheduler::CleanupDirectory(result.env, sfm, result.db_paths[i].path);
+ }
+
+ // Create a default SstFileManager for purposes of tracking compaction size
+ // and facilitating recovery from out of space errors.
+ if (result.sst_file_manager.get() == nullptr) {
+ std::shared_ptr<SstFileManager> sst_file_manager(
+ NewSstFileManager(result.env, result.info_log));
+ result.sst_file_manager = sst_file_manager;
+ }
+#endif
+
+ if (!result.paranoid_checks) {
+ result.skip_checking_sst_file_sizes_on_db_open = true;
+ ROCKS_LOG_INFO(result.info_log,
+ "file size check will be skipped during open.");
+ }
+
+ return result;
+}
+
+namespace {
+Status SanitizeOptionsByTable(
+ const DBOptions& db_opts,
+ const std::vector<ColumnFamilyDescriptor>& column_families) {
+ Status s;
+ for (auto cf : column_families) {
+ s = cf.options.table_factory->SanitizeOptions(db_opts, cf.options);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ return Status::OK();
+}
+} // namespace
+
+Status DBImpl::ValidateOptions(
+ const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& column_families) {
+ Status s;
+ for (auto& cfd : column_families) {
+ s = ColumnFamilyData::ValidateOptions(db_options, cfd.options);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ s = ValidateOptions(db_options);
+ return s;
+}
+
+Status DBImpl::ValidateOptions(const DBOptions& db_options) {
+ if (db_options.db_paths.size() > 4) {
+ return Status::NotSupported(
+ "More than four DB paths are not supported yet. ");
+ }
+
+ if (db_options.allow_mmap_reads && db_options.use_direct_reads) {
+ // Protect against assert in PosixMMapReadableFile constructor
+ return Status::NotSupported(
+ "If memory mapped reads (allow_mmap_reads) are enabled "
+ "then direct I/O reads (use_direct_reads) must be disabled. ");
+ }
+
+ if (db_options.allow_mmap_writes &&
+ db_options.use_direct_io_for_flush_and_compaction) {
+ return Status::NotSupported(
+ "If memory mapped writes (allow_mmap_writes) are enabled "
+ "then direct I/O writes (use_direct_io_for_flush_and_compaction) must "
+ "be disabled. ");
+ }
+
+ if (db_options.keep_log_file_num == 0) {
+ return Status::InvalidArgument("keep_log_file_num must be greater than 0");
+ }
+
+ if (db_options.unordered_write &&
+ !db_options.allow_concurrent_memtable_write) {
+ return Status::InvalidArgument(
+ "unordered_write is incompatible with !allow_concurrent_memtable_write");
+ }
+
+ if (db_options.unordered_write && db_options.enable_pipelined_write) {
+ return Status::InvalidArgument(
+ "unordered_write is incompatible with enable_pipelined_write");
+ }
+
+ if (db_options.atomic_flush && db_options.enable_pipelined_write) {
+ return Status::InvalidArgument(
+ "atomic_flush is incompatible with enable_pipelined_write");
+ }
+
+ return Status::OK();
+}
+
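+// Creates a brand-new database: writes the IDENTITY file, builds the first
+// descriptor file (manifest number 1) describing an empty DB, and points
+// CURRENT at it. The partially written manifest is deleted if any step fails.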
+Status DBImpl::NewDB() {
+ VersionEdit new_db;
+ Status s = SetIdentityFile(env_, dbname_);
+ if (!s.ok()) {
+ return s;
+ }
+ if (immutable_db_options_.write_dbid_to_manifest) {
+ std::string temp_db_id;
+ GetDbIdentityFromIdentityFile(&temp_db_id);
+ new_db.SetDBId(temp_db_id);
+ }
+ new_db.SetLogNumber(0);
+ new_db.SetNextFile(2);
+ new_db.SetLastSequence(0);
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Creating manifest 1 \n");
+ const std::string manifest = DescriptorFileName(dbname_, 1);
+ {
+ std::unique_ptr<FSWritableFile> file;
+ FileOptions file_options = fs_->OptimizeForManifestWrite(file_options_);
+ s = NewWritableFile(fs_.get(), manifest, &file, file_options);
+ if (!s.ok()) {
+ return s;
+ }
+ file->SetPreallocationBlockSize(
+ immutable_db_options_.manifest_preallocation_size);
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ std::move(file), manifest, file_options, env_, nullptr /* stats */,
+ immutable_db_options_.listeners));
+ log::Writer log(std::move(file_writer), 0, false);
+ std::string record;
+ new_db.EncodeTo(&record);
+ s = log.AddRecord(record);
+ if (s.ok()) {
+ s = SyncManifest(env_, &immutable_db_options_, log.file());
+ }
+ }
+ if (s.ok()) {
+ // Make "CURRENT" file that points to the new manifest file.
+ s = SetCurrentFile(env_, dbname_, 1, directories_.GetDbDir());
+ } else {
+ fs_->DeleteFile(manifest, IOOptions(), nullptr);
+ }
+ return s;
+}
+
+Status DBImpl::CreateAndNewDirectory(Env* env, const std::string& dirname,
+ std::unique_ptr<Directory>* directory) {
+ // We call CreateDirIfMissing() as the directory may already exist (if we
+ // are reopening a DB), when this happens we don't want creating the
+ // directory to cause an error. However, we need to check if creating the
+ // directory fails or else we may get an obscure message about the lock
+ // file not existing. One real-world example of this occurring is if
+ // env->CreateDirIfMissing() doesn't create intermediate directories, e.g.
+  // when dbname_ is "dir/db" but "dir" doesn't exist.
+ Status s = env->CreateDirIfMissing(dirname);
+ if (!s.ok()) {
+ return s;
+ }
+ return env->NewDirectory(dirname, directory);
+}
+
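+// Opens (creating them if missing) Directory handles for the DB directory, a
+// separate WAL directory when it differs from the DB directory, and each
+// extra data path; data paths equal to the DB directory get a nullptr
+// placeholder.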
+Status Directories::SetDirectories(Env* env, const std::string& dbname,
+ const std::string& wal_dir,
+ const std::vector<DbPath>& data_paths) {
+ Status s = DBImpl::CreateAndNewDirectory(env, dbname, &db_dir_);
+ if (!s.ok()) {
+ return s;
+ }
+ if (!wal_dir.empty() && dbname != wal_dir) {
+ s = DBImpl::CreateAndNewDirectory(env, wal_dir, &wal_dir_);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ data_dirs_.clear();
+ for (auto& p : data_paths) {
+ const std::string db_path = p.path;
+ if (db_path == dbname) {
+ data_dirs_.emplace_back(nullptr);
+ } else {
+ std::unique_ptr<Directory> path_directory;
+ s = DBImpl::CreateAndNewDirectory(env, db_path, &path_directory);
+ if (!s.ok()) {
+ return s;
+ }
+ data_dirs_.emplace_back(path_directory.release());
+ }
+ }
+ assert(data_dirs_.size() == data_paths.size());
+ return Status::OK();
+}
+
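+// Recovers the DB state on open: locks and sets up the DB directories (unless
+// read-only), creates a new DB if allowed and none exists, recovers the
+// MANIFEST and DB ID, and replays any WAL files that are newer than the ones
+// recorded in the descriptor.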
+Status DBImpl::Recover(
+ const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
+ bool error_if_log_file_exist, bool error_if_data_exists_in_logs,
+ uint64_t* recovered_seq) {
+ mutex_.AssertHeld();
+
+ bool is_new_db = false;
+ assert(db_lock_ == nullptr);
+ if (!read_only) {
+ Status s = directories_.SetDirectories(env_, dbname_,
+ immutable_db_options_.wal_dir,
+ immutable_db_options_.db_paths);
+ if (!s.ok()) {
+ return s;
+ }
+
+ s = env_->LockFile(LockFileName(dbname_), &db_lock_);
+ if (!s.ok()) {
+ return s;
+ }
+
+ std::string current_fname = CurrentFileName(dbname_);
+ s = env_->FileExists(current_fname);
+ if (s.IsNotFound()) {
+ if (immutable_db_options_.create_if_missing) {
+ s = NewDB();
+ is_new_db = true;
+ if (!s.ok()) {
+ return s;
+ }
+ } else {
+ return Status::InvalidArgument(
+ current_fname, "does not exist (create_if_missing is false)");
+ }
+ } else if (s.ok()) {
+ if (immutable_db_options_.error_if_exists) {
+ return Status::InvalidArgument(dbname_,
+ "exists (error_if_exists is true)");
+ }
+ } else {
+ // Unexpected error reading file
+ assert(s.IsIOError());
+ return s;
+ }
+ // Verify compatibility of file_options_ and filesystem
+ {
+ std::unique_ptr<FSRandomAccessFile> idfile;
+ FileOptions customized_fs(file_options_);
+ customized_fs.use_direct_reads |=
+ immutable_db_options_.use_direct_io_for_flush_and_compaction;
+ s = fs_->NewRandomAccessFile(current_fname, customized_fs, &idfile,
+ nullptr);
+ if (!s.ok()) {
+ std::string error_str = s.ToString();
+ // Check if unsupported Direct I/O is the root cause
+ customized_fs.use_direct_reads = false;
+ s = fs_->NewRandomAccessFile(current_fname, customized_fs, &idfile,
+ nullptr);
+ if (s.ok()) {
+ return Status::InvalidArgument(
+ "Direct I/O is not supported by the specified DB.");
+ } else {
+ return Status::InvalidArgument(
+ "Found options incompatible with filesystem", error_str.c_str());
+ }
+ }
+ }
+ }
+ assert(db_id_.empty());
+ Status s = versions_->Recover(column_families, read_only, &db_id_);
+ if (!s.ok()) {
+ return s;
+ }
+  // This happens when immutable_db_options_.write_dbid_to_manifest is set to
+  // true for the very first time.
+ if (db_id_.empty()) {
+ // Check for the IDENTITY file and create it if not there.
+ s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr);
+    // Typically the IDENTITY file is created in NewDB(); if for some reason
+    // it is no longer available, then at this point the DB ID is in neither
+    // the IDENTITY file nor the MANIFEST.
+ if (s.IsNotFound()) {
+ s = SetIdentityFile(env_, dbname_);
+ if (!s.ok()) {
+ return s;
+ }
+ } else if (!s.ok()) {
+ assert(s.IsIOError());
+ return s;
+ }
+ s = GetDbIdentityFromIdentityFile(&db_id_);
+ if (immutable_db_options_.write_dbid_to_manifest && s.ok()) {
+ VersionEdit edit;
+ edit.SetDBId(db_id_);
+ Options options;
+ MutableCFOptions mutable_cf_options(options);
+ versions_->db_id_ = db_id_;
+ s = versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
+ mutable_cf_options, &edit, &mutex_, nullptr,
+ false);
+ }
+ } else {
+ s = SetIdentityFile(env_, dbname_, db_id_);
+ }
+
+ if (immutable_db_options_.paranoid_checks && s.ok()) {
+ s = CheckConsistency();
+ }
+ if (s.ok() && !read_only) {
+ std::map<std::string, std::shared_ptr<Directory>> created_dirs;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ s = cfd->AddDirectories(&created_dirs);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ }
+ // DB mutex is already held
+ if (s.ok() && immutable_db_options_.persist_stats_to_disk) {
+ s = InitPersistStatsColumnFamily();
+ }
+
+ if (s.ok()) {
+    // Initialize max_total_in_memory_state_ before recovering the logs. Log
+    // recovery may check this value to decide whether to flush.
+ max_total_in_memory_state_ = 0;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+ max_total_in_memory_state_ += mutable_cf_options->write_buffer_size *
+ mutable_cf_options->max_write_buffer_number;
+ }
+
+ SequenceNumber next_sequence(kMaxSequenceNumber);
+ default_cf_handle_ = new ColumnFamilyHandleImpl(
+ versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
+ default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats();
+ // TODO(Zhongyi): handle single_column_family_mode_ when
+ // persistent_stats is enabled
+ single_column_family_mode_ =
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1;
+
+ // Recover from all newer log files than the ones named in the
+ // descriptor (new log files may have been added by the previous
+ // incarnation without registering them in the descriptor).
+ //
+ // Note that prev_log_number() is no longer used, but we pay
+ // attention to it in case we are recovering a database
+ // produced by an older version of rocksdb.
+ std::vector<std::string> filenames;
+ s = env_->GetChildren(immutable_db_options_.wal_dir, &filenames);
+ if (s.IsNotFound()) {
+ return Status::InvalidArgument("wal_dir not found",
+ immutable_db_options_.wal_dir);
+ } else if (!s.ok()) {
+ return s;
+ }
+
+ std::vector<uint64_t> logs;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(filenames[i], &number, &type) && type == kLogFile) {
+ if (is_new_db) {
+ return Status::Corruption(
+ "While creating a new Db, wal_dir contains "
+ "existing log file: ",
+ filenames[i]);
+ } else {
+ logs.push_back(number);
+ }
+ }
+ }
+
+ if (logs.size() > 0) {
+ if (error_if_log_file_exist) {
+ return Status::Corruption(
+            "The db was opened in readonly mode with error_if_log_file_exist "
+            "flag but a log file already exists");
+ } else if (error_if_data_exists_in_logs) {
+ for (auto& log : logs) {
+ std::string fname = LogFileName(immutable_db_options_.wal_dir, log);
+ uint64_t bytes;
+ s = env_->GetFileSize(fname, &bytes);
+ if (s.ok()) {
+ if (bytes > 0) {
+ return Status::Corruption(
+                  "error_if_data_exists_in_logs is set but there is data "
+                  "in log files.");
+ }
+ }
+ }
+ }
+ }
+
+ if (!logs.empty()) {
+ // Recover in the order in which the logs were generated
+ std::sort(logs.begin(), logs.end());
+ bool corrupted_log_found = false;
+ s = RecoverLogFiles(logs, &next_sequence, read_only,
+ &corrupted_log_found);
+ if (corrupted_log_found && recovered_seq != nullptr) {
+ *recovered_seq = next_sequence;
+ }
+ if (!s.ok()) {
+ // Clear memtables if recovery failed
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ kMaxSequenceNumber);
+ }
+ }
+ }
+ }
+
+ if (read_only) {
+ // If we are opening as read-only, we need to update options_file_number_
+ // to reflect the most recent OPTIONS file. It does not matter for regular
+ // read-write db instance because options_file_number_ will later be
+ // updated to versions_->NewFileNumber() in RenameTempFileToOptionsFile.
+ std::vector<std::string> file_names;
+ if (s.ok()) {
+ s = env_->GetChildren(GetName(), &file_names);
+ }
+ if (s.ok()) {
+ uint64_t number = 0;
+ uint64_t options_file_number = 0;
+ FileType type;
+ for (const auto& fname : file_names) {
+ if (ParseFileName(fname, &number, &type) && type == kOptionsFile) {
+ options_file_number = std::max(number, options_file_number);
+ }
+ }
+ versions_->options_file_number_ = options_file_number;
+ }
+ }
+
+ return s;
+}
+
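+// Checks the format/compatible version keys stored in the persistent stats
+// column family. If they cannot be read, or are newer than this binary
+// supports, the stats CF is dropped and recreated; the current version keys
+// are written whenever the CF is (re)created.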
+Status DBImpl::PersistentStatsProcessFormatVersion() {
+ mutex_.AssertHeld();
+ Status s;
+ // persist version when stats CF doesn't exist
+ bool should_persist_format_version = !persistent_stats_cfd_exists_;
+ mutex_.Unlock();
+ if (persistent_stats_cfd_exists_) {
+ // Check persistent stats format version compatibility. Drop and recreate
+ // persistent stats CF if format version is incompatible
+ uint64_t format_version_recovered = 0;
+ Status s_format = DecodePersistentStatsVersionNumber(
+ this, StatsVersionKeyType::kFormatVersion, &format_version_recovered);
+ uint64_t compatible_version_recovered = 0;
+ Status s_compatible = DecodePersistentStatsVersionNumber(
+ this, StatsVersionKeyType::kCompatibleVersion,
+ &compatible_version_recovered);
+  // abort reading from the existing stats CF if any of the following is
+  // true:
+ // 1. failed to read format version or compatible version from disk
+ // 2. sst's format version is greater than current format version, meaning
+ // this sst is encoded with a newer RocksDB release, and current compatible
+ // version is below the sst's compatible version
+ if (!s_format.ok() || !s_compatible.ok() ||
+ (kStatsCFCurrentFormatVersion < format_version_recovered &&
+ kStatsCFCompatibleFormatVersion < compatible_version_recovered)) {
+ if (!s_format.ok() || !s_compatible.ok()) {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "Reading persistent stats version key failed. Format key: %s, "
+ "compatible key: %s",
+ s_format.ToString().c_str(), s_compatible.ToString().c_str());
+ } else {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "Disable persistent stats due to corrupted or incompatible format "
+ "version\n");
+ }
+ DropColumnFamily(persist_stats_cf_handle_);
+ DestroyColumnFamilyHandle(persist_stats_cf_handle_);
+ ColumnFamilyHandle* handle = nullptr;
+ ColumnFamilyOptions cfo;
+ OptimizeForPersistentStats(&cfo);
+ s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle);
+ persist_stats_cf_handle_ = static_cast<ColumnFamilyHandleImpl*>(handle);
+ // should also persist version here because old stats CF is discarded
+ should_persist_format_version = true;
+ }
+ }
+ if (s.ok() && should_persist_format_version) {
+    // The persistent stats CF is being created for the first time; we need
+    // to write the format version key
+ WriteBatch batch;
+ batch.Put(persist_stats_cf_handle_, kFormatVersionKeyString,
+ ToString(kStatsCFCurrentFormatVersion));
+ batch.Put(persist_stats_cf_handle_, kCompatibleVersionKeyString,
+ ToString(kStatsCFCompatibleFormatVersion));
+ WriteOptions wo;
+ wo.low_pri = true;
+ wo.no_slowdown = true;
+ wo.sync = false;
+ s = Write(wo, &batch);
+ }
+ mutex_.Lock();
+ return s;
+}
+
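+// Looks up the persistent stats column family recovered from the MANIFEST;
+// wraps it in a handle if it exists, otherwise creates it (temporarily
+// releasing the DB mutex to do so).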
+Status DBImpl::InitPersistStatsColumnFamily() {
+ mutex_.AssertHeld();
+ assert(!persist_stats_cf_handle_);
+ ColumnFamilyData* persistent_stats_cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(
+ kPersistentStatsColumnFamilyName);
+ persistent_stats_cfd_exists_ = persistent_stats_cfd != nullptr;
+
+ Status s;
+ if (persistent_stats_cfd != nullptr) {
+    // We are recovering from a DB which already contains a persistent stats
+    // CF; the CF was already created in VersionSet::ApplyOneVersionEdit, but
+    // the column family handle was not. We need to explicitly create the
+    // handle here.
+ persist_stats_cf_handle_ =
+ new ColumnFamilyHandleImpl(persistent_stats_cfd, this, &mutex_);
+ } else {
+ mutex_.Unlock();
+ ColumnFamilyHandle* handle = nullptr;
+ ColumnFamilyOptions cfo;
+ OptimizeForPersistentStats(&cfo);
+ s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle);
+ persist_stats_cf_handle_ = static_cast<ColumnFamilyHandleImpl*>(handle);
+ mutex_.Lock();
+ }
+ return s;
+}
+
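+// Replays the given WAL files in order, inserting their records into the
+// memtables (subject to the configured WALRecoveryMode and optional
+// WalFilter), writing level-0 tables when a memtable fills up during
+// recovery, and recording the recovered log numbers in per-CF version edits.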
+// REQUIRES: log_numbers are sorted in ascending order
+Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
+ SequenceNumber* next_sequence, bool read_only,
+ bool* corrupted_log_found) {
+ struct LogReporter : public log::Reader::Reporter {
+ Env* env;
+ Logger* info_log;
+ const char* fname;
+ Status* status; // nullptr if immutable_db_options_.paranoid_checks==false
+ void Corruption(size_t bytes, const Status& s) override {
+ ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s",
+ (this->status == nullptr ? "(ignoring error) " : ""),
+ fname, static_cast<int>(bytes), s.ToString().c_str());
+ if (this->status != nullptr && this->status->ok()) {
+ *this->status = s;
+ }
+ }
+ };
+
+ mutex_.AssertHeld();
+ Status status;
+ std::unordered_map<int, VersionEdit> version_edits;
+ // no need to refcount because iteration is under mutex
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ VersionEdit edit;
+ edit.SetColumnFamily(cfd->GetID());
+ version_edits.insert({cfd->GetID(), edit});
+ }
+ int job_id = next_job_id_.fetch_add(1);
+ {
+ auto stream = event_logger_.Log();
+ stream << "job" << job_id << "event"
+ << "recovery_started";
+ stream << "log_files";
+ stream.StartArray();
+ for (auto log_number : log_numbers) {
+ stream << log_number;
+ }
+ stream.EndArray();
+ }
+
+#ifndef ROCKSDB_LITE
+ if (immutable_db_options_.wal_filter != nullptr) {
+ std::map<std::string, uint32_t> cf_name_id_map;
+ std::map<uint32_t, uint64_t> cf_lognumber_map;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ cf_name_id_map.insert(std::make_pair(cfd->GetName(), cfd->GetID()));
+ cf_lognumber_map.insert(
+ std::make_pair(cfd->GetID(), cfd->GetLogNumber()));
+ }
+
+ immutable_db_options_.wal_filter->ColumnFamilyLogNumberMap(cf_lognumber_map,
+ cf_name_id_map);
+ }
+#endif
+
+ bool stop_replay_by_wal_filter = false;
+ bool stop_replay_for_corruption = false;
+ bool flushed = false;
+ uint64_t corrupted_log_number = kMaxSequenceNumber;
+ uint64_t min_log_number = MinLogNumberToKeep();
+ for (auto log_number : log_numbers) {
+ if (log_number < min_log_number) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Skipping log #%" PRIu64
+ " since it is older than min log to keep #%" PRIu64,
+ log_number, min_log_number);
+ continue;
+ }
+ // The previous incarnation may not have written any MANIFEST
+ // records after allocating this log number. So we manually
+ // update the file number allocation counter in VersionSet.
+ versions_->MarkFileNumberUsed(log_number);
+ // Open the log file
+ std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number);
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Recovering log #%" PRIu64 " mode %d", log_number,
+ static_cast<int>(immutable_db_options_.wal_recovery_mode));
+ auto logFileDropped = [this, &fname]() {
+ uint64_t bytes;
+ if (env_->GetFileSize(fname, &bytes).ok()) {
+ auto info_log = immutable_db_options_.info_log.get();
+ ROCKS_LOG_WARN(info_log, "%s: dropping %d bytes", fname.c_str(),
+ static_cast<int>(bytes));
+ }
+ };
+ if (stop_replay_by_wal_filter) {
+ logFileDropped();
+ continue;
+ }
+
+ std::unique_ptr<SequentialFileReader> file_reader;
+ {
+ std::unique_ptr<FSSequentialFile> file;
+ status = fs_->NewSequentialFile(fname,
+ fs_->OptimizeForLogRead(file_options_),
+ &file, nullptr);
+ if (!status.ok()) {
+ MaybeIgnoreError(&status);
+ if (!status.ok()) {
+ return status;
+ } else {
+          // Failed on one log file, but that's OK.
+          // Try the next one.
+ continue;
+ }
+ }
+ file_reader.reset(new SequentialFileReader(
+ std::move(file), fname, immutable_db_options_.log_readahead_size));
+ }
+
+ // Create the log reader.
+ LogReporter reporter;
+ reporter.env = env_;
+ reporter.info_log = immutable_db_options_.info_log.get();
+ reporter.fname = fname.c_str();
+ if (!immutable_db_options_.paranoid_checks ||
+ immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kSkipAnyCorruptedRecords) {
+ reporter.status = nullptr;
+ } else {
+ reporter.status = &status;
+ }
+    // We intentionally make log::Reader do checksumming even if
+ // paranoid_checks==false so that corruptions cause entire commits
+ // to be skipped instead of propagating bad information (like overly
+ // large sequence numbers).
+ log::Reader reader(immutable_db_options_.info_log, std::move(file_reader),
+ &reporter, true /*checksum*/, log_number);
+
+    // Determine if we should tolerate incomplete records at the tail end of
+    // the log, then read all the records and add them to a memtable.
+ std::string scratch;
+ Slice record;
+ WriteBatch batch;
+
+ while (!stop_replay_by_wal_filter &&
+ reader.ReadRecord(&record, &scratch,
+ immutable_db_options_.wal_recovery_mode) &&
+ status.ok()) {
+ if (record.size() < WriteBatchInternal::kHeader) {
+ reporter.Corruption(record.size(),
+ Status::Corruption("log record too small"));
+ continue;
+ }
+ WriteBatchInternal::SetContents(&batch, record);
+ SequenceNumber sequence = WriteBatchInternal::Sequence(&batch);
+
+ if (immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kPointInTimeRecovery) {
+          // In point-in-time recovery mode, if the sequence ids of the log
+          // files are consecutive, we continue recovery despite corruption.
+          // This could happen when we open and write to a corrupted DB, where
+          // the sequence id will start from the last sequence id we recovered.
+ if (sequence == *next_sequence) {
+ stop_replay_for_corruption = false;
+ }
+ if (stop_replay_for_corruption) {
+ logFileDropped();
+ break;
+ }
+ }
+
+#ifndef ROCKSDB_LITE
+ if (immutable_db_options_.wal_filter != nullptr) {
+ WriteBatch new_batch;
+ bool batch_changed = false;
+
+ WalFilter::WalProcessingOption wal_processing_option =
+ immutable_db_options_.wal_filter->LogRecordFound(
+ log_number, fname, batch, &new_batch, &batch_changed);
+
+ switch (wal_processing_option) {
+ case WalFilter::WalProcessingOption::kContinueProcessing:
+            // do nothing, proceed normally
+ break;
+ case WalFilter::WalProcessingOption::kIgnoreCurrentRecord:
+ // skip current record
+ continue;
+ case WalFilter::WalProcessingOption::kStopReplay:
+ // skip current record and stop replay
+ stop_replay_by_wal_filter = true;
+ continue;
+ case WalFilter::WalProcessingOption::kCorruptedRecord: {
+ status =
+ Status::Corruption("Corruption reported by Wal Filter ",
+ immutable_db_options_.wal_filter->Name());
+ MaybeIgnoreError(&status);
+ if (!status.ok()) {
+ reporter.Corruption(record.size(), status);
+ continue;
+ }
+ break;
+ }
+ default: {
+ assert(false); // unhandled case
+ status = Status::NotSupported(
+ "Unknown WalProcessingOption returned"
+ " by Wal Filter ",
+ immutable_db_options_.wal_filter->Name());
+ MaybeIgnoreError(&status);
+ if (!status.ok()) {
+ return status;
+ } else {
+ // Ignore the error with current record processing.
+ continue;
+ }
+ }
+ }
+
+ if (batch_changed) {
+ // Make sure that the count in the new batch is
+            // within the original count.
+ int new_count = WriteBatchInternal::Count(&new_batch);
+ int original_count = WriteBatchInternal::Count(&batch);
+ if (new_count > original_count) {
+ ROCKS_LOG_FATAL(
+ immutable_db_options_.info_log,
+ "Recovering log #%" PRIu64
+ " mode %d log filter %s returned "
+ "more records (%d) than original (%d) which is not allowed. "
+ "Aborting recovery.",
+ log_number,
+ static_cast<int>(immutable_db_options_.wal_recovery_mode),
+ immutable_db_options_.wal_filter->Name(), new_count,
+ original_count);
+ status = Status::NotSupported(
+ "More than original # of records "
+ "returned by Wal Filter ",
+ immutable_db_options_.wal_filter->Name());
+ return status;
+ }
+ // Set the same sequence number in the new_batch
+ // as the original batch.
+ WriteBatchInternal::SetSequence(&new_batch,
+ WriteBatchInternal::Sequence(&batch));
+ batch = new_batch;
+ }
+ }
+#endif // ROCKSDB_LITE
+
+      // If a column family was not found, it might mean that the WAL write
+      // batch references a column family that was dropped after the insert.
+      // We don't want to fail the whole write batch in that case --
+      // we just ignore the update.
+      // That's why we set ignore missing column families to true.
+ bool has_valid_writes = false;
+ status = WriteBatchInternal::InsertInto(
+ &batch, column_family_memtables_.get(), &flush_scheduler_,
+ &trim_history_scheduler_, true, log_number, this,
+ false /* concurrent_memtable_writes */, next_sequence,
+ &has_valid_writes, seq_per_batch_, batch_per_txn_);
+ MaybeIgnoreError(&status);
+ if (!status.ok()) {
+ // We are treating this as a failure while reading since we read valid
+ // blocks that do not form coherent data
+ reporter.Corruption(record.size(), status);
+ continue;
+ }
+
+ if (has_valid_writes && !read_only) {
+        // we can do this because this is called before the client has access
+        // to the DB and there is only a single thread operating on the DB
+ ColumnFamilyData* cfd;
+
+ while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) {
+ cfd->UnrefAndTryDelete();
+ // If this asserts, it means that InsertInto failed in
+ // filtering updates to already-flushed column families
+ assert(cfd->GetLogNumber() <= log_number);
+ auto iter = version_edits.find(cfd->GetID());
+ assert(iter != version_edits.end());
+ VersionEdit* edit = &iter->second;
+ status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
+ if (!status.ok()) {
+ // Reflect errors immediately so that conditions like full
+ // file-systems cause the DB::Open() to fail.
+ return status;
+ }
+ flushed = true;
+
+ cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ *next_sequence);
+ }
+ }
+ }
+
+ if (!status.ok()) {
+ if (status.IsNotSupported()) {
+ // We should not treat NotSupported as corruption. It is rather a clear
+ // sign that we are processing a WAL that is produced by an incompatible
+ // version of the code.
+ return status;
+ }
+ if (immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kSkipAnyCorruptedRecords) {
+ // We should ignore all errors unconditionally
+ status = Status::OK();
+ } else if (immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kPointInTimeRecovery) {
+ // We should ignore the error but not continue replaying
+ status = Status::OK();
+ stop_replay_for_corruption = true;
+ corrupted_log_number = log_number;
+ if (corrupted_log_found != nullptr) {
+ *corrupted_log_found = true;
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Point in time recovered to log #%" PRIu64
+ " seq #%" PRIu64,
+ log_number, *next_sequence);
+ } else {
+ assert(immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kTolerateCorruptedTailRecords ||
+ immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kAbsoluteConsistency);
+ return status;
+ }
+ }
+
+ flush_scheduler_.Clear();
+ trim_history_scheduler_.Clear();
+ auto last_sequence = *next_sequence - 1;
+ if ((*next_sequence != kMaxSequenceNumber) &&
+ (versions_->LastSequence() <= last_sequence)) {
+ versions_->SetLastAllocatedSequence(last_sequence);
+ versions_->SetLastPublishedSequence(last_sequence);
+ versions_->SetLastSequence(last_sequence);
+ }
+ }
+  // Compare the corrupted log number to every column family's current log
+  // number. Abort Open() if any column family's log number is greater than
+  // the corrupted log number, which means the CF contains data beyond the
+  // point of corruption. This could happen during PIT recovery when the WAL
+  // is corrupted and some (but not all) CFs are flushed.
+  // Exclude the PIT case where no log is dropped after the corruption point.
+  // This covers the case of empty logs after the corrupted log, in which we
+  // don't reset stop_replay_for_corruption.
+ if (stop_replay_for_corruption == true &&
+ (immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kPointInTimeRecovery ||
+ immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kTolerateCorruptedTailRecords)) {
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->GetLogNumber() > corrupted_log_number) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Column family inconsistency: SST file contains data"
+ " beyond the point of corruption.");
+ return Status::Corruption("SST file is ahead of WALs");
+ }
+ }
+ }
+
+ // True if there's any data in the WALs; if not, we can skip re-processing
+ // them later
+ bool data_seen = false;
+ if (!read_only) {
+    // no need to refcount since the client still doesn't have access
+    // to the DB and cannot drop column families while we iterate
+ auto max_log_number = log_numbers.back();
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ auto iter = version_edits.find(cfd->GetID());
+ assert(iter != version_edits.end());
+ VersionEdit* edit = &iter->second;
+
+ if (cfd->GetLogNumber() > max_log_number) {
+ // Column family cfd has already flushed the data
+ // from all logs. Memtable has to be empty because
+ // we filter the updates based on log_number
+ // (in WriteBatch::InsertInto)
+ assert(cfd->mem()->GetFirstSequenceNumber() == 0);
+ assert(edit->NumEntries() == 0);
+ continue;
+ }
+
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::RecoverLogFiles:BeforeFlushFinalMemtable", /*arg=*/nullptr);
+
+ // flush the final memtable (if non-empty)
+ if (cfd->mem()->GetFirstSequenceNumber() != 0) {
+ // If flush happened in the middle of recovery (e.g. due to memtable
+ // being full), we flush at the end. Otherwise we'll need to record
+        // where we were at the last flush, which makes the logic complicated.
+ if (flushed || !immutable_db_options_.avoid_flush_during_recovery) {
+ status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
+ if (!status.ok()) {
+ // Recovery failed
+ break;
+ }
+ flushed = true;
+
+ cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ versions_->LastSequence());
+ }
+ data_seen = true;
+ }
+
+ // Update the log number info in the version edit corresponding to this
+ // column family. Note that the version edits will be written to MANIFEST
+ // together later.
+      // Writing log_number in the manifest means that any log file
+      // with a number strictly less than (log_number + 1) is already
+      // recovered and should be ignored on the next reincarnation.
+      // Since we already recovered max_log_number, we want all logs
+      // with numbers `<= max_log_number` (including this one) to be ignored.
+ if (flushed || cfd->mem()->GetFirstSequenceNumber() == 0) {
+ edit->SetLogNumber(max_log_number + 1);
+ }
+ }
+ if (status.ok()) {
+      // We must mark the next log number as used, even though it's
+      // not actually used. That is because VersionSet assumes
+      // VersionSet::next_file_number_ to always be strictly greater than any
+      // log number.
+ versions_->MarkFileNumberUsed(max_log_number + 1);
+
+ autovector<ColumnFamilyData*> cfds;
+ autovector<const MutableCFOptions*> cf_opts;
+ autovector<autovector<VersionEdit*>> edit_lists;
+ for (auto* cfd : *versions_->GetColumnFamilySet()) {
+ cfds.push_back(cfd);
+ cf_opts.push_back(cfd->GetLatestMutableCFOptions());
+ auto iter = version_edits.find(cfd->GetID());
+ assert(iter != version_edits.end());
+ edit_lists.push_back({&iter->second});
+ }
+ // write MANIFEST with update
+ status = versions_->LogAndApply(cfds, cf_opts, edit_lists, &mutex_,
+ directories_.GetDbDir(),
+ /*new_descriptor_log=*/true);
+ }
+ }
+
+ if (status.ok() && data_seen && !flushed) {
+ status = RestoreAliveLogFiles(log_numbers);
+ }
+
+ event_logger_.Log() << "job" << job_id << "event"
+ << "recovery_finished";
+
+ return status;
+}
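+
+// The recovery-mode branches above are driven entirely by
+// DBOptions::wal_recovery_mode. A minimal, hedged sketch of selecting a mode
+// before opening the DB follows; the path is a placeholder and error handling
+// is trimmed, so treat it as an illustration rather than a reference:
+//
+//   #include "rocksdb/db.h"
+//   #include "rocksdb/options.h"
+//
+//   rocksdb::Options options;
+//   options.create_if_missing = true;
+//   // Stop replay at the first corruption instead of failing Open().
+//   options.wal_recovery_mode =
+//       rocksdb::WALRecoveryMode::kPointInTimeRecovery;
+//   rocksdb::DB* db = nullptr;
+//   rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/example_db", &db);
+//   // ... use db, then:
+//   delete db;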
+
+Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& log_numbers) {
+ if (log_numbers.empty()) {
+ return Status::OK();
+ }
+ Status s;
+ mutex_.AssertHeld();
+ assert(immutable_db_options_.avoid_flush_during_recovery);
+ if (two_write_queues_) {
+ log_write_mutex_.Lock();
+ }
+ // Mark these as alive so they'll be considered for deletion later by
+ // FindObsoleteFiles()
+ total_log_size_ = 0;
+ log_empty_ = false;
+ for (auto log_number : log_numbers) {
+ LogFileNumberSize log(log_number);
+ std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number);
+    // This gets the apparent size of the logs, not including preallocated space.
+ s = env_->GetFileSize(fname, &log.size);
+ if (!s.ok()) {
+ break;
+ }
+ total_log_size_ += log.size;
+ alive_log_files_.push_back(log);
+    // We preallocate space for logs, but after a crash and restart that
+    // preallocated space is not needed anymore. It is likely that only the
+    // last log has such preallocated space, so we only truncate the last log.
+ if (log_number == log_numbers.back()) {
+ std::unique_ptr<FSWritableFile> last_log;
+ Status truncate_status = fs_->ReopenWritableFile(
+ fname,
+ fs_->OptimizeForLogWrite(
+ file_options_,
+ BuildDBOptions(immutable_db_options_, mutable_db_options_)),
+ &last_log, nullptr);
+ if (truncate_status.ok()) {
+ truncate_status = last_log->Truncate(log.size, IOOptions(), nullptr);
+ }
+ if (truncate_status.ok()) {
+ truncate_status = last_log->Close(IOOptions(), nullptr);
+ }
+      // Not a critical error if we fail to truncate.
+ if (!truncate_status.ok()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "Failed to truncate log #%" PRIu64 ": %s", log_number,
+ truncate_status.ToString().c_str());
+ }
+ }
+ }
+ if (two_write_queues_) {
+ log_write_mutex_.Unlock();
+ }
+ return s;
+}
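+
+// RestoreAliveLogFiles() above is only reached when recovery deliberately
+// skips the final memtable flush. A hedged sketch of the option that enables
+// this path (the path below is a placeholder, not taken from this file):
+//
+//   rocksdb::Options options;
+//   // Keep recovered memtables in memory and keep their WALs alive instead
+//   // of flushing them to L0 during DB::Open().
+//   options.avoid_flush_during_recovery = true;
+//   rocksdb::DB* db = nullptr;
+//   rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/example_db", &db);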
+
+Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
+ MemTable* mem, VersionEdit* edit) {
+ mutex_.AssertHeld();
+ const uint64_t start_micros = env_->NowMicros();
+ FileMetaData meta;
+ std::unique_ptr<std::list<uint64_t>::iterator> pending_outputs_inserted_elem(
+ new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
+ meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ Arena arena;
+ Status s;
+ TableProperties table_properties;
+ {
+ ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] [WriteLevel0TableForRecovery]"
+ " Level-0 table #%" PRIu64 ": started",
+ cfd->GetName().c_str(), meta.fd.GetNumber());
+
+ // Get the latest mutable cf options while the mutex is still locked
+ const MutableCFOptions mutable_cf_options =
+ *cfd->GetLatestMutableCFOptions();
+ bool paranoid_file_checks =
+ cfd->GetLatestMutableCFOptions()->paranoid_file_checks;
+
+ int64_t _current_time = 0;
+ env_->GetCurrentTime(&_current_time); // ignore error
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+ meta.oldest_ancester_time = current_time;
+
+ {
+ auto write_hint = cfd->CalculateSSTWriteHint(0);
+ mutex_.Unlock();
+
+ SequenceNumber earliest_write_conflict_snapshot;
+ std::vector<SequenceNumber> snapshot_seqs =
+ snapshots_.GetAll(&earliest_write_conflict_snapshot);
+ auto snapshot_checker = snapshot_checker_.get();
+ if (use_custom_gc_ && snapshot_checker == nullptr) {
+ snapshot_checker = DisableGCSnapshotChecker::Instance();
+ }
+ std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ range_del_iters;
+ auto range_del_iter =
+ mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber);
+ if (range_del_iter != nullptr) {
+ range_del_iters.emplace_back(range_del_iter);
+ }
+ s = BuildTable(
+ dbname_, env_, fs_.get(), *cfd->ioptions(), mutable_cf_options,
+ file_options_for_compaction_, cfd->table_cache(), iter.get(),
+ std::move(range_del_iters), &meta, cfd->internal_comparator(),
+ cfd->int_tbl_prop_collector_factories(), cfd->GetID(), cfd->GetName(),
+ snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker,
+ GetCompressionFlush(*cfd->ioptions(), mutable_cf_options),
+ mutable_cf_options.sample_for_compression,
+ cfd->ioptions()->compression_opts, paranoid_file_checks,
+ cfd->internal_stats(), TableFileCreationReason::kRecovery,
+ &event_logger_, job_id, Env::IO_HIGH, nullptr /* table_properties */,
+ -1 /* level */, current_time, write_hint);
+ LogFlush(immutable_db_options_.info_log);
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] [WriteLevel0TableForRecovery]"
+ " Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s",
+ cfd->GetName().c_str(), meta.fd.GetNumber(),
+ meta.fd.GetFileSize(), s.ToString().c_str());
+ mutex_.Lock();
+ }
+ }
+ ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+ // Note that if file_size is zero, the file has been deleted and
+ // should not be added to the manifest.
+ int level = 0;
+ if (s.ok() && meta.fd.GetFileSize() > 0) {
+ edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(),
+ meta.fd.GetFileSize(), meta.smallest, meta.largest,
+ meta.fd.smallest_seqno, meta.fd.largest_seqno,
+ meta.marked_for_compaction, meta.oldest_blob_file_number,
+ meta.oldest_ancester_time, meta.file_creation_time,
+ meta.file_checksum, meta.file_checksum_func_name);
+ }
+
+ InternalStats::CompactionStats stats(CompactionReason::kFlush, 1);
+ stats.micros = env_->NowMicros() - start_micros;
+ stats.bytes_written = meta.fd.GetFileSize();
+ stats.num_output_files = 1;
+ cfd->internal_stats()->AddCompactionStats(level, Env::Priority::USER, stats);
+ cfd->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED,
+ meta.fd.GetFileSize());
+ RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize());
+ return s;
+}
+
+Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
+ DBOptions db_options(options);
+ ColumnFamilyOptions cf_options(options);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+ if (db_options.persist_stats_to_disk) {
+ column_families.push_back(
+ ColumnFamilyDescriptor(kPersistentStatsColumnFamilyName, cf_options));
+ }
+ std::vector<ColumnFamilyHandle*> handles;
+ Status s = DB::Open(db_options, dbname, column_families, &handles, dbptr);
+ if (s.ok()) {
+ if (db_options.persist_stats_to_disk) {
+ assert(handles.size() == 2);
+ } else {
+ assert(handles.size() == 1);
+ }
+    // We can delete the handle since DBImpl always holds a reference to
+    // the default column family.
+ if (db_options.persist_stats_to_disk && handles[1] != nullptr) {
+ delete handles[1];
+ }
+ delete handles[0];
+ }
+ return s;
+}
+
+Status DB::Open(const DBOptions& db_options, const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
+ const bool kSeqPerBatch = true;
+ const bool kBatchPerTxn = true;
+ return DBImpl::Open(db_options, dbname, column_families, handles, dbptr,
+ !kSeqPerBatch, kBatchPerTxn);
+}
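+
+// A hedged usage sketch of the two Open() overloads above from an
+// application's point of view; the column family name and path are
+// illustrative assumptions, and error handling is omitted for brevity:
+//
+//   #include "rocksdb/db.h"
+//
+//   rocksdb::DBOptions db_options;
+//   db_options.create_if_missing = true;
+//   db_options.create_missing_column_families = true;
+//   std::vector<rocksdb::ColumnFamilyDescriptor> cf_descs = {
+//       {rocksdb::kDefaultColumnFamilyName, rocksdb::ColumnFamilyOptions()},
+//       {"example_cf", rocksdb::ColumnFamilyOptions()}};
+//   std::vector<rocksdb::ColumnFamilyHandle*> handles;
+//   rocksdb::DB* db = nullptr;
+//   rocksdb::Status s = rocksdb::DB::Open(db_options, "/tmp/example_db",
+//                                         cf_descs, &handles, &db);
+//   // ... use db and handles, then clean up:
+//   for (auto* h : handles) delete h;
+//   delete db;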
+
+Status DBImpl::CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number,
+ size_t preallocate_block_size, log::Writer** new_log) {
+ Status s;
+ std::unique_ptr<FSWritableFile> lfile;
+
+ DBOptions db_options =
+ BuildDBOptions(immutable_db_options_, mutable_db_options_);
+ FileOptions opt_file_options =
+ fs_->OptimizeForLogWrite(file_options_, db_options);
+ std::string log_fname =
+ LogFileName(immutable_db_options_.wal_dir, log_file_num);
+
+ if (recycle_log_number) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "reusing log %" PRIu64 " from recycle list\n",
+ recycle_log_number);
+ std::string old_log_fname =
+ LogFileName(immutable_db_options_.wal_dir, recycle_log_number);
+ TEST_SYNC_POINT("DBImpl::CreateWAL:BeforeReuseWritableFile1");
+ TEST_SYNC_POINT("DBImpl::CreateWAL:BeforeReuseWritableFile2");
+ s = fs_->ReuseWritableFile(log_fname, old_log_fname, opt_file_options,
+ &lfile, /*dbg=*/nullptr);
+ } else {
+ s = NewWritableFile(fs_.get(), log_fname, &lfile, opt_file_options);
+ }
+
+ if (s.ok()) {
+ lfile->SetWriteLifeTimeHint(CalculateWALWriteHint());
+ lfile->SetPreallocationBlockSize(preallocate_block_size);
+
+ const auto& listeners = immutable_db_options_.listeners;
+ std::unique_ptr<WritableFileWriter> file_writer(
+ new WritableFileWriter(std::move(lfile), log_fname, opt_file_options,
+ env_, nullptr /* stats */, listeners));
+ *new_log = new log::Writer(std::move(file_writer), log_file_num,
+ immutable_db_options_.recycle_log_file_num > 0,
+ immutable_db_options_.manual_wal_flush);
+ }
+ return s;
+}
+
+Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+ const bool seq_per_batch, const bool batch_per_txn) {
+ Status s = SanitizeOptionsByTable(db_options, column_families);
+ if (!s.ok()) {
+ return s;
+ }
+
+ s = ValidateOptions(db_options, column_families);
+ if (!s.ok()) {
+ return s;
+ }
+
+ *dbptr = nullptr;
+ handles->clear();
+
+ size_t max_write_buffer_size = 0;
+ for (auto cf : column_families) {
+ max_write_buffer_size =
+ std::max(max_write_buffer_size, cf.options.write_buffer_size);
+ }
+
+ DBImpl* impl = new DBImpl(db_options, dbname, seq_per_batch, batch_per_txn);
+ s = impl->env_->CreateDirIfMissing(impl->immutable_db_options_.wal_dir);
+ if (s.ok()) {
+ std::vector<std::string> paths;
+ for (auto& db_path : impl->immutable_db_options_.db_paths) {
+ paths.emplace_back(db_path.path);
+ }
+ for (auto& cf : column_families) {
+ for (auto& cf_path : cf.options.cf_paths) {
+ paths.emplace_back(cf_path.path);
+ }
+ }
+ for (auto& path : paths) {
+ s = impl->env_->CreateDirIfMissing(path);
+ if (!s.ok()) {
+ break;
+ }
+ }
+
+ // For recovery from NoSpace() error, we can only handle
+ // the case where the database is stored in a single path
+ if (paths.size() <= 1) {
+ impl->error_handler_.EnableAutoRecovery();
+ }
+ }
+
+ if (!s.ok()) {
+ delete impl;
+ return s;
+ }
+
+ s = impl->CreateArchivalDirectory();
+ if (!s.ok()) {
+ delete impl;
+ return s;
+ }
+
+ impl->wal_in_db_path_ = IsWalDirSameAsDBPath(&impl->immutable_db_options_);
+
+ impl->mutex_.Lock();
+ // Handles create_if_missing, error_if_exists
+ uint64_t recovered_seq(kMaxSequenceNumber);
+ s = impl->Recover(column_families, false, false, false, &recovered_seq);
+ if (s.ok()) {
+ uint64_t new_log_number = impl->versions_->NewFileNumber();
+ log::Writer* new_log = nullptr;
+ const size_t preallocate_block_size =
+ impl->GetWalPreallocateBlockSize(max_write_buffer_size);
+ s = impl->CreateWAL(new_log_number, 0 /*recycle_log_number*/,
+ preallocate_block_size, &new_log);
+ if (s.ok()) {
+ InstrumentedMutexLock wl(&impl->log_write_mutex_);
+ impl->logfile_number_ = new_log_number;
+ assert(new_log != nullptr);
+ impl->logs_.emplace_back(new_log_number, new_log);
+ }
+
+ if (s.ok()) {
+ // set column family handles
+ for (auto cf : column_families) {
+ auto cfd =
+ impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
+ if (cfd != nullptr) {
+ handles->push_back(
+ new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
+ impl->NewThreadStatusCfInfo(cfd);
+ } else {
+ if (db_options.create_missing_column_families) {
+ // missing column family, create it
+ ColumnFamilyHandle* handle;
+ impl->mutex_.Unlock();
+ s = impl->CreateColumnFamily(cf.options, cf.name, &handle);
+ impl->mutex_.Lock();
+ if (s.ok()) {
+ handles->push_back(handle);
+ } else {
+ break;
+ }
+ } else {
+ s = Status::InvalidArgument("Column family not found: ", cf.name);
+ break;
+ }
+ }
+ }
+ }
+ if (s.ok()) {
+ SuperVersionContext sv_context(/* create_superversion */ true);
+ for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+ impl->InstallSuperVersionAndScheduleWork(
+ cfd, &sv_context, *cfd->GetLatestMutableCFOptions());
+ }
+ sv_context.Clean();
+ if (impl->two_write_queues_) {
+ impl->log_write_mutex_.Lock();
+ }
+ impl->alive_log_files_.push_back(
+ DBImpl::LogFileNumberSize(impl->logfile_number_));
+ if (impl->two_write_queues_) {
+ impl->log_write_mutex_.Unlock();
+ }
+
+ impl->DeleteObsoleteFiles();
+ s = impl->directories_.GetDbDir()->Fsync();
+ }
+ if (s.ok()) {
+      // In WritePrepared there could be gaps in the sequence numbers. This
+      // breaks the trick we use in kPointInTimeRecovery, which assumes that
+      // the first seq in the log right after the corrupted log is one larger
+      // than the last seq we read from the logs. To keep this trick working,
+      // we add a dummy entry with the expected sequence to the first log right
+      // after recovery.
+      // In the non-WritePrepared case the new log after recovery could also be
+      // empty, and thus miss the consecutive-seq hint needed to distinguish
+      // middle-of-log corruption from a corrupted log remaining after
+      // recovery. This case is also addressed by the dummy write.
+ if (recovered_seq != kMaxSequenceNumber) {
+ WriteBatch empty_batch;
+ WriteBatchInternal::SetSequence(&empty_batch, recovered_seq);
+ WriteOptions write_options;
+ uint64_t log_used, log_size;
+ log::Writer* log_writer = impl->logs_.back().writer;
+ s = impl->WriteToWAL(empty_batch, log_writer, &log_used, &log_size);
+ if (s.ok()) {
+ // Need to fsync, otherwise it might get lost after a power reset.
+ s = impl->FlushWAL(false);
+ if (s.ok()) {
+ s = log_writer->file()->Sync(impl->immutable_db_options_.use_fsync);
+ }
+ }
+ }
+ }
+ }
+ if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
+    // Try to read the format version, but don't fail Open() even if it fails.
+ s = impl->PersistentStatsProcessFormatVersion();
+ }
+
+ if (s.ok()) {
+ for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+ if (cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+ auto* vstorage = cfd->current()->storage_info();
+ for (int i = 1; i < vstorage->num_levels(); ++i) {
+ int num_files = vstorage->NumLevelFiles(i);
+ if (num_files > 0) {
+ s = Status::InvalidArgument(
+ "Not all files are at level 0. Cannot "
+ "open with FIFO compaction style.");
+ break;
+ }
+ }
+ }
+ if (!cfd->mem()->IsSnapshotSupported()) {
+ impl->is_snapshot_supported_ = false;
+ }
+ if (cfd->ioptions()->merge_operator != nullptr &&
+ !cfd->mem()->IsMergeOperatorSupported()) {
+ s = Status::InvalidArgument(
+            "The memtable of column family %s does not support merge operator, "
+            "but its options.merge_operator is non-null",
+ cfd->GetName().c_str());
+ }
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::Open:Opened");
+ Status persist_options_status;
+ if (s.ok()) {
+ // Persist RocksDB Options before scheduling the compaction.
+ // The WriteOptionsFile() will release and lock the mutex internally.
+ persist_options_status = impl->WriteOptionsFile(
+ false /*need_mutex_lock*/, false /*need_enter_write_thread*/);
+
+ *dbptr = impl;
+ impl->opened_successfully_ = true;
+ impl->MaybeScheduleFlushOrCompaction();
+ }
+ impl->mutex_.Unlock();
+
+#ifndef ROCKSDB_LITE
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ impl->immutable_db_options_.sst_file_manager.get());
+ if (s.ok() && sfm) {
+ // Notify SstFileManager about all sst files that already exist in
+ // db_paths[0] and cf_paths[0] when the DB is opened.
+
+    // SstFileManagerImpl needs to know the sizes of the files. For files whose
+    // size we already know (sst files that appear in the manifest - typically
+    // the vast majority of all files), we'll pass the size to SstFileManager.
+    // For all other files SstFileManager will query the size from the
+    // filesystem.
+
+ std::vector<LiveFileMetaData> metadata;
+
+ impl->mutex_.Lock();
+ impl->versions_->GetLiveFilesMetaData(&metadata);
+ impl->mutex_.Unlock();
+
+ std::unordered_map<std::string, uint64_t> known_file_sizes;
+ for (const auto& md : metadata) {
+ std::string name = md.name;
+ if (!name.empty() && name[0] == '/') {
+ name = name.substr(1);
+ }
+ known_file_sizes[name] = md.size;
+ }
+
+ std::vector<std::string> paths;
+ paths.emplace_back(impl->immutable_db_options_.db_paths[0].path);
+ for (auto& cf : column_families) {
+ if (!cf.options.cf_paths.empty()) {
+ paths.emplace_back(cf.options.cf_paths[0].path);
+ }
+ }
+ // Remove duplicate paths.
+ std::sort(paths.begin(), paths.end());
+ paths.erase(std::unique(paths.begin(), paths.end()), paths.end());
+ for (auto& path : paths) {
+ std::vector<std::string> existing_files;
+ impl->immutable_db_options_.env->GetChildren(path, &existing_files);
+ for (auto& file_name : existing_files) {
+ uint64_t file_number;
+ FileType file_type;
+ std::string file_path = path + "/" + file_name;
+ if (ParseFileName(file_name, &file_number, &file_type) &&
+ file_type == kTableFile) {
+ if (known_file_sizes.count(file_name)) {
+ // We're assuming that each sst file name exists in at most one of
+ // the paths.
+ sfm->OnAddFile(file_path, known_file_sizes.at(file_name),
+ /* compaction */ false);
+ } else {
+ sfm->OnAddFile(file_path);
+ }
+ }
+ }
+ }
+
+    // Reserve some disk buffer space. This is a heuristic - when we run out
+    // of disk space, it ensures that there is at least write_buffer_size
+    // worth of free space before we resume DB writes. In low-disk-space
+    // conditions, we want to avoid a lot of small L0 files due to frequent
+    // WAL write failures and the resulting forced flushes.
+ sfm->ReserveDiskBuffer(max_write_buffer_size,
+ impl->immutable_db_options_.db_paths[0].path);
+ }
+#endif // !ROCKSDB_LITE
+
+ if (s.ok()) {
+ ROCKS_LOG_HEADER(impl->immutable_db_options_.info_log, "DB pointer %p",
+ impl);
+ LogFlush(impl->immutable_db_options_.info_log);
+ assert(impl->TEST_WALBufferIsEmpty());
+ // If the assert above fails then we need to FlushWAL before returning
+ // control back to the user.
+ if (!persist_options_status.ok()) {
+ s = Status::IOError(
+ "DB::Open() failed --- Unable to persist Options file",
+ persist_options_status.ToString());
+ }
+ }
+ if (s.ok()) {
+ impl->StartTimedTasks();
+ }
+ if (!s.ok()) {
+ for (auto* h : *handles) {
+ delete h;
+ }
+ handles->clear();
+ delete impl;
+ *dbptr = nullptr;
+ }
+ return s;
+}
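+
+// The SstFileManager bookkeeping above only runs when the application has
+// configured one. A hedged sketch of attaching a manager before Open(); the
+// rate-limit value and path are arbitrary illustrations, not recommendations:
+//
+//   #include "rocksdb/sst_file_manager.h"
+//
+//   rocksdb::Options options;
+//   options.sst_file_manager.reset(
+//       rocksdb::NewSstFileManager(rocksdb::Env::Default()));
+//   // Optionally smooth out deletion I/O (bytes per second).
+//   options.sst_file_manager->SetDeleteRateBytesPerSecond(64 << 20);
+//   rocksdb::DB* db = nullptr;
+//   rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/example_db", &db);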
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_readonly.cc b/src/rocksdb/db/db_impl/db_impl_readonly.cc
new file mode 100644
index 000000000..a4242bfe1
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_readonly.cc
@@ -0,0 +1,221 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_impl/db_impl_readonly.h"
+#include "db/arena_wrapped_db_iter.h"
+
+#include "db/compacted_db_impl.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_iter.h"
+#include "db/merge_context.h"
+#include "monitoring/perf_context_imp.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+
+DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options,
+ const std::string& dbname)
+ : DBImpl(db_options, dbname) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Opening the db in read only mode");
+ LogFlush(immutable_db_options_.info_log);
+}
+
+DBImplReadOnly::~DBImplReadOnly() {}
+
+// Implementations of the DB interface
+Status DBImplReadOnly::Get(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* pinnable_val) {
+ assert(pinnable_val != nullptr);
+  // TODO: stopwatch DB_GET needed? perf timer needed?
+ PERF_TIMER_GUARD(get_snapshot_time);
+ Status s;
+ SequenceNumber snapshot = versions_->LastSequence();
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ tracer_->Get(column_family, key);
+ }
+ }
+ SuperVersion* super_version = cfd->GetSuperVersion();
+ MergeContext merge_context;
+ SequenceNumber max_covering_tombstone_seq = 0;
+ LookupKey lkey(key, snapshot);
+ PERF_TIMER_STOP(get_snapshot_time);
+ if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context,
+ &max_covering_tombstone_seq, read_options)) {
+ pinnable_val->PinSelf();
+ RecordTick(stats_, MEMTABLE_HIT);
+ } else {
+ PERF_TIMER_GUARD(get_from_output_files_time);
+ super_version->current->Get(read_options, lkey, pinnable_val, &s,
+ &merge_context, &max_covering_tombstone_seq);
+ RecordTick(stats_, MEMTABLE_MISS);
+ }
+ RecordTick(stats_, NUMBER_KEYS_READ);
+ size_t size = pinnable_val->size();
+ RecordTick(stats_, BYTES_READ, size);
+ RecordInHistogram(stats_, BYTES_PER_READ, size);
+ PERF_COUNTER_ADD(get_read_bytes, size);
+ return s;
+}
+
+Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+ SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
+ SequenceNumber latest_snapshot = versions_->LastSequence();
+ SequenceNumber read_seq =
+ read_options.snapshot != nullptr
+ ? reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)
+ ->number_
+ : latest_snapshot;
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ auto db_iter = NewArenaWrappedDbIterator(
+ env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options,
+ read_seq,
+ super_version->mutable_cf_options.max_sequential_skip_in_iterations,
+ super_version->version_number, read_callback);
+ auto internal_iter =
+ NewInternalIterator(read_options, cfd, super_version, db_iter->GetArena(),
+ db_iter->GetRangeDelAggregator(), read_seq);
+ db_iter->SetIterUnderDBIter(internal_iter);
+ return db_iter;
+}
+
+Status DBImplReadOnly::NewIterators(
+ const ReadOptions& read_options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) {
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ if (iterators == nullptr) {
+ return Status::InvalidArgument("iterators not allowed to be nullptr");
+ }
+ iterators->clear();
+ iterators->reserve(column_families.size());
+ SequenceNumber latest_snapshot = versions_->LastSequence();
+ SequenceNumber read_seq =
+ read_options.snapshot != nullptr
+ ? reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)
+ ->number_
+ : latest_snapshot;
+
+ for (auto cfh : column_families) {
+ auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
+ auto* sv = cfd->GetSuperVersion()->Ref();
+ auto* db_iter = NewArenaWrappedDbIterator(
+ env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, read_seq,
+ sv->mutable_cf_options.max_sequential_skip_in_iterations,
+ sv->version_number, read_callback);
+ auto* internal_iter =
+ NewInternalIterator(read_options, cfd, sv, db_iter->GetArena(),
+ db_iter->GetRangeDelAggregator(), read_seq);
+ db_iter->SetIterUnderDBIter(internal_iter);
+ iterators->push_back(db_iter);
+ }
+
+ return Status::OK();
+}
+
+Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
+ DB** dbptr, bool /*error_if_log_file_exist*/) {
+ *dbptr = nullptr;
+
+ // Try to first open DB as fully compacted DB
+ Status s;
+ s = CompactedDBImpl::Open(options, dbname, dbptr);
+ if (s.ok()) {
+ return s;
+ }
+
+ DBOptions db_options(options);
+ ColumnFamilyOptions cf_options(options);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+ std::vector<ColumnFamilyHandle*> handles;
+
+ s = DB::OpenForReadOnly(db_options, dbname, column_families, &handles, dbptr);
+ if (s.ok()) {
+ assert(handles.size() == 1);
+    // We can delete the handle since DBImpl always holds a
+    // reference to the default column family.
+ delete handles[0];
+ }
+ return s;
+}
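+
+// A hedged sketch of read-only usage (the path and key are placeholders).
+// Write operations on the returned handle are rejected by DBImplReadOnly with
+// Status::NotSupported():
+//
+//   rocksdb::DB* db = nullptr;
+//   rocksdb::Status s = rocksdb::DB::OpenForReadOnly(rocksdb::Options(),
+//                                                    "/tmp/existing_db", &db);
+//   std::string value;
+//   if (s.ok()) {
+//     s = db->Get(rocksdb::ReadOptions(), "example_key", &value);
+//   }
+//   delete db;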
+
+Status DB::OpenForReadOnly(
+ const DBOptions& db_options, const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+ bool error_if_log_file_exist) {
+ *dbptr = nullptr;
+ handles->clear();
+
+ SuperVersionContext sv_context(/* create_superversion */ true);
+ DBImplReadOnly* impl = new DBImplReadOnly(db_options, dbname);
+ impl->mutex_.Lock();
+ Status s = impl->Recover(column_families, true /* read only */,
+ error_if_log_file_exist);
+ if (s.ok()) {
+ // set column family handles
+ for (auto cf : column_families) {
+ auto cfd =
+ impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
+ if (cfd == nullptr) {
+ s = Status::InvalidArgument("Column family not found: ", cf.name);
+ break;
+ }
+ handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
+ }
+ }
+ if (s.ok()) {
+ for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+ sv_context.NewSuperVersion();
+ cfd->InstallSuperVersion(&sv_context, &impl->mutex_);
+ }
+ }
+ impl->mutex_.Unlock();
+ sv_context.Clean();
+ if (s.ok()) {
+ *dbptr = impl;
+ for (auto* h : *handles) {
+ impl->NewThreadStatusCfInfo(
+ reinterpret_cast<ColumnFamilyHandleImpl*>(h)->cfd());
+ }
+ } else {
+ for (auto h : *handles) {
+ delete h;
+ }
+ handles->clear();
+ delete impl;
+ }
+ return s;
+}
+
+#else // !ROCKSDB_LITE
+
+Status DB::OpenForReadOnly(const Options& /*options*/,
+ const std::string& /*dbname*/, DB** /*dbptr*/,
+ bool /*error_if_log_file_exist*/) {
+ return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+
+Status DB::OpenForReadOnly(
+ const DBOptions& /*db_options*/, const std::string& /*dbname*/,
+ const std::vector<ColumnFamilyDescriptor>& /*column_families*/,
+ std::vector<ColumnFamilyHandle*>* /*handles*/, DB** /*dbptr*/,
+ bool /*error_if_log_file_exist*/) {
+ return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_readonly.h b/src/rocksdb/db/db_impl/db_impl_readonly.h
new file mode 100644
index 000000000..04d06b4a1
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_readonly.h
@@ -0,0 +1,137 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+#include "db/db_impl/db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImplReadOnly : public DBImpl {
+ public:
+ DBImplReadOnly(const DBOptions& options, const std::string& dbname);
+ // No copying allowed
+ DBImplReadOnly(const DBImplReadOnly&) = delete;
+ void operator=(const DBImplReadOnly&) = delete;
+
+ virtual ~DBImplReadOnly();
+
+ // Implementations of the DB interface
+ using DB::Get;
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) override;
+
+ // TODO: Implement ReadOnly MultiGet?
+
+ using DBImpl::NewIterator;
+ virtual Iterator* NewIterator(const ReadOptions&,
+ ColumnFamilyHandle* column_family) override;
+
+ virtual Status NewIterators(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) override;
+
+ using DBImpl::Put;
+ virtual Status Put(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/, const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ using DBImpl::Merge;
+ virtual Status Merge(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/, const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ using DBImpl::Delete;
+ virtual Status Delete(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ using DBImpl::SingleDelete;
+ virtual Status SingleDelete(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ virtual Status Write(const WriteOptions& /*options*/,
+ WriteBatch* /*updates*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ using DBImpl::CompactRange;
+ virtual Status CompactRange(const CompactRangeOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice* /*begin*/,
+ const Slice* /*end*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ using DBImpl::CompactFiles;
+ virtual Status CompactFiles(
+ const CompactionOptions& /*compact_options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*input_file_names*/,
+ const int /*output_level*/, const int /*output_path_id*/ = -1,
+ std::vector<std::string>* const /*output_file_names*/ = nullptr,
+ CompactionJobInfo* /*compaction_job_info*/ = nullptr) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ virtual Status DisableFileDeletions() override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ virtual Status EnableFileDeletions(bool /*force*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ virtual Status GetLiveFiles(std::vector<std::string>& ret,
+ uint64_t* manifest_file_size,
+ bool /*flush_memtable*/) override {
+ return DBImpl::GetLiveFiles(ret, manifest_file_size,
+ false /* flush_memtable */);
+ }
+
+ using DBImpl::Flush;
+ virtual Status Flush(const FlushOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ using DBImpl::SyncWAL;
+ virtual Status SyncWAL() override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ using DB::IngestExternalFile;
+ virtual Status IngestExternalFile(
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*external_files*/,
+ const IngestExternalFileOptions& /*ingestion_options*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ using DB::CreateColumnFamilyWithImport;
+ virtual Status CreateColumnFamilyWithImport(
+ const ColumnFamilyOptions& /*options*/,
+ const std::string& /*column_family_name*/,
+ const ImportColumnFamilyOptions& /*import_options*/,
+ const ExportImportFilesMetaData& /*metadata*/,
+ ColumnFamilyHandle** /*handle*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ private:
+ friend class DB;
+};
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_impl/db_impl_secondary.cc b/src/rocksdb/db/db_impl/db_impl_secondary.cc
new file mode 100644
index 000000000..f0ec27c32
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_secondary.cc
@@ -0,0 +1,671 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_impl/db_impl_secondary.h"
+
+#include <cinttypes>
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/merge_context.h"
+#include "logging/auto_roll_logger.h"
+#include "monitoring/perf_context_imp.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+DBImplSecondary::DBImplSecondary(const DBOptions& db_options,
+ const std::string& dbname)
+ : DBImpl(db_options, dbname) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Opening the db in secondary mode");
+ LogFlush(immutable_db_options_.info_log);
+}
+
+DBImplSecondary::~DBImplSecondary() {}
+
+Status DBImplSecondary::Recover(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ bool /*readonly*/, bool /*error_if_log_file_exist*/,
+ bool /*error_if_data_exists_in_logs*/, uint64_t*) {
+ mutex_.AssertHeld();
+
+ JobContext job_context(0);
+ Status s;
+ s = static_cast<ReactiveVersionSet*>(versions_.get())
+ ->Recover(column_families, &manifest_reader_, &manifest_reporter_,
+ &manifest_reader_status_);
+ if (!s.ok()) {
+ return s;
+ }
+ if (immutable_db_options_.paranoid_checks && s.ok()) {
+ s = CheckConsistency();
+ }
+  // Initialize max_total_in_memory_state_ before recovering the logs.
+ max_total_in_memory_state_ = 0;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+ max_total_in_memory_state_ += mutable_cf_options->write_buffer_size *
+ mutable_cf_options->max_write_buffer_number;
+ }
+ if (s.ok()) {
+ default_cf_handle_ = new ColumnFamilyHandleImpl(
+ versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
+ default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats();
+ single_column_family_mode_ =
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1;
+
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ s = FindAndRecoverLogFiles(&cfds_changed, &job_context);
+ }
+
+ if (s.IsPathNotFound()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Secondary tries to read WAL, but WAL file(s) have already "
+ "been purged by primary.");
+ s = Status::OK();
+ }
+ // TODO: update options_file_number_ needed?
+
+ job_context.Clean();
+ return s;
+}
+
+// Find new WALs and apply them in order to the secondary instance.
+Status DBImplSecondary::FindAndRecoverLogFiles(
+ std::unordered_set<ColumnFamilyData*>* cfds_changed,
+ JobContext* job_context) {
+ assert(nullptr != cfds_changed);
+ assert(nullptr != job_context);
+ Status s;
+ std::vector<uint64_t> logs;
+ s = FindNewLogNumbers(&logs);
+ if (s.ok() && !logs.empty()) {
+ SequenceNumber next_sequence(kMaxSequenceNumber);
+ s = RecoverLogFiles(logs, &next_sequence, cfds_changed, job_context);
+ }
+ return s;
+}
+
+// List wal_dir and find all new WALs, returning their log numbers.
+Status DBImplSecondary::FindNewLogNumbers(std::vector<uint64_t>* logs) {
+ assert(logs != nullptr);
+ std::vector<std::string> filenames;
+ Status s;
+ s = env_->GetChildren(immutable_db_options_.wal_dir, &filenames);
+ if (s.IsNotFound()) {
+ return Status::InvalidArgument("Failed to open wal_dir",
+ immutable_db_options_.wal_dir);
+ } else if (!s.ok()) {
+ return s;
+ }
+
+ // if log_readers_ is non-empty, it means we have applied all logs with log
+ // numbers smaller than the smallest log in log_readers_, so there is no
+ // need to pass these logs to RecoverLogFiles
+ uint64_t log_number_min = 0;
+ if (!log_readers_.empty()) {
+ log_number_min = log_readers_.begin()->first;
+ }
+ for (size_t i = 0; i < filenames.size(); i++) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(filenames[i], &number, &type) && type == kLogFile &&
+ number >= log_number_min) {
+ logs->push_back(number);
+ }
+ }
+ // Recover logs in the order that they were generated
+ if (!logs->empty()) {
+ std::sort(logs->begin(), logs->end());
+ }
+ return s;
+}
+
+Status DBImplSecondary::MaybeInitLogReader(
+ uint64_t log_number, log::FragmentBufferedReader** log_reader) {
+ auto iter = log_readers_.find(log_number);
+ // make sure the log file is still present
+ if (iter == log_readers_.end() ||
+ iter->second->reader_->GetLogNumber() != log_number) {
+ // delete the obsolete log reader if log number mismatch
+ if (iter != log_readers_.end()) {
+ log_readers_.erase(iter);
+ }
+ // initialize log reader from log_number
+ // TODO: min_log_number_to_keep_2pc check needed?
+ // Open the log file
+ std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Recovering log #%" PRIu64 " mode %d", log_number,
+ static_cast<int>(immutable_db_options_.wal_recovery_mode));
+
+ std::unique_ptr<SequentialFileReader> file_reader;
+ {
+ std::unique_ptr<FSSequentialFile> file;
+ Status status = fs_->NewSequentialFile(
+ fname, fs_->OptimizeForLogRead(file_options_), &file,
+ nullptr);
+ if (!status.ok()) {
+ *log_reader = nullptr;
+ return status;
+ }
+ file_reader.reset(new SequentialFileReader(
+ std::move(file), fname, immutable_db_options_.log_readahead_size));
+ }
+
+ // Create the log reader.
+ LogReaderContainer* log_reader_container = new LogReaderContainer(
+ env_, immutable_db_options_.info_log, std::move(fname),
+ std::move(file_reader), log_number);
+ log_readers_.insert(std::make_pair(
+ log_number, std::unique_ptr<LogReaderContainer>(log_reader_container)));
+ }
+ iter = log_readers_.find(log_number);
+ assert(iter != log_readers_.end());
+ *log_reader = iter->second->reader_;
+ return Status::OK();
+}
+
+// After manifest recovery, replay WALs and refresh log_readers_ if necessary
+// REQUIRES: log_numbers are sorted in ascending order
+Status DBImplSecondary::RecoverLogFiles(
+ const std::vector<uint64_t>& log_numbers, SequenceNumber* next_sequence,
+ std::unordered_set<ColumnFamilyData*>* cfds_changed,
+ JobContext* job_context) {
+ assert(nullptr != cfds_changed);
+ assert(nullptr != job_context);
+ mutex_.AssertHeld();
+ Status status;
+ for (auto log_number : log_numbers) {
+ log::FragmentBufferedReader* reader = nullptr;
+ status = MaybeInitLogReader(log_number, &reader);
+ if (!status.ok()) {
+ return status;
+ }
+ assert(reader != nullptr);
+ }
+ for (auto log_number : log_numbers) {
+ auto it = log_readers_.find(log_number);
+ assert(it != log_readers_.end());
+ log::FragmentBufferedReader* reader = it->second->reader_;
+ // Manually update the file number allocation counter in VersionSet.
+ versions_->MarkFileNumberUsed(log_number);
+
+    // Determine if we should tolerate incomplete records at the tail end of
+    // the log, then read all the records and add them to a memtable.
+ std::string scratch;
+ Slice record;
+ WriteBatch batch;
+
+ while (reader->ReadRecord(&record, &scratch,
+ immutable_db_options_.wal_recovery_mode) &&
+ status.ok()) {
+ if (record.size() < WriteBatchInternal::kHeader) {
+ reader->GetReporter()->Corruption(
+ record.size(), Status::Corruption("log record too small"));
+ continue;
+ }
+ WriteBatchInternal::SetContents(&batch, record);
+ SequenceNumber seq_of_batch = WriteBatchInternal::Sequence(&batch);
+ std::vector<uint32_t> column_family_ids;
+ status = CollectColumnFamilyIdsFromWriteBatch(batch, &column_family_ids);
+ if (status.ok()) {
+ for (const auto id : column_family_ids) {
+ ColumnFamilyData* cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(id);
+ if (cfd == nullptr) {
+ continue;
+ }
+ if (cfds_changed->count(cfd) == 0) {
+ cfds_changed->insert(cfd);
+ }
+ const std::vector<FileMetaData*>& l0_files =
+ cfd->current()->storage_info()->LevelFiles(0);
+ SequenceNumber seq =
+ l0_files.empty() ? 0 : l0_files.back()->fd.largest_seqno;
+          // If the write batch's sequence number is no larger than the largest
+          // sequence number persisted for this column family, then its data
+          // must reside in an SST file that has already been added by the
+          // prior MANIFEST replay.
+ if (seq_of_batch <= seq) {
+ continue;
+ }
+ auto curr_log_num = port::kMaxUint64;
+ if (cfd_to_current_log_.count(cfd) > 0) {
+ curr_log_num = cfd_to_current_log_[cfd];
+ }
+ // If the active memtable contains records added by replaying an
+ // earlier WAL, then we need to seal the memtable, add it to the
+ // immutable memtable list and create a new active memtable.
+ if (!cfd->mem()->IsEmpty() && (curr_log_num == port::kMaxUint64 ||
+ curr_log_num != log_number)) {
+ const MutableCFOptions mutable_cf_options =
+ *cfd->GetLatestMutableCFOptions();
+ MemTable* new_mem =
+ cfd->ConstructNewMemtable(mutable_cf_options, seq_of_batch);
+ cfd->mem()->SetNextLogNumber(log_number);
+ cfd->imm()->Add(cfd->mem(), &job_context->memtables_to_free);
+ new_mem->Ref();
+ cfd->SetMemtable(new_mem);
+ }
+ }
+ bool has_valid_writes = false;
+ status = WriteBatchInternal::InsertInto(
+ &batch, column_family_memtables_.get(),
+ nullptr /* flush_scheduler */, nullptr /* trim_history_scheduler*/,
+ true, log_number, this, false /* concurrent_memtable_writes */,
+ next_sequence, &has_valid_writes, seq_per_batch_, batch_per_txn_);
+ }
+      // If a column family was not found, it might mean that the WAL write
+      // batch references a column family that was dropped after the insert.
+      // We don't want to fail the whole write batch in that case -- we just
+      // ignore the update. That's why we set ignore_missing_column_families
+      // to true.
+      // Passing a null flush_scheduler disables memtable flushing, which is
+      // what we need for secondary instances.
+ if (status.ok()) {
+ for (const auto id : column_family_ids) {
+ ColumnFamilyData* cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(id);
+ if (cfd == nullptr) {
+ continue;
+ }
+ std::unordered_map<ColumnFamilyData*, uint64_t>::iterator iter =
+ cfd_to_current_log_.find(cfd);
+ if (iter == cfd_to_current_log_.end()) {
+ cfd_to_current_log_.insert({cfd, log_number});
+ } else if (log_number > iter->second) {
+ iter->second = log_number;
+ }
+ }
+ auto last_sequence = *next_sequence - 1;
+ if ((*next_sequence != kMaxSequenceNumber) &&
+ (versions_->LastSequence() <= last_sequence)) {
+ versions_->SetLastAllocatedSequence(last_sequence);
+ versions_->SetLastPublishedSequence(last_sequence);
+ versions_->SetLastSequence(last_sequence);
+ }
+ } else {
+ // We are treating this as a failure while reading since we read valid
+ // blocks that do not form coherent data
+ reader->GetReporter()->Corruption(record.size(), status);
+ }
+ }
+ if (!status.ok()) {
+ return status;
+ }
+ }
+  // Remove log readers from the map after successfully recovering the WAL.
+ if (log_readers_.size() > 1) {
+ auto erase_iter = log_readers_.begin();
+ std::advance(erase_iter, log_readers_.size() - 1);
+ log_readers_.erase(log_readers_.begin(), erase_iter);
+ }
+ return status;
+}
+
+// Implementation of the DB interface
+Status DBImplSecondary::Get(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) {
+ return GetImpl(read_options, column_family, key, value);
+}
+
+Status DBImplSecondary::GetImpl(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* pinnable_val) {
+ assert(pinnable_val != nullptr);
+ PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_);
+ StopWatch sw(env_, stats_, DB_GET);
+ PERF_TIMER_GUARD(get_snapshot_time);
+
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(column_family);
+ ColumnFamilyData* cfd = cfh->cfd();
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ tracer_->Get(column_family, key);
+ }
+ }
+ // Acquire SuperVersion
+ SuperVersion* super_version = GetAndRefSuperVersion(cfd);
+ SequenceNumber snapshot = versions_->LastSequence();
+ MergeContext merge_context;
+ SequenceNumber max_covering_tombstone_seq = 0;
+ Status s;
+ LookupKey lkey(key, snapshot);
+ PERF_TIMER_STOP(get_snapshot_time);
+
+ bool done = false;
+ if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context,
+ &max_covering_tombstone_seq, read_options)) {
+ done = true;
+ pinnable_val->PinSelf();
+ RecordTick(stats_, MEMTABLE_HIT);
+ } else if ((s.ok() || s.IsMergeInProgress()) &&
+ super_version->imm->Get(
+ lkey, pinnable_val->GetSelf(), &s, &merge_context,
+ &max_covering_tombstone_seq, read_options)) {
+ done = true;
+ pinnable_val->PinSelf();
+ RecordTick(stats_, MEMTABLE_HIT);
+ }
+ if (!done && !s.ok() && !s.IsMergeInProgress()) {
+ ReturnAndCleanupSuperVersion(cfd, super_version);
+ return s;
+ }
+ if (!done) {
+ PERF_TIMER_GUARD(get_from_output_files_time);
+ super_version->current->Get(read_options, lkey, pinnable_val, &s,
+ &merge_context, &max_covering_tombstone_seq);
+ RecordTick(stats_, MEMTABLE_MISS);
+ }
+ {
+ PERF_TIMER_GUARD(get_post_process_time);
+ ReturnAndCleanupSuperVersion(cfd, super_version);
+ RecordTick(stats_, NUMBER_KEYS_READ);
+ size_t size = pinnable_val->size();
+ RecordTick(stats_, BYTES_READ, size);
+ RecordTimeToHistogram(stats_, BYTES_PER_READ, size);
+ PERF_COUNTER_ADD(get_read_bytes, size);
+ }
+ return s;
+}
+
+Iterator* DBImplSecondary::NewIterator(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family) {
+ if (read_options.managed) {
+ return NewErrorIterator(
+ Status::NotSupported("Managed iterator is not supported anymore."));
+ }
+ if (read_options.read_tier == kPersistedTier) {
+ return NewErrorIterator(Status::NotSupported(
+ "ReadTier::kPersistedData is not yet supported in iterators."));
+ }
+ Iterator* result = nullptr;
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ if (read_options.tailing) {
+ return NewErrorIterator(Status::NotSupported(
+ "tailing iterator not supported in secondary mode"));
+ } else if (read_options.snapshot != nullptr) {
+ // TODO (yanqin) support snapshot.
+ return NewErrorIterator(
+ Status::NotSupported("snapshot not supported in secondary mode"));
+ } else {
+ auto snapshot = versions_->LastSequence();
+ result = NewIteratorImpl(read_options, cfd, snapshot, read_callback);
+ }
+ return result;
+}
+
+ArenaWrappedDBIter* DBImplSecondary::NewIteratorImpl(
+ const ReadOptions& read_options, ColumnFamilyData* cfd,
+ SequenceNumber snapshot, ReadCallback* read_callback) {
+ assert(nullptr != cfd);
+ SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+ auto db_iter = NewArenaWrappedDbIterator(
+ env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options,
+ snapshot,
+ super_version->mutable_cf_options.max_sequential_skip_in_iterations,
+ super_version->version_number, read_callback);
+ auto internal_iter =
+ NewInternalIterator(read_options, cfd, super_version, db_iter->GetArena(),
+ db_iter->GetRangeDelAggregator(), snapshot);
+ db_iter->SetIterUnderDBIter(internal_iter);
+ return db_iter;
+}
+
+Status DBImplSecondary::NewIterators(
+ const ReadOptions& read_options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) {
+ if (read_options.managed) {
+ return Status::NotSupported("Managed iterator is not supported anymore.");
+ }
+ if (read_options.read_tier == kPersistedTier) {
+ return Status::NotSupported(
+ "ReadTier::kPersistedData is not yet supported in iterators.");
+ }
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ if (iterators == nullptr) {
+ return Status::InvalidArgument("iterators not allowed to be nullptr");
+ }
+ iterators->clear();
+ iterators->reserve(column_families.size());
+ if (read_options.tailing) {
+ return Status::NotSupported(
+ "tailing iterator not supported in secondary mode");
+ } else if (read_options.snapshot != nullptr) {
+ // TODO (yanqin) support snapshot.
+ return Status::NotSupported("snapshot not supported in secondary mode");
+ } else {
+ SequenceNumber read_seq = versions_->LastSequence();
+ for (auto cfh : column_families) {
+ ColumnFamilyData* cfd = static_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
+ iterators->push_back(
+ NewIteratorImpl(read_options, cfd, read_seq, read_callback));
+ }
+ }
+ return Status::OK();
+}
+
+Status DBImplSecondary::CheckConsistency() {
+ mutex_.AssertHeld();
+ Status s = DBImpl::CheckConsistency();
+  // If DBImpl::CheckConsistency(), which is stricter, returns success, then we
+  // do not need to give it a second chance.
+ if (s.ok()) {
+ return s;
+ }
+  // It's possible that DBImpl::CheckConsistency() can fail because the primary
+  // may have removed certain files, causing the GetFileSize(name) call to
+  // fail and return a PathNotFound. In this case, we take a best-effort
+  // approach and just proceed.
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImplSecondary::CheckConsistency:AfterFirstAttempt", &s);
+
+ if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) {
+ return Status::OK();
+ }
+
+ std::vector<LiveFileMetaData> metadata;
+ versions_->GetLiveFilesMetaData(&metadata);
+
+ std::string corruption_messages;
+ for (const auto& md : metadata) {
+ // md.name has a leading "/".
+ std::string file_path = md.db_path + md.name;
+
+ uint64_t fsize = 0;
+ s = env_->GetFileSize(file_path, &fsize);
+ if (!s.ok() &&
+ (env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok() ||
+ s.IsPathNotFound())) {
+ s = Status::OK();
+ }
+ if (!s.ok()) {
+ corruption_messages +=
+ "Can't access " + md.name + ": " + s.ToString() + "\n";
+ }
+ }
+ return corruption_messages.empty() ? Status::OK()
+ : Status::Corruption(corruption_messages);
+}
+
+Status DBImplSecondary::TryCatchUpWithPrimary() {
+ assert(versions_.get() != nullptr);
+ assert(manifest_reader_.get() != nullptr);
+ Status s;
+ // read the manifest and apply new changes to the secondary instance
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ JobContext job_context(0, true /*create_superversion*/);
+ {
+ InstrumentedMutexLock lock_guard(&mutex_);
+ s = static_cast_with_check<ReactiveVersionSet>(versions_.get())
+ ->ReadAndApply(&mutex_, &manifest_reader_, &cfds_changed);
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Last sequence is %" PRIu64,
+ static_cast<uint64_t>(versions_->LastSequence()));
+ for (ColumnFamilyData* cfd : cfds_changed) {
+ if (cfd->IsDropped()) {
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] is dropped\n",
+ cfd->GetName().c_str());
+ continue;
+ }
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] Level summary: %s\n", cfd->GetName().c_str(),
+ cfd->current()->storage_info()->LevelSummary(&tmp));
+ }
+
+ // list wal_dir to discover new WALs and apply new changes to the secondary
+ // instance
+ if (s.ok()) {
+ s = FindAndRecoverLogFiles(&cfds_changed, &job_context);
+ }
+ if (s.IsPathNotFound()) {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "Secondary tries to read WAL, but WAL file(s) have already "
+ "been purged by primary.");
+ s = Status::OK();
+ }
+ if (s.ok()) {
+ for (auto cfd : cfds_changed) {
+ cfd->imm()->RemoveOldMemTables(cfd->GetLogNumber(),
+ &job_context.memtables_to_free);
+ auto& sv_context = job_context.superversion_contexts.back();
+ cfd->InstallSuperVersion(&sv_context, &mutex_);
+ sv_context.NewSuperVersion();
+ }
+ }
+ }
+ job_context.Clean();
+
+ // Cleanup unused, obsolete files.
+ JobContext purge_files_job_context(0);
+ {
+ InstrumentedMutexLock lock_guard(&mutex_);
+    // Currently, the secondary instance does not own the database files, thus
+    // it is unnecessary for the secondary to force a full scan.
+ FindObsoleteFiles(&purge_files_job_context, /*force=*/false);
+ }
+ if (purge_files_job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(purge_files_job_context);
+ }
+ purge_files_job_context.Clean();
+ return s;
+}
+
+Status DB::OpenAsSecondary(const Options& options, const std::string& dbname,
+ const std::string& secondary_path, DB** dbptr) {
+ *dbptr = nullptr;
+
+ DBOptions db_options(options);
+ ColumnFamilyOptions cf_options(options);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.emplace_back(kDefaultColumnFamilyName, cf_options);
+ std::vector<ColumnFamilyHandle*> handles;
+
+ Status s = DB::OpenAsSecondary(db_options, dbname, secondary_path,
+ column_families, &handles, dbptr);
+ if (s.ok()) {
+ assert(handles.size() == 1);
+ delete handles[0];
+ }
+ return s;
+}
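+
+// A hedged sketch of how an application might use a secondary instance; paths
+// are placeholders and error handling is trimmed. TryCatchUpWithPrimary() is
+// the DB-level entry point implemented by DBImplSecondary above:
+//
+//   rocksdb::Options options;
+//   options.max_open_files = -1;  // required by OpenAsSecondary, see below
+//   rocksdb::DB* secondary = nullptr;
+//   rocksdb::Status s = rocksdb::DB::OpenAsSecondary(
+//       options, "/tmp/primary_db", "/tmp/secondary_scratch", &secondary);
+//   if (s.ok()) {
+//     // Periodically replay new MANIFEST entries and WALs from the primary.
+//     s = secondary->TryCatchUpWithPrimary();
+//   }
+//   delete secondary;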
+
+Status DB::OpenAsSecondary(
+ const DBOptions& db_options, const std::string& dbname,
+ const std::string& secondary_path,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
+ *dbptr = nullptr;
+ if (db_options.max_open_files != -1) {
+    // TODO (yanqin) maybe support max_open_files != -1 by creating hard links
+    // to SST files so that the secondary can still access old SSTs while the
+    // primary instance may delete the originals.
+ return Status::InvalidArgument("require max_open_files to be -1");
+ }
+
+ DBOptions tmp_opts(db_options);
+ Status s;
+ if (nullptr == tmp_opts.info_log) {
+ s = CreateLoggerFromOptions(secondary_path, tmp_opts, &tmp_opts.info_log);
+ if (!s.ok()) {
+ tmp_opts.info_log = nullptr;
+ }
+ }
+
+ handles->clear();
+ DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname);
+ impl->versions_.reset(new ReactiveVersionSet(
+ dbname, &impl->immutable_db_options_, impl->file_options_,
+ impl->table_cache_.get(), impl->write_buffer_manager_,
+ &impl->write_controller_));
+ impl->column_family_memtables_.reset(
+ new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet()));
+ impl->wal_in_db_path_ = IsWalDirSameAsDBPath(&impl->immutable_db_options_);
+
+ impl->mutex_.Lock();
+ s = impl->Recover(column_families, true, false, false);
+ if (s.ok()) {
+ for (auto cf : column_families) {
+ auto cfd =
+ impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
+ if (nullptr == cfd) {
+ s = Status::InvalidArgument("Column family not found: ", cf.name);
+ break;
+ }
+ handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
+ }
+ }
+ SuperVersionContext sv_context(true /* create_superversion */);
+ if (s.ok()) {
+ for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+ sv_context.NewSuperVersion();
+ cfd->InstallSuperVersion(&sv_context, &impl->mutex_);
+ }
+ }
+ impl->mutex_.Unlock();
+ sv_context.Clean();
+ if (s.ok()) {
+ *dbptr = impl;
+ for (auto h : *handles) {
+ impl->NewThreadStatusCfInfo(
+ reinterpret_cast<ColumnFamilyHandleImpl*>(h)->cfd());
+ }
+ } else {
+ for (auto h : *handles) {
+ delete h;
+ }
+ handles->clear();
+ delete impl;
+ }
+ return s;
+}
+#else // !ROCKSDB_LITE
+
+Status DB::OpenAsSecondary(const Options& /*options*/,
+ const std::string& /*name*/,
+ const std::string& /*secondary_path*/,
+ DB** /*dbptr*/) {
+ return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+
+Status DB::OpenAsSecondary(
+ const DBOptions& /*db_options*/, const std::string& /*dbname*/,
+ const std::string& /*secondary_path*/,
+ const std::vector<ColumnFamilyDescriptor>& /*column_families*/,
+ std::vector<ColumnFamilyHandle*>* /*handles*/, DB** /*dbptr*/) {
+ return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_secondary.h b/src/rocksdb/db/db_impl/db_impl_secondary.h
new file mode 100644
index 000000000..24f2e7767
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_secondary.h
@@ -0,0 +1,333 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+#include "db/db_impl/db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A wrapper class to hold log reader, log reporter, log status.
+class LogReaderContainer {
+ public:
+ LogReaderContainer()
+ : reader_(nullptr), reporter_(nullptr), status_(nullptr) {}
+ LogReaderContainer(Env* env, std::shared_ptr<Logger> info_log,
+ std::string fname,
+ std::unique_ptr<SequentialFileReader>&& file_reader,
+ uint64_t log_number) {
+ LogReporter* reporter = new LogReporter();
+ status_ = new Status();
+ reporter->env = env;
+ reporter->info_log = info_log.get();
+ reporter->fname = std::move(fname);
+ reporter->status = status_;
+ reporter_ = reporter;
+    // We intentionally make log::Reader do checksumming even if
+ // paranoid_checks==false so that corruptions cause entire commits
+ // to be skipped instead of propagating bad information (like overly
+ // large sequence numbers).
+ reader_ = new log::FragmentBufferedReader(info_log, std::move(file_reader),
+ reporter, true /*checksum*/,
+ log_number);
+ }
+ log::FragmentBufferedReader* reader_;
+ log::Reader::Reporter* reporter_;
+ Status* status_;
+ ~LogReaderContainer() {
+ delete reader_;
+ delete reporter_;
+ delete status_;
+ }
+ private:
+ struct LogReporter : public log::Reader::Reporter {
+ Env* env;
+ Logger* info_log;
+ std::string fname;
+ Status* status; // nullptr if immutable_db_options_.paranoid_checks==false
+ void Corruption(size_t bytes, const Status& s) override {
+ ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s",
+ (this->status == nullptr ? "(ignoring error) " : ""),
+ fname.c_str(), static_cast<int>(bytes),
+ s.ToString().c_str());
+ if (this->status != nullptr && this->status->ok()) {
+ *this->status = s;
+ }
+ }
+ };
+};
+
+// The secondary instance shares access to the same storage as the primary.
+// The secondary is able to read and replay changes described in both the
+// MANIFEST and the WAL files without coordination with the primary.
+// The secondary instance can be opened using `DB::OpenAsSecondary`. After
+// that, it can call `DBImplSecondary::TryCatchUpWithPrimary` to make
+// best-effort attempts to catch up with the primary.
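+//
+// A minimal usage sketch of the public API that ends up here (illustrative
+// only: the paths are hypothetical and error handling is omitted):
+//
+//   Options options;
+//   options.max_open_files = -1;  // required by DB::OpenAsSecondary
+//   DB* db = nullptr;
+//   Status s = DB::OpenAsSecondary(options, "/path/to/primary_db",
+//                                  "/path/to/secondary_path", &db);
+//   if (s.ok()) {
+//     s = db->TryCatchUpWithPrimary();  // replay new MANIFEST/WAL updates
+//     std::string value;
+//     s = db->Get(ReadOptions(), "some_key", &value);
+//     delete db;
+//   }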
+class DBImplSecondary : public DBImpl {
+ public:
+ DBImplSecondary(const DBOptions& options, const std::string& dbname);
+ ~DBImplSecondary() override;
+
+ // Recover by replaying MANIFEST and WAL. Also initialize manifest_reader_
+ // and log_readers_ to facilitate future operations.
+ Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
+ bool read_only, bool error_if_log_file_exist,
+ bool error_if_data_exists_in_logs,
+ uint64_t* = nullptr) override;
+
+ // Implementations of the DB interface
+ using DB::Get;
+ Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* value) override;
+
+ Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* value);
+
+ using DBImpl::NewIterator;
+ Iterator* NewIterator(const ReadOptions&,
+ ColumnFamilyHandle* column_family) override;
+
+ ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& read_options,
+ ColumnFamilyData* cfd,
+ SequenceNumber snapshot,
+ ReadCallback* read_callback);
+
+ Status NewIterators(const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) override;
+
+ using DBImpl::Put;
+ Status Put(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
+ const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::Merge;
+ Status Merge(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
+ const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::Delete;
+ Status Delete(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::SingleDelete;
+ Status SingleDelete(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ Status Write(const WriteOptions& /*options*/,
+ WriteBatch* /*updates*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::CompactRange;
+ Status CompactRange(const CompactRangeOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice* /*begin*/, const Slice* /*end*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::CompactFiles;
+ Status CompactFiles(
+ const CompactionOptions& /*compact_options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*input_file_names*/,
+ const int /*output_level*/, const int /*output_path_id*/ = -1,
+ std::vector<std::string>* const /*output_file_names*/ = nullptr,
+ CompactionJobInfo* /*compaction_job_info*/ = nullptr) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ Status DisableFileDeletions() override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ Status EnableFileDeletions(bool /*force*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ Status GetLiveFiles(std::vector<std::string>&,
+ uint64_t* /*manifest_file_size*/,
+ bool /*flush_memtable*/ = true) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::Flush;
+ Status Flush(const FlushOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::SetDBOptions;
+ Status SetDBOptions(const std::unordered_map<std::string, std::string>&
+ /*options_map*/) override {
+ // Currently not supported because changing certain options may cause
+ // flush/compaction.
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::SetOptions;
+ Status SetOptions(
+ ColumnFamilyHandle* /*cfd*/,
+ const std::unordered_map<std::string, std::string>& /*options_map*/)
+ override {
+ // Currently not supported because changing certain options may cause
+ // flush/compaction and/or write to MANIFEST.
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::SyncWAL;
+ Status SyncWAL() override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DB::IngestExternalFile;
+ Status IngestExternalFile(
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*external_files*/,
+ const IngestExternalFileOptions& /*ingestion_options*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+  // Try to catch up with the primary by reading as much as possible from the
+  // log files until there is nothing more to read or an error is encountered.
+  // If the amount of information in the log files to process is huge, this
+  // method can take a long time due to all the I/O and CPU costs.
+ Status TryCatchUpWithPrimary() override;
+
+  // Try to find the log reader for log_number in the log_readers_ map;
+  // initialize one if it doesn't exist.
+ Status MaybeInitLogReader(uint64_t log_number,
+ log::FragmentBufferedReader** log_reader);
+
+  // Check if all live files exist on the file system and that their file
+  // sizes match the in-memory records. It is possible that some live files
+  // may have been deleted by the primary. In this case, CheckConsistency()
+  // does not flag the missing file as an inconsistency.
+ Status CheckConsistency() override;
+
+ protected:
+  // ColumnFamilyCollector is a write batch handler that does nothing
+  // except record the unique column family IDs
+ class ColumnFamilyCollector : public WriteBatch::Handler {
+ std::unordered_set<uint32_t> column_family_ids_;
+
+ Status AddColumnFamilyId(uint32_t column_family_id) {
+ if (column_family_ids_.find(column_family_id) ==
+ column_family_ids_.end()) {
+ column_family_ids_.insert(column_family_id);
+ }
+ return Status::OK();
+ }
+
+ public:
+ explicit ColumnFamilyCollector() {}
+
+ ~ColumnFamilyCollector() override {}
+
+ Status PutCF(uint32_t column_family_id, const Slice&,
+ const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ Status DeleteCF(uint32_t column_family_id, const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ Status SingleDeleteCF(uint32_t column_family_id, const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ Status DeleteRangeCF(uint32_t column_family_id, const Slice&,
+ const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ Status MergeCF(uint32_t column_family_id, const Slice&,
+ const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ Status PutBlobIndexCF(uint32_t column_family_id, const Slice&,
+ const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ const std::unordered_set<uint32_t>& column_families() const {
+ return column_family_ids_;
+ }
+ };
+
+ Status CollectColumnFamilyIdsFromWriteBatch(
+ const WriteBatch& batch, std::vector<uint32_t>* column_family_ids) {
+ assert(column_family_ids != nullptr);
+ column_family_ids->clear();
+ ColumnFamilyCollector handler;
+ Status s = batch.Iterate(&handler);
+ if (s.ok()) {
+ for (const auto& cf : handler.column_families()) {
+ column_family_ids->push_back(cf);
+ }
+ }
+ return s;
+ }
+
+ bool OwnTablesAndLogs() const override {
+ // Currently, the secondary instance does not own the database files. It
+ // simply opens the files of the primary instance and tracks their file
+ // descriptors until they become obsolete. In the future, the secondary may
+ // create links to database files. OwnTablesAndLogs will return true then.
+ return false;
+ }
+
+ private:
+ friend class DB;
+
+ // No copying allowed
+ DBImplSecondary(const DBImplSecondary&);
+ void operator=(const DBImplSecondary&);
+
+ using DBImpl::Recover;
+
+ Status FindAndRecoverLogFiles(
+ std::unordered_set<ColumnFamilyData*>* cfds_changed,
+ JobContext* job_context);
+ Status FindNewLogNumbers(std::vector<uint64_t>* logs);
+ // After manifest recovery, replay WALs and refresh log_readers_ if necessary
+ // REQUIRES: log_numbers are sorted in ascending order
+ Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
+ SequenceNumber* next_sequence,
+ std::unordered_set<ColumnFamilyData*>* cfds_changed,
+ JobContext* job_context);
+
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader_;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter_;
+ std::unique_ptr<Status> manifest_reader_status_;
+
+  // Cache log readers for each log number, used to continue WAL replay
+  // after recovery
+ std::map<uint64_t, std::unique_ptr<LogReaderContainer>> log_readers_;
+
+ // Current WAL number replayed for each column family.
+ std::unordered_map<ColumnFamilyData*, uint64_t> cfd_to_current_log_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_impl/db_impl_write.cc b/src/rocksdb/db/db_impl/db_impl_write.cc
new file mode 100644
index 000000000..8f6f685e4
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_write.cc
@@ -0,0 +1,1839 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_impl/db_impl.h"
+
+#include <cinttypes>
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "monitoring/perf_context_imp.h"
+#include "options/options_helper.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Convenience methods
+Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& val) {
+ return DB::Put(o, column_family, key, val);
+}
+
+Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& val) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ if (!cfh->cfd()->ioptions()->merge_operator) {
+ return Status::NotSupported("Provide a merge_operator when opening DB");
+ } else {
+ return DB::Merge(o, column_family, key, val);
+ }
+}
+
+Status DBImpl::Delete(const WriteOptions& write_options,
+ ColumnFamilyHandle* column_family, const Slice& key) {
+ return DB::Delete(write_options, column_family, key);
+}
+
+Status DBImpl::SingleDelete(const WriteOptions& write_options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key) {
+ return DB::SingleDelete(write_options, column_family, key);
+}
+
+void DBImpl::SetRecoverableStatePreReleaseCallback(
+ PreReleaseCallback* callback) {
+ recoverable_state_pre_release_callback_.reset(callback);
+}
+
+Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) {
+ return WriteImpl(write_options, my_batch, nullptr, nullptr);
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::WriteWithCallback(const WriteOptions& write_options,
+ WriteBatch* my_batch,
+ WriteCallback* callback) {
+ return WriteImpl(write_options, my_batch, callback, nullptr);
+}
+#endif // ROCKSDB_LITE
+
+// The main write queue. This is the only write queue that updates LastSequence.
+// When using one write queue, the same sequence also indicates the last
+// published sequence.
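+//
+// As a rough map (summarizing the branches below, not additional behavior):
+// WriteImpl dispatches to WriteImplWALOnly on the second queue for WAL-only
+// batches (two_write_queues_ with disable_memtable), to WriteImplWALOnly plus
+// UnorderedWriteMemtable for unordered_write, to PipelinedWriteImpl when
+// enable_pipelined_write is set, and otherwise falls through to the default
+// group-commit path implemented in the rest of this function.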
+Status DBImpl::WriteImpl(const WriteOptions& write_options,
+ WriteBatch* my_batch, WriteCallback* callback,
+ uint64_t* log_used, uint64_t log_ref,
+ bool disable_memtable, uint64_t* seq_used,
+ size_t batch_cnt,
+ PreReleaseCallback* pre_release_callback) {
+ assert(!seq_per_batch_ || batch_cnt != 0);
+ if (my_batch == nullptr) {
+ return Status::Corruption("Batch is nullptr!");
+ }
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ tracer_->Write(my_batch);
+ }
+ }
+ if (write_options.sync && write_options.disableWAL) {
+ return Status::InvalidArgument("Sync writes has to enable WAL.");
+ }
+ if (two_write_queues_ && immutable_db_options_.enable_pipelined_write) {
+ return Status::NotSupported(
+ "pipelined_writes is not compatible with concurrent prepares");
+ }
+ if (seq_per_batch_ && immutable_db_options_.enable_pipelined_write) {
+ // TODO(yiwu): update pipeline write with seq_per_batch and batch_cnt
+ return Status::NotSupported(
+ "pipelined_writes is not compatible with seq_per_batch");
+ }
+ if (immutable_db_options_.unordered_write &&
+ immutable_db_options_.enable_pipelined_write) {
+ return Status::NotSupported(
+ "pipelined_writes is not compatible with unordered_write");
+ }
+ // Otherwise IsLatestPersistentState optimization does not make sense
+ assert(!WriteBatchInternal::IsLatestPersistentState(my_batch) ||
+ disable_memtable);
+
+ Status status;
+ if (write_options.low_pri) {
+ status = ThrottleLowPriWritesIfNeeded(write_options, my_batch);
+ if (!status.ok()) {
+ return status;
+ }
+ }
+
+ if (two_write_queues_ && disable_memtable) {
+ AssignOrder assign_order =
+ seq_per_batch_ ? kDoAssignOrder : kDontAssignOrder;
+    // Otherwise these are WAL-only Prepare batches in the WriteCommitted
+    // policy and they don't consume a sequence.
+ return WriteImplWALOnly(&nonmem_write_thread_, write_options, my_batch,
+ callback, log_used, log_ref, seq_used, batch_cnt,
+ pre_release_callback, assign_order,
+ kDontPublishLastSeq, disable_memtable);
+ }
+
+ if (immutable_db_options_.unordered_write) {
+ const size_t sub_batch_cnt = batch_cnt != 0
+ ? batch_cnt
+ // every key is a sub-batch consuming a seq
+ : WriteBatchInternal::Count(my_batch);
+ uint64_t seq;
+ // Use a write thread to i) optimize for WAL write, ii) publish last
+    // sequence in increasing order, iii) call pre_release_callback serially
+ status = WriteImplWALOnly(&write_thread_, write_options, my_batch, callback,
+ log_used, log_ref, &seq, sub_batch_cnt,
+ pre_release_callback, kDoAssignOrder,
+ kDoPublishLastSeq, disable_memtable);
+ TEST_SYNC_POINT("DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL");
+ if (!status.ok()) {
+ return status;
+ }
+ if (seq_used) {
+ *seq_used = seq;
+ }
+ if (!disable_memtable) {
+ TEST_SYNC_POINT("DBImpl::WriteImpl:BeforeUnorderedWriteMemtable");
+ status = UnorderedWriteMemtable(write_options, my_batch, callback,
+ log_ref, seq, sub_batch_cnt);
+ }
+ return status;
+ }
+
+ if (immutable_db_options_.enable_pipelined_write) {
+ return PipelinedWriteImpl(write_options, my_batch, callback, log_used,
+ log_ref, disable_memtable, seq_used);
+ }
+
+ PERF_TIMER_GUARD(write_pre_and_post_process_time);
+ WriteThread::Writer w(write_options, my_batch, callback, log_ref,
+ disable_memtable, batch_cnt, pre_release_callback);
+
+ if (!write_options.disableWAL) {
+ RecordTick(stats_, WRITE_WITH_WAL);
+ }
+
+ StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE);
+
+ write_thread_.JoinBatchGroup(&w);
+ if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) {
+ // we are a non-leader in a parallel group
+
+ if (w.ShouldWriteToMemtable()) {
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+ PERF_TIMER_GUARD(write_memtable_time);
+
+ ColumnFamilyMemTablesImpl column_family_memtables(
+ versions_->GetColumnFamilySet());
+ w.status = WriteBatchInternal::InsertInto(
+ &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+ &trim_history_scheduler_,
+ write_options.ignore_missing_column_families, 0 /*log_number*/, this,
+ true /*concurrent_memtable_writes*/, seq_per_batch_, w.batch_cnt,
+ batch_per_txn_, write_options.memtable_insert_hint_per_batch);
+
+ PERF_TIMER_START(write_pre_and_post_process_time);
+ }
+
+ if (write_thread_.CompleteParallelMemTableWriter(&w)) {
+      // we're responsible for exiting the batch group
+ // TODO(myabandeh): propagate status to write_group
+ auto last_sequence = w.write_group->last_sequence;
+ versions_->SetLastSequence(last_sequence);
+ MemTableInsertStatusCheck(w.status);
+ write_thread_.ExitAsBatchGroupFollower(&w);
+ }
+ assert(w.state == WriteThread::STATE_COMPLETED);
+ // STATE_COMPLETED conditional below handles exit
+
+ status = w.FinalStatus();
+ }
+ if (w.state == WriteThread::STATE_COMPLETED) {
+ if (log_used != nullptr) {
+ *log_used = w.log_used;
+ }
+ if (seq_used != nullptr) {
+ *seq_used = w.sequence;
+ }
+ // write is complete and leader has updated sequence
+ return w.FinalStatus();
+ }
+ // else we are the leader of the write batch group
+ assert(w.state == WriteThread::STATE_GROUP_LEADER);
+
+  // Once it reaches this point, the current writer "w" will try to do its
+  // write job. It may also pick up some of the remaining writers in the
+  // "writers_" queue when suitable, and finish them in the same write batch.
+  // This is how a write job could be done by another writer.
+ WriteContext write_context;
+ WriteThread::WriteGroup write_group;
+ bool in_parallel_group = false;
+ uint64_t last_sequence = kMaxSequenceNumber;
+
+ mutex_.Lock();
+
+ bool need_log_sync = write_options.sync;
+ bool need_log_dir_sync = need_log_sync && !log_dir_synced_;
+ if (!two_write_queues_ || !disable_memtable) {
+    // With concurrent writes we preprocess only in the write thread that also
+    // writes to the memtable, to avoid synchronization issues on shared data
+    // structures with the other thread
+
+ // PreprocessWrite does its own perf timing.
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+ status = PreprocessWrite(write_options, &need_log_sync, &write_context);
+ if (!two_write_queues_) {
+ // Assign it after ::PreprocessWrite since the sequence might advance
+ // inside it by WriteRecoverableState
+ last_sequence = versions_->LastSequence();
+ }
+
+ PERF_TIMER_START(write_pre_and_post_process_time);
+ }
+ log::Writer* log_writer = logs_.back().writer;
+
+ mutex_.Unlock();
+
+ // Add to log and apply to memtable. We can release the lock
+ // during this phase since &w is currently responsible for logging
+ // and protects against concurrent loggers and concurrent writes
+ // into memtables
+
+ TEST_SYNC_POINT("DBImpl::WriteImpl:BeforeLeaderEnters");
+ last_batch_group_size_ =
+ write_thread_.EnterAsBatchGroupLeader(&w, &write_group);
+
+ if (status.ok()) {
+ // Rules for when we can update the memtable concurrently
+ // 1. supported by memtable
+ // 2. Puts are not okay if inplace_update_support
+ // 3. Merges are not okay
+ //
+ // Rules 1..2 are enforced by checking the options
+ // during startup (CheckConcurrentWritesSupported), so if
+ // options.allow_concurrent_memtable_write is true then they can be
+ // assumed to be true. Rule 3 is checked for each batch. We could
+    // relax rule 2 if we could prevent write batches from referring
+ // more than once to a particular key.
+ bool parallel = immutable_db_options_.allow_concurrent_memtable_write &&
+ write_group.size > 1;
+ size_t total_count = 0;
+ size_t valid_batches = 0;
+ size_t total_byte_size = 0;
+ size_t pre_release_callback_cnt = 0;
+ for (auto* writer : write_group) {
+ if (writer->CheckCallback(this)) {
+ valid_batches += writer->batch_cnt;
+ if (writer->ShouldWriteToMemtable()) {
+ total_count += WriteBatchInternal::Count(writer->batch);
+ parallel = parallel && !writer->batch->HasMerge();
+ }
+ total_byte_size = WriteBatchInternal::AppendedByteSize(
+ total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
+ if (writer->pre_release_callback) {
+ pre_release_callback_cnt++;
+ }
+ }
+ }
+    // Note about seq_per_batch_: either disableWAL is set for the entire
+    // write group or it is not. In either case we increment the seq for each
+    // write batch with no failed callback. This means that there could be a
+    // batch with disable_memtable in between; although we do not write this
+    // batch to the memtable, it still consumes a seq. Otherwise, if
+    // !seq_per_batch_, we increment the seq per valid key written to the
+    // memtable.
+ size_t seq_inc = seq_per_batch_ ? valid_batches : total_count;
+
+ const bool concurrent_update = two_write_queues_;
+ // Update stats while we are an exclusive group leader, so we know
+ // that nobody else can be writing to these particular stats.
+ // We're optimistic, updating the stats before we successfully
+ // commit. That lets us release our leader status early.
+ auto stats = default_cf_internal_stats_;
+ stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count,
+ concurrent_update);
+ RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count);
+ stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size,
+ concurrent_update);
+ RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1,
+ concurrent_update);
+ RecordTick(stats_, WRITE_DONE_BY_SELF);
+ auto write_done_by_other = write_group.size - 1;
+ if (write_done_by_other > 0) {
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther,
+ write_done_by_other, concurrent_update);
+ RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other);
+ }
+ RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size);
+
+ if (write_options.disableWAL) {
+ has_unpersisted_data_.store(true, std::memory_order_relaxed);
+ }
+
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+ if (!two_write_queues_) {
+ if (status.ok() && !write_options.disableWAL) {
+ PERF_TIMER_GUARD(write_wal_time);
+ status = WriteToWAL(write_group, log_writer, log_used, need_log_sync,
+ need_log_dir_sync, last_sequence + 1);
+ }
+ } else {
+ if (status.ok() && !write_options.disableWAL) {
+ PERF_TIMER_GUARD(write_wal_time);
+ // LastAllocatedSequence is increased inside WriteToWAL under
+ // wal_write_mutex_ to ensure ordered events in WAL
+ status = ConcurrentWriteToWAL(write_group, log_used, &last_sequence,
+ seq_inc);
+ } else {
+ // Otherwise we inc seq number for memtable writes
+ last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
+ }
+ }
+ assert(last_sequence != kMaxSequenceNumber);
+ const SequenceNumber current_sequence = last_sequence + 1;
+ last_sequence += seq_inc;
+
+ // PreReleaseCallback is called after WAL write and before memtable write
+ if (status.ok()) {
+ SequenceNumber next_sequence = current_sequence;
+ size_t index = 0;
+ // Note: the logic for advancing seq here must be consistent with the
+ // logic in WriteBatchInternal::InsertInto(write_group...) as well as
+ // with WriteBatchInternal::InsertInto(write_batch...) that is called on
+ // the merged batch during recovery from the WAL.
+ for (auto* writer : write_group) {
+ if (writer->CallbackFailed()) {
+ continue;
+ }
+ writer->sequence = next_sequence;
+ if (writer->pre_release_callback) {
+ Status ws = writer->pre_release_callback->Callback(
+ writer->sequence, disable_memtable, writer->log_used, index++,
+ pre_release_callback_cnt);
+ if (!ws.ok()) {
+ status = ws;
+ break;
+ }
+ }
+ if (seq_per_batch_) {
+ assert(writer->batch_cnt);
+ next_sequence += writer->batch_cnt;
+ } else if (writer->ShouldWriteToMemtable()) {
+ next_sequence += WriteBatchInternal::Count(writer->batch);
+ }
+ }
+ }
+
+ if (status.ok()) {
+ PERF_TIMER_GUARD(write_memtable_time);
+
+ if (!parallel) {
+ // w.sequence will be set inside InsertInto
+ w.status = WriteBatchInternal::InsertInto(
+ write_group, current_sequence, column_family_memtables_.get(),
+ &flush_scheduler_, &trim_history_scheduler_,
+ write_options.ignore_missing_column_families,
+ 0 /*recovery_log_number*/, this, parallel, seq_per_batch_,
+ batch_per_txn_);
+ } else {
+ write_group.last_sequence = last_sequence;
+ write_thread_.LaunchParallelMemTableWriters(&write_group);
+ in_parallel_group = true;
+
+        // Each parallel follower does its own writes. The leader should
+        // also do its own.
+ if (w.ShouldWriteToMemtable()) {
+ ColumnFamilyMemTablesImpl column_family_memtables(
+ versions_->GetColumnFamilySet());
+ assert(w.sequence == current_sequence);
+ w.status = WriteBatchInternal::InsertInto(
+ &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+ &trim_history_scheduler_,
+ write_options.ignore_missing_column_families, 0 /*log_number*/,
+ this, true /*concurrent_memtable_writes*/, seq_per_batch_,
+ w.batch_cnt, batch_per_txn_,
+ write_options.memtable_insert_hint_per_batch);
+ }
+ }
+ if (seq_used != nullptr) {
+ *seq_used = w.sequence;
+ }
+ }
+ }
+ PERF_TIMER_START(write_pre_and_post_process_time);
+
+ if (!w.CallbackFailed()) {
+ WriteStatusCheck(status);
+ }
+
+ if (need_log_sync) {
+ mutex_.Lock();
+ MarkLogsSynced(logfile_number_, need_log_dir_sync, status);
+ mutex_.Unlock();
+ // Requesting sync with two_write_queues_ is expected to be very rare. We
+ // hence provide a simple implementation that is not necessarily efficient.
+ if (two_write_queues_) {
+ if (manual_wal_flush_) {
+ status = FlushWAL(true);
+ } else {
+ status = SyncWAL();
+ }
+ }
+ }
+
+ bool should_exit_batch_group = true;
+ if (in_parallel_group) {
+    // CompleteParallelMemTableWriter returns true if this thread should
+    // handle the exit; false means somebody else did
+ should_exit_batch_group = write_thread_.CompleteParallelMemTableWriter(&w);
+ }
+ if (should_exit_batch_group) {
+ if (status.ok()) {
+ // Note: if we are to resume after non-OK statuses we need to revisit how
+      // we react to non-OK statuses here.
+ versions_->SetLastSequence(last_sequence);
+ }
+ MemTableInsertStatusCheck(w.status);
+ write_thread_.ExitAsBatchGroupLeader(write_group, status);
+ }
+
+ if (status.ok()) {
+ status = w.FinalStatus();
+ }
+ return status;
+}
+
+Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
+ WriteBatch* my_batch, WriteCallback* callback,
+ uint64_t* log_used, uint64_t log_ref,
+ bool disable_memtable, uint64_t* seq_used) {
+ PERF_TIMER_GUARD(write_pre_and_post_process_time);
+ StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE);
+
+ WriteContext write_context;
+
+ WriteThread::Writer w(write_options, my_batch, callback, log_ref,
+ disable_memtable);
+ write_thread_.JoinBatchGroup(&w);
+ if (w.state == WriteThread::STATE_GROUP_LEADER) {
+ WriteThread::WriteGroup wal_write_group;
+ if (w.callback && !w.callback->AllowWriteBatching()) {
+ write_thread_.WaitForMemTableWriters();
+ }
+ mutex_.Lock();
+ bool need_log_sync = !write_options.disableWAL && write_options.sync;
+ bool need_log_dir_sync = need_log_sync && !log_dir_synced_;
+ // PreprocessWrite does its own perf timing.
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+ w.status = PreprocessWrite(write_options, &need_log_sync, &write_context);
+ PERF_TIMER_START(write_pre_and_post_process_time);
+ log::Writer* log_writer = logs_.back().writer;
+ mutex_.Unlock();
+
+    // This can set a non-OK status if the callback fails.
+ last_batch_group_size_ =
+ write_thread_.EnterAsBatchGroupLeader(&w, &wal_write_group);
+ const SequenceNumber current_sequence =
+ write_thread_.UpdateLastSequence(versions_->LastSequence()) + 1;
+ size_t total_count = 0;
+ size_t total_byte_size = 0;
+
+ if (w.status.ok()) {
+ SequenceNumber next_sequence = current_sequence;
+ for (auto writer : wal_write_group) {
+ if (writer->CheckCallback(this)) {
+ if (writer->ShouldWriteToMemtable()) {
+ writer->sequence = next_sequence;
+ size_t count = WriteBatchInternal::Count(writer->batch);
+ next_sequence += count;
+ total_count += count;
+ }
+ total_byte_size = WriteBatchInternal::AppendedByteSize(
+ total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
+ }
+ }
+ if (w.disable_wal) {
+ has_unpersisted_data_.store(true, std::memory_order_relaxed);
+ }
+ write_thread_.UpdateLastSequence(current_sequence + total_count - 1);
+ }
+
+ auto stats = default_cf_internal_stats_;
+ stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count);
+ RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count);
+ stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size);
+ RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
+ RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size);
+
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+ if (w.status.ok() && !write_options.disableWAL) {
+ PERF_TIMER_GUARD(write_wal_time);
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1);
+ RecordTick(stats_, WRITE_DONE_BY_SELF, 1);
+ if (wal_write_group.size > 1) {
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther,
+ wal_write_group.size - 1);
+ RecordTick(stats_, WRITE_DONE_BY_OTHER, wal_write_group.size - 1);
+ }
+ w.status = WriteToWAL(wal_write_group, log_writer, log_used,
+ need_log_sync, need_log_dir_sync, current_sequence);
+ }
+
+ if (!w.CallbackFailed()) {
+ WriteStatusCheck(w.status);
+ }
+
+ if (need_log_sync) {
+ mutex_.Lock();
+ MarkLogsSynced(logfile_number_, need_log_dir_sync, w.status);
+ mutex_.Unlock();
+ }
+
+ write_thread_.ExitAsBatchGroupLeader(wal_write_group, w.status);
+ }
+
+ WriteThread::WriteGroup memtable_write_group;
+ if (w.state == WriteThread::STATE_MEMTABLE_WRITER_LEADER) {
+ PERF_TIMER_GUARD(write_memtable_time);
+ assert(w.ShouldWriteToMemtable());
+ write_thread_.EnterAsMemTableWriter(&w, &memtable_write_group);
+ if (memtable_write_group.size > 1 &&
+ immutable_db_options_.allow_concurrent_memtable_write) {
+ write_thread_.LaunchParallelMemTableWriters(&memtable_write_group);
+ } else {
+ memtable_write_group.status = WriteBatchInternal::InsertInto(
+ memtable_write_group, w.sequence, column_family_memtables_.get(),
+ &flush_scheduler_, &trim_history_scheduler_,
+ write_options.ignore_missing_column_families, 0 /*log_number*/, this,
+ false /*concurrent_memtable_writes*/, seq_per_batch_, batch_per_txn_);
+ versions_->SetLastSequence(memtable_write_group.last_sequence);
+ write_thread_.ExitAsMemTableWriter(&w, memtable_write_group);
+ }
+ }
+
+ if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) {
+ assert(w.ShouldWriteToMemtable());
+ ColumnFamilyMemTablesImpl column_family_memtables(
+ versions_->GetColumnFamilySet());
+ w.status = WriteBatchInternal::InsertInto(
+ &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+ &trim_history_scheduler_, write_options.ignore_missing_column_families,
+ 0 /*log_number*/, this, true /*concurrent_memtable_writes*/,
+ false /*seq_per_batch*/, 0 /*batch_cnt*/, true /*batch_per_txn*/,
+ write_options.memtable_insert_hint_per_batch);
+ if (write_thread_.CompleteParallelMemTableWriter(&w)) {
+ MemTableInsertStatusCheck(w.status);
+ versions_->SetLastSequence(w.write_group->last_sequence);
+ write_thread_.ExitAsMemTableWriter(&w, *w.write_group);
+ }
+ }
+ if (seq_used != nullptr) {
+ *seq_used = w.sequence;
+ }
+
+ assert(w.state == WriteThread::STATE_COMPLETED);
+ return w.FinalStatus();
+}
+
+Status DBImpl::UnorderedWriteMemtable(const WriteOptions& write_options,
+ WriteBatch* my_batch,
+ WriteCallback* callback, uint64_t log_ref,
+ SequenceNumber seq,
+ const size_t sub_batch_cnt) {
+ PERF_TIMER_GUARD(write_pre_and_post_process_time);
+ StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE);
+
+ WriteThread::Writer w(write_options, my_batch, callback, log_ref,
+ false /*disable_memtable*/);
+
+ if (w.CheckCallback(this) && w.ShouldWriteToMemtable()) {
+ w.sequence = seq;
+ size_t total_count = WriteBatchInternal::Count(my_batch);
+ InternalStats* stats = default_cf_internal_stats_;
+ stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count);
+ RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count);
+
+ ColumnFamilyMemTablesImpl column_family_memtables(
+ versions_->GetColumnFamilySet());
+ w.status = WriteBatchInternal::InsertInto(
+ &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+ &trim_history_scheduler_, write_options.ignore_missing_column_families,
+ 0 /*log_number*/, this, true /*concurrent_memtable_writes*/,
+ seq_per_batch_, sub_batch_cnt, true /*batch_per_txn*/,
+ write_options.memtable_insert_hint_per_batch);
+
+ WriteStatusCheck(w.status);
+ if (write_options.disableWAL) {
+ has_unpersisted_data_.store(true, std::memory_order_relaxed);
+ }
+ }
+
+ size_t pending_cnt = pending_memtable_writes_.fetch_sub(1) - 1;
+ if (pending_cnt == 0) {
+    // switch_cv_ waits until pending_memtable_writes_ == 0. Locking its mutex
+    // before notifying ensures that the cv is in the waiting state when it is
+    // notified, thus not missing the update to pending_memtable_writes_ even
+    // though it is not modified under the mutex.
+ std::lock_guard<std::mutex> lck(switch_mutex_);
+ switch_cv_.notify_all();
+ }
+
+ if (!w.FinalStatus().ok()) {
+ return w.FinalStatus();
+ }
+ return Status::OK();
+}
+
+// The 2nd write queue. If enabled, it will be used only for WAL-only writes.
+// This is the only queue that updates LastPublishedSequence, which is only
+// applicable in a two-queue setting.
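+// Illustrative callers, as seen in WriteImpl above: WAL-only batches such as
+// two-phase-commit Prepare under the WriteCommitted policy when
+// two_write_queues_ is enabled (with disable_memtable == true), and the
+// unordered_write path, which also goes through this function (with the main
+// write queue) to write the WAL and publish the last sequence before the
+// memtables are updated concurrently.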
+Status DBImpl::WriteImplWALOnly(
+ WriteThread* write_thread, const WriteOptions& write_options,
+ WriteBatch* my_batch, WriteCallback* callback, uint64_t* log_used,
+ const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt,
+ PreReleaseCallback* pre_release_callback, const AssignOrder assign_order,
+ const PublishLastSeq publish_last_seq, const bool disable_memtable) {
+ Status status;
+ PERF_TIMER_GUARD(write_pre_and_post_process_time);
+ WriteThread::Writer w(write_options, my_batch, callback, log_ref,
+ disable_memtable, sub_batch_cnt, pre_release_callback);
+ RecordTick(stats_, WRITE_WITH_WAL);
+ StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE);
+
+ write_thread->JoinBatchGroup(&w);
+ assert(w.state != WriteThread::STATE_PARALLEL_MEMTABLE_WRITER);
+ if (w.state == WriteThread::STATE_COMPLETED) {
+ if (log_used != nullptr) {
+ *log_used = w.log_used;
+ }
+ if (seq_used != nullptr) {
+ *seq_used = w.sequence;
+ }
+ return w.FinalStatus();
+ }
+ // else we are the leader of the write batch group
+ assert(w.state == WriteThread::STATE_GROUP_LEADER);
+
+ if (publish_last_seq == kDoPublishLastSeq) {
+ // Currently we only use kDoPublishLastSeq in unordered_write
+ assert(immutable_db_options_.unordered_write);
+ WriteContext write_context;
+ if (error_handler_.IsDBStopped()) {
+ status = error_handler_.GetBGError();
+ }
+ // TODO(myabandeh): Make preliminary checks thread-safe so we could do them
+ // without paying the cost of obtaining the mutex.
+ if (status.ok()) {
+ InstrumentedMutexLock l(&mutex_);
+ bool need_log_sync = false;
+ status = PreprocessWrite(write_options, &need_log_sync, &write_context);
+ WriteStatusCheck(status);
+ }
+ if (!status.ok()) {
+ WriteThread::WriteGroup write_group;
+ write_thread->EnterAsBatchGroupLeader(&w, &write_group);
+ write_thread->ExitAsBatchGroupLeader(write_group, status);
+ return status;
+ }
+ }
+
+ WriteThread::WriteGroup write_group;
+ uint64_t last_sequence;
+ write_thread->EnterAsBatchGroupLeader(&w, &write_group);
+ // Note: no need to update last_batch_group_size_ here since the batch writes
+ // to WAL only
+
+ size_t pre_release_callback_cnt = 0;
+ size_t total_byte_size = 0;
+ for (auto* writer : write_group) {
+ if (writer->CheckCallback(this)) {
+ total_byte_size = WriteBatchInternal::AppendedByteSize(
+ total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
+ if (writer->pre_release_callback) {
+ pre_release_callback_cnt++;
+ }
+ }
+ }
+
+ const bool concurrent_update = true;
+ // Update stats while we are an exclusive group leader, so we know
+ // that nobody else can be writing to these particular stats.
+ // We're optimistic, updating the stats before we successfully
+ // commit. That lets us release our leader status early.
+ auto stats = default_cf_internal_stats_;
+ stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size,
+ concurrent_update);
+ RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1,
+ concurrent_update);
+ RecordTick(stats_, WRITE_DONE_BY_SELF);
+ auto write_done_by_other = write_group.size - 1;
+ if (write_done_by_other > 0) {
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther,
+ write_done_by_other, concurrent_update);
+ RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other);
+ }
+ RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size);
+
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+ PERF_TIMER_GUARD(write_wal_time);
+ // LastAllocatedSequence is increased inside WriteToWAL under
+ // wal_write_mutex_ to ensure ordered events in WAL
+ size_t seq_inc = 0 /* total_count */;
+ if (assign_order == kDoAssignOrder) {
+ size_t total_batch_cnt = 0;
+ for (auto* writer : write_group) {
+ assert(writer->batch_cnt || !seq_per_batch_);
+ if (!writer->CallbackFailed()) {
+ total_batch_cnt += writer->batch_cnt;
+ }
+ }
+ seq_inc = total_batch_cnt;
+ }
+ if (!write_options.disableWAL) {
+ status =
+ ConcurrentWriteToWAL(write_group, log_used, &last_sequence, seq_inc);
+ } else {
+ // Otherwise we inc seq number to do solely the seq allocation
+ last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
+ }
+
+ size_t memtable_write_cnt = 0;
+ auto curr_seq = last_sequence + 1;
+ for (auto* writer : write_group) {
+ if (writer->CallbackFailed()) {
+ continue;
+ }
+ writer->sequence = curr_seq;
+ if (assign_order == kDoAssignOrder) {
+ assert(writer->batch_cnt || !seq_per_batch_);
+ curr_seq += writer->batch_cnt;
+ }
+ if (!writer->disable_memtable) {
+ memtable_write_cnt++;
+ }
+ // else seq advances only by memtable writes
+ }
+ if (status.ok() && write_options.sync) {
+ assert(!write_options.disableWAL);
+ // Requesting sync with two_write_queues_ is expected to be very rare. We
+    // hence provide a simple implementation that is not necessarily efficient.
+ if (manual_wal_flush_) {
+ status = FlushWAL(true);
+ } else {
+ status = SyncWAL();
+ }
+ }
+ PERF_TIMER_START(write_pre_and_post_process_time);
+
+ if (!w.CallbackFailed()) {
+ WriteStatusCheck(status);
+ }
+ if (status.ok()) {
+ size_t index = 0;
+ for (auto* writer : write_group) {
+ if (!writer->CallbackFailed() && writer->pre_release_callback) {
+ assert(writer->sequence != kMaxSequenceNumber);
+ Status ws = writer->pre_release_callback->Callback(
+ writer->sequence, disable_memtable, writer->log_used, index++,
+ pre_release_callback_cnt);
+ if (!ws.ok()) {
+ status = ws;
+ break;
+ }
+ }
+ }
+ }
+ if (publish_last_seq == kDoPublishLastSeq) {
+ versions_->SetLastSequence(last_sequence + seq_inc);
+ // Currently we only use kDoPublishLastSeq in unordered_write
+ assert(immutable_db_options_.unordered_write);
+ }
+ if (immutable_db_options_.unordered_write && status.ok()) {
+ pending_memtable_writes_ += memtable_write_cnt;
+ }
+ write_thread->ExitAsBatchGroupLeader(write_group, status);
+ if (status.ok()) {
+ status = w.FinalStatus();
+ }
+ if (seq_used != nullptr) {
+ *seq_used = w.sequence;
+ }
+ return status;
+}
+
+void DBImpl::WriteStatusCheck(const Status& status) {
+ // Is setting bg_error_ enough here? This will at least stop
+ // compaction and fail any further writes.
+ if (immutable_db_options_.paranoid_checks && !status.ok() &&
+ !status.IsBusy() && !status.IsIncomplete()) {
+ mutex_.Lock();
+ error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback);
+ mutex_.Unlock();
+ }
+}
+
+void DBImpl::MemTableInsertStatusCheck(const Status& status) {
+ // A non-OK status here indicates that the state implied by the
+ // WAL has diverged from the in-memory state. This could be
+ // because of a corrupt write_batch (very bad), or because the
+ // client specified an invalid column family and didn't specify
+ // ignore_missing_column_families.
+ if (!status.ok()) {
+ mutex_.Lock();
+ assert(!error_handler_.IsBGWorkStopped());
+ error_handler_.SetBGError(status, BackgroundErrorReason::kMemTable);
+ mutex_.Unlock();
+ }
+}
+
+Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
+ bool* need_log_sync,
+ WriteContext* write_context) {
+ mutex_.AssertHeld();
+ assert(write_context != nullptr && need_log_sync != nullptr);
+ Status status;
+
+ if (error_handler_.IsDBStopped()) {
+ status = error_handler_.GetBGError();
+ }
+
+ PERF_TIMER_GUARD(write_scheduling_flushes_compactions_time);
+
+ assert(!single_column_family_mode_ ||
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1);
+ if (UNLIKELY(status.ok() && !single_column_family_mode_ &&
+ total_log_size_ > GetMaxTotalWalSize())) {
+ WaitForPendingWrites();
+ status = SwitchWAL(write_context);
+ }
+
+ if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldFlush())) {
+ // Before a new memtable is added in SwitchMemtable(),
+ // write_buffer_manager_->ShouldFlush() will keep returning true. If another
+ // thread is writing to another DB with the same write buffer, they may also
+ // be flushed. We may end up with flushing much more DBs than needed. It's
+ // suboptimal but still correct.
+ WaitForPendingWrites();
+ status = HandleWriteBufferFull(write_context);
+ }
+
+ if (UNLIKELY(status.ok() && !trim_history_scheduler_.Empty())) {
+ status = TrimMemtableHistory(write_context);
+ }
+
+ if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) {
+ WaitForPendingWrites();
+ status = ScheduleFlushes(write_context);
+ }
+
+ PERF_TIMER_STOP(write_scheduling_flushes_compactions_time);
+ PERF_TIMER_GUARD(write_pre_and_post_process_time);
+
+ if (UNLIKELY(status.ok() && (write_controller_.IsStopped() ||
+ write_controller_.NeedsDelay()))) {
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+ PERF_TIMER_GUARD(write_delay_time);
+    // We don't know the size of the current batch, so we always use the size
+    // of the previous one. It might create a fairness issue in that
+    // expiration might happen for smaller writes while larger writes can go
+    // through. Can optimize it if it becomes an issue.
+ status = DelayWrite(last_batch_group_size_, write_options);
+ PERF_TIMER_START(write_pre_and_post_process_time);
+ }
+
+ if (status.ok() && *need_log_sync) {
+    // Wait until the parallel syncs are finished. Any sync process has to
+    // sync the front log too, so it is enough to check the status of front().
+    // We do a while loop since log_sync_cv_ is signalled when any sync is
+    // finished.
+    // Note: there does not seem to be a reason to wait for parallel sync at
+    // this early step, but it is not important since parallel sync (SyncWAL)
+    // and need_log_sync are usually not used together.
+ while (logs_.front().getting_synced) {
+ log_sync_cv_.Wait();
+ }
+ for (auto& log : logs_) {
+ assert(!log.getting_synced);
+      // This is just to prevent the logs from being synced by a parallel
+      // SyncWAL call. We will do the actual syncing later, after we write to
+      // the WAL.
+      // Note: there does not seem to be a reason to set this early, before we
+      // actually write to the WAL
+ log.getting_synced = true;
+ }
+ } else {
+ *need_log_sync = false;
+ }
+
+ return status;
+}
+
+WriteBatch* DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group,
+ WriteBatch* tmp_batch, size_t* write_with_wal,
+ WriteBatch** to_be_cached_state) {
+ assert(write_with_wal != nullptr);
+ assert(tmp_batch != nullptr);
+ assert(*to_be_cached_state == nullptr);
+ WriteBatch* merged_batch = nullptr;
+ *write_with_wal = 0;
+ auto* leader = write_group.leader;
+ assert(!leader->disable_wal); // Same holds for all in the batch group
+ if (write_group.size == 1 && !leader->CallbackFailed() &&
+ leader->batch->GetWalTerminationPoint().is_cleared()) {
+    // We simply write the first WriteBatch to the WAL if the group contains
+    // only one batch, that batch should be written to the WAL, and the batch
+    // is not to be truncated
+ merged_batch = leader->batch;
+ if (WriteBatchInternal::IsLatestPersistentState(merged_batch)) {
+ *to_be_cached_state = merged_batch;
+ }
+ *write_with_wal = 1;
+ } else {
+ // WAL needs all of the batches flattened into a single batch.
+ // We could avoid copying here with an iov-like AddRecord
+ // interface
+ merged_batch = tmp_batch;
+ for (auto writer : write_group) {
+ if (!writer->CallbackFailed()) {
+ WriteBatchInternal::Append(merged_batch, writer->batch,
+ /*WAL_only*/ true);
+ if (WriteBatchInternal::IsLatestPersistentState(writer->batch)) {
+        // We only need to cache the last such write batch
+ *to_be_cached_state = writer->batch;
+ }
+ (*write_with_wal)++;
+ }
+ }
+ }
+ return merged_batch;
+}
+
+// When two_write_queues_ is disabled, this function is called from the only
+// write thread. Otherwise this must be called holding log_write_mutex_.
+Status DBImpl::WriteToWAL(const WriteBatch& merged_batch,
+ log::Writer* log_writer, uint64_t* log_used,
+ uint64_t* log_size) {
+ assert(log_size != nullptr);
+ Slice log_entry = WriteBatchInternal::Contents(&merged_batch);
+ *log_size = log_entry.size();
+  // When two_write_queues_ is enabled, WriteToWAL has to be protected from
+  // concurrent calls from the two queues anyway, and log_write_mutex_ is
+  // already held. Otherwise, if manual_wal_flush_ is enabled, we need to
+  // protect log_writer->AddRecord from possible concurrent calls via FlushWAL
+  // by the application.
+ const bool needs_locking = manual_wal_flush_ && !two_write_queues_;
+  // Due to performance concerns about missed branch prediction, penalize the
+  // new manual_wal_flush_ feature (by UNLIKELY) instead of the more common
+  // case when we do not need any locking.
+ if (UNLIKELY(needs_locking)) {
+ log_write_mutex_.Lock();
+ }
+ Status status = log_writer->AddRecord(log_entry);
+ if (UNLIKELY(needs_locking)) {
+ log_write_mutex_.Unlock();
+ }
+ if (log_used != nullptr) {
+ *log_used = logfile_number_;
+ }
+ total_log_size_ += log_entry.size();
+ // TODO(myabandeh): it might be unsafe to access alive_log_files_.back() here
+ // since alive_log_files_ might be modified concurrently
+ alive_log_files_.back().AddSize(log_entry.size());
+ log_empty_ = false;
+ return status;
+}
+
+Status DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
+ log::Writer* log_writer, uint64_t* log_used,
+ bool need_log_sync, bool need_log_dir_sync,
+ SequenceNumber sequence) {
+ Status status;
+
+ assert(!write_group.leader->disable_wal);
+ // Same holds for all in the batch group
+ size_t write_with_wal = 0;
+ WriteBatch* to_be_cached_state = nullptr;
+ WriteBatch* merged_batch = MergeBatch(write_group, &tmp_batch_,
+ &write_with_wal, &to_be_cached_state);
+ if (merged_batch == write_group.leader->batch) {
+ write_group.leader->log_used = logfile_number_;
+ } else if (write_with_wal > 1) {
+ for (auto writer : write_group) {
+ writer->log_used = logfile_number_;
+ }
+ }
+
+ WriteBatchInternal::SetSequence(merged_batch, sequence);
+
+ uint64_t log_size;
+ status = WriteToWAL(*merged_batch, log_writer, log_used, &log_size);
+ if (to_be_cached_state) {
+ cached_recoverable_state_ = *to_be_cached_state;
+ cached_recoverable_state_empty_ = false;
+ }
+
+ if (status.ok() && need_log_sync) {
+ StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS);
+ // It's safe to access logs_ with unlocked mutex_ here because:
+ // - we've set getting_synced=true for all logs,
+ // so other threads won't pop from logs_ while we're here,
+ // - only writer thread can push to logs_, and we're in
+ // writer thread, so no one will push to logs_,
+ // - as long as other threads don't modify it, it's safe to read
+ // from std::deque from multiple threads concurrently.
+ for (auto& log : logs_) {
+ status = log.writer->file()->Sync(immutable_db_options_.use_fsync);
+ if (!status.ok()) {
+ break;
+ }
+ }
+ if (status.ok() && need_log_dir_sync) {
+ // We only sync WAL directory the first time WAL syncing is
+ // requested, so that in case users never turn on WAL sync,
+ // we can avoid the disk I/O in the write code path.
+ status = directories_.GetWalDir()->Fsync();
+ }
+ }
+
+ if (merged_batch == &tmp_batch_) {
+ tmp_batch_.Clear();
+ }
+ if (status.ok()) {
+ auto stats = default_cf_internal_stats_;
+ if (need_log_sync) {
+ stats->AddDBStats(InternalStats::kIntStatsWalFileSynced, 1);
+ RecordTick(stats_, WAL_FILE_SYNCED);
+ }
+ stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size);
+ RecordTick(stats_, WAL_FILE_BYTES, log_size);
+ stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal);
+ RecordTick(stats_, WRITE_WITH_WAL, write_with_wal);
+ }
+ return status;
+}
+
+Status DBImpl::ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group,
+ uint64_t* log_used,
+ SequenceNumber* last_sequence,
+ size_t seq_inc) {
+ Status status;
+
+ assert(!write_group.leader->disable_wal);
+ // Same holds for all in the batch group
+ WriteBatch tmp_batch;
+ size_t write_with_wal = 0;
+ WriteBatch* to_be_cached_state = nullptr;
+ WriteBatch* merged_batch =
+ MergeBatch(write_group, &tmp_batch, &write_with_wal, &to_be_cached_state);
+
+  // We need to lock log_write_mutex_ since logs_ and alive_log_files_ might
+  // be pushed back concurrently
+ log_write_mutex_.Lock();
+ if (merged_batch == write_group.leader->batch) {
+ write_group.leader->log_used = logfile_number_;
+ } else if (write_with_wal > 1) {
+ for (auto writer : write_group) {
+ writer->log_used = logfile_number_;
+ }
+ }
+ *last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
+ auto sequence = *last_sequence + 1;
+ WriteBatchInternal::SetSequence(merged_batch, sequence);
+
+ log::Writer* log_writer = logs_.back().writer;
+ uint64_t log_size;
+ status = WriteToWAL(*merged_batch, log_writer, log_used, &log_size);
+ if (to_be_cached_state) {
+ cached_recoverable_state_ = *to_be_cached_state;
+ cached_recoverable_state_empty_ = false;
+ }
+ log_write_mutex_.Unlock();
+
+ if (status.ok()) {
+ const bool concurrent = true;
+ auto stats = default_cf_internal_stats_;
+ stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size,
+ concurrent);
+ RecordTick(stats_, WAL_FILE_BYTES, log_size);
+ stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal,
+ concurrent);
+ RecordTick(stats_, WRITE_WITH_WAL, write_with_wal);
+ }
+ return status;
+}
+
+Status DBImpl::WriteRecoverableState() {
+ mutex_.AssertHeld();
+ if (!cached_recoverable_state_empty_) {
+ bool dont_care_bool;
+ SequenceNumber next_seq;
+ if (two_write_queues_) {
+ log_write_mutex_.Lock();
+ }
+ SequenceNumber seq;
+ if (two_write_queues_) {
+ seq = versions_->FetchAddLastAllocatedSequence(0);
+ } else {
+ seq = versions_->LastSequence();
+ }
+ WriteBatchInternal::SetSequence(&cached_recoverable_state_, seq + 1);
+ auto status = WriteBatchInternal::InsertInto(
+ &cached_recoverable_state_, column_family_memtables_.get(),
+ &flush_scheduler_, &trim_history_scheduler_, true,
+ 0 /*recovery_log_number*/, this, false /* concurrent_memtable_writes */,
+ &next_seq, &dont_care_bool, seq_per_batch_);
+ auto last_seq = next_seq - 1;
+ if (two_write_queues_) {
+ versions_->FetchAddLastAllocatedSequence(last_seq - seq);
+ versions_->SetLastPublishedSequence(last_seq);
+ }
+ versions_->SetLastSequence(last_seq);
+ if (two_write_queues_) {
+ log_write_mutex_.Unlock();
+ }
+ if (status.ok() && recoverable_state_pre_release_callback_) {
+ const bool DISABLE_MEMTABLE = true;
+ for (uint64_t sub_batch_seq = seq + 1;
+ sub_batch_seq < next_seq && status.ok(); sub_batch_seq++) {
+ uint64_t const no_log_num = 0;
+ // Unlock it since the callback might end up locking mutex. e.g.,
+ // AddCommitted -> AdvanceMaxEvictedSeq -> GetSnapshotListFromDB
+ mutex_.Unlock();
+ status = recoverable_state_pre_release_callback_->Callback(
+ sub_batch_seq, !DISABLE_MEMTABLE, no_log_num, 0, 1);
+ mutex_.Lock();
+ }
+ }
+ if (status.ok()) {
+ cached_recoverable_state_.Clear();
+ cached_recoverable_state_empty_ = true;
+ }
+ return status;
+ }
+ return Status::OK();
+}
+
+void DBImpl::SelectColumnFamiliesForAtomicFlush(
+ autovector<ColumnFamilyData*>* cfds) {
+ for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
+ !cached_recoverable_state_empty_.load()) {
+ cfds->push_back(cfd);
+ }
+ }
+}
+
+// Assign sequence number for atomic flush.
+void DBImpl::AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds) {
+ assert(immutable_db_options_.atomic_flush);
+ auto seq = versions_->LastSequence();
+ for (auto cfd : cfds) {
+ cfd->imm()->AssignAtomicFlushSeq(seq);
+ }
+}
+
+Status DBImpl::SwitchWAL(WriteContext* write_context) {
+ mutex_.AssertHeld();
+ assert(write_context != nullptr);
+ Status status;
+
+ if (alive_log_files_.begin()->getting_flushed) {
+ return status;
+ }
+
+ auto oldest_alive_log = alive_log_files_.begin()->number;
+ bool flush_wont_release_oldest_log = false;
+ if (allow_2pc()) {
+ auto oldest_log_with_uncommitted_prep =
+ logs_with_prep_tracker_.FindMinLogContainingOutstandingPrep();
+
+ assert(oldest_log_with_uncommitted_prep == 0 ||
+ oldest_log_with_uncommitted_prep >= oldest_alive_log);
+ if (oldest_log_with_uncommitted_prep > 0 &&
+ oldest_log_with_uncommitted_prep == oldest_alive_log) {
+ if (unable_to_release_oldest_log_) {
+ // we already attempted to flush all column families dependent on
+ // the oldest alive log but the log still contained uncommitted
+ // transactions so there is still nothing that we can do.
+ return status;
+ } else {
+ ROCKS_LOG_WARN(
+ immutable_db_options_.info_log,
+ "Unable to release oldest log due to uncommitted transaction");
+ unable_to_release_oldest_log_ = true;
+ flush_wont_release_oldest_log = true;
+ }
+ }
+ }
+ if (!flush_wont_release_oldest_log) {
+ // we only mark this log as getting flushed if we have successfully
+ // flushed all data in this log. If this log contains outstanding prepared
+ // transactions then we cannot flush this log until those transactions are
+    // committed.
+ unable_to_release_oldest_log_ = false;
+ alive_log_files_.begin()->getting_flushed = true;
+ }
+
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "Flushing all column families with data in WAL number %" PRIu64
+ ". Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64,
+ oldest_alive_log, total_log_size_.load(), GetMaxTotalWalSize());
+  // No need to refcount because the drop happens in the write thread, so it
+  // can't happen while we're in the write thread
+ autovector<ColumnFamilyData*> cfds;
+ if (immutable_db_options_.atomic_flush) {
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ } else {
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ if (cfd->OldestLogToKeep() <= oldest_alive_log) {
+ cfds.push_back(cfd);
+ }
+ }
+ MaybeFlushStatsCF(&cfds);
+ }
+ WriteThread::Writer nonmem_w;
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+
+ for (const auto cfd : cfds) {
+ cfd->Ref();
+ status = SwitchMemtable(cfd, write_context);
+ cfd->UnrefAndTryDelete();
+ if (!status.ok()) {
+ break;
+ }
+ }
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+
+ if (status.ok()) {
+ if (immutable_db_options_.atomic_flush) {
+ AssignAtomicFlushSeq(cfds);
+ }
+ for (auto cfd : cfds) {
+ cfd->imm()->FlushRequested();
+ }
+ FlushRequest flush_req;
+ GenerateFlushRequest(cfds, &flush_req);
+ SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager);
+ MaybeScheduleFlushOrCompaction();
+ }
+ return status;
+}
+
+Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) {
+ mutex_.AssertHeld();
+ assert(write_context != nullptr);
+ Status status;
+
+  // Before a new memtable is added in SwitchMemtable(),
+  // write_buffer_manager_->ShouldFlush() will keep returning true. If another
+  // thread is writing to another DB with the same write buffer, that DB may
+  // also be flushed. We may end up flushing many more DBs than needed. It's
+  // suboptimal but still correct.
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "Flushing column family with oldest memtable entry. Write buffer is "
+ "using %" ROCKSDB_PRIszt " bytes out of a total of %" ROCKSDB_PRIszt ".",
+ write_buffer_manager_->memory_usage(),
+ write_buffer_manager_->buffer_size());
+  // No need to refcount because a column family drop happens in the write
+  // thread, so it can't happen while we're in the write thread.
+ autovector<ColumnFamilyData*> cfds;
+ if (immutable_db_options_.atomic_flush) {
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ } else {
+ ColumnFamilyData* cfd_picked = nullptr;
+ SequenceNumber seq_num_for_cf_picked = kMaxSequenceNumber;
+
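+    // Pick the column family whose active memtable has the smallest creation
+    // sequence number, i.e. the one that has been holding data the longest.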
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ if (!cfd->mem()->IsEmpty()) {
+        // We only consider the active memtable, hoping the immutable
+        // memtables are already in the process of flushing.
+ uint64_t seq = cfd->mem()->GetCreationSeq();
+ if (cfd_picked == nullptr || seq < seq_num_for_cf_picked) {
+ cfd_picked = cfd;
+ seq_num_for_cf_picked = seq;
+ }
+ }
+ }
+ if (cfd_picked != nullptr) {
+ cfds.push_back(cfd_picked);
+ }
+ MaybeFlushStatsCF(&cfds);
+ }
+
+ WriteThread::Writer nonmem_w;
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+ for (const auto cfd : cfds) {
+ if (cfd->mem()->IsEmpty()) {
+ continue;
+ }
+ cfd->Ref();
+ status = SwitchMemtable(cfd, write_context);
+ cfd->UnrefAndTryDelete();
+ if (!status.ok()) {
+ break;
+ }
+ }
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+
+ if (status.ok()) {
+ if (immutable_db_options_.atomic_flush) {
+ AssignAtomicFlushSeq(cfds);
+ }
+ for (const auto cfd : cfds) {
+ cfd->imm()->FlushRequested();
+ }
+ FlushRequest flush_req;
+ GenerateFlushRequest(cfds, &flush_req);
+ SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull);
+ MaybeScheduleFlushOrCompaction();
+ }
+ return status;
+}
+
+uint64_t DBImpl::GetMaxTotalWalSize() const {
+ mutex_.AssertHeld();
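+  // If max_total_wal_size is unset (0), default to four times the aggregate
+  // memtable budget across all column families.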
+ return mutable_db_options_.max_total_wal_size == 0
+ ? 4 * max_total_in_memory_state_
+ : mutable_db_options_.max_total_wal_size;
+}
+
+// REQUIRES: mutex_ is held
+// REQUIRES: this thread is currently at the front of the writer queue
+Status DBImpl::DelayWrite(uint64_t num_bytes,
+ const WriteOptions& write_options) {
+ uint64_t time_delayed = 0;
+ bool delayed = false;
+ {
+ StopWatch sw(env_, stats_, WRITE_STALL, &time_delayed);
+ uint64_t delay = write_controller_.GetDelay(env_, num_bytes);
+ if (delay > 0) {
+ if (write_options.no_slowdown) {
+ return Status::Incomplete("Write stall");
+ }
+ TEST_SYNC_POINT("DBImpl::DelayWrite:Sleep");
+
+      // Notify write_thread_ about the stall so it can set up a barrier and
+      // fail any pending writers with no_slowdown
+ write_thread_.BeginWriteStall();
+ TEST_SYNC_POINT("DBImpl::DelayWrite:BeginWriteStallDone");
+ mutex_.Unlock();
+      // We will delay the write until we have slept for `delay` microseconds
+      // or we no longer need a delay.
+ const uint64_t kDelayInterval = 1000;
+ uint64_t stall_end = sw.start_time() + delay;
+ while (write_controller_.NeedsDelay()) {
+ if (env_->NowMicros() >= stall_end) {
+ // We already delayed this write `delay` microseconds
+ break;
+ }
+
+ delayed = true;
+ // Sleep for 0.001 seconds
+ env_->SleepForMicroseconds(kDelayInterval);
+ }
+ mutex_.Lock();
+ write_thread_.EndWriteStall();
+ }
+
+    // Don't wait if there's a background error, even if it's a soft error. We
+    // might wait here indefinitely, as the background compaction may never
+    // finish successfully, resulting in the stall condition lasting
+    // indefinitely.
+ while (error_handler_.GetBGError().ok() && write_controller_.IsStopped()) {
+ if (write_options.no_slowdown) {
+ return Status::Incomplete("Write stall");
+ }
+ delayed = true;
+
+      // Notify write_thread_ about the stall so it can set up a barrier and
+      // fail any pending writers with no_slowdown
+ write_thread_.BeginWriteStall();
+ TEST_SYNC_POINT("DBImpl::DelayWrite:Wait");
+ bg_cv_.Wait();
+ write_thread_.EndWriteStall();
+ }
+ }
+ assert(!delayed || !write_options.no_slowdown);
+ if (delayed) {
+ default_cf_internal_stats_->AddDBStats(
+ InternalStats::kIntStatsWriteStallMicros, time_delayed);
+ RecordTick(stats_, STALL_MICROS, time_delayed);
+ }
+
+ // If DB is not in read-only mode and write_controller is not stopping
+ // writes, we can ignore any background errors and allow the write to
+ // proceed
+ Status s;
+ if (write_controller_.IsStopped()) {
+ // If writes are still stopped, it means we bailed due to a background
+ // error
+ s = Status::Incomplete(error_handler_.GetBGError().ToString());
+ }
+ if (error_handler_.IsDBStopped()) {
+ s = error_handler_.GetBGError();
+ }
+ return s;
+}
+
+Status DBImpl::ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options,
+ WriteBatch* my_batch) {
+ assert(write_options.low_pri);
+ // This is called outside the DB mutex. Although it is safe to make the call,
+ // the consistency condition is not guaranteed to hold. It's OK to live with
+ // it in this case.
+  // If we need to speed up compaction, it means compaction has fallen behind,
+  // so we start rate-limiting low-priority writes.
+ if (write_controller_.NeedSpeedupCompaction()) {
+ if (allow_2pc() && (my_batch->HasCommit() || my_batch->HasRollback())) {
+ // For 2PC, we only rate limit prepare, not commit.
+ return Status::OK();
+ }
+ if (write_options.no_slowdown) {
+ return Status::Incomplete("Low priority write stall");
+ } else {
+ assert(my_batch != nullptr);
+      // Rate-limit these writes. We don't block them completely because, if
+      // the write load is heavy, low-priority writes might never get a chance
+      // to run; this way they are guaranteed to still make slow progress.
+ PERF_TIMER_GUARD(write_delay_time);
+ write_controller_.low_pri_rate_limiter()->Request(
+ my_batch->GetDataSize(), Env::IO_HIGH, nullptr /* stats */,
+ RateLimiter::OpType::kWrite);
+ }
+ }
+ return Status::OK();
+}
+
+void DBImpl::MaybeFlushStatsCF(autovector<ColumnFamilyData*>* cfds) {
+ assert(cfds != nullptr);
+ if (!cfds->empty() && immutable_db_options_.persist_stats_to_disk) {
+ ColumnFamilyData* cfd_stats =
+ versions_->GetColumnFamilySet()->GetColumnFamily(
+ kPersistentStatsColumnFamilyName);
+ if (cfd_stats != nullptr && !cfd_stats->mem()->IsEmpty()) {
+ for (ColumnFamilyData* cfd : *cfds) {
+ if (cfd == cfd_stats) {
+ // stats CF already included in cfds
+ return;
+ }
+ }
+      // Force flush the stats CF when its log number is less than all other
+      // CFs' log numbers.
+ bool force_flush_stats_cf = true;
+ for (auto* loop_cfd : *versions_->GetColumnFamilySet()) {
+ if (loop_cfd == cfd_stats) {
+ continue;
+ }
+ if (loop_cfd->GetLogNumber() <= cfd_stats->GetLogNumber()) {
+ force_flush_stats_cf = false;
+ }
+ }
+ if (force_flush_stats_cf) {
+ cfds->push_back(cfd_stats);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Force flushing stats CF with automated flush "
+ "to avoid holding old logs");
+ }
+ }
+ }
+}
+
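+// Frees flushed memtables that are retained in history for the column
+// families queued by trim_history_scheduler_, and installs a new SuperVersion
+// whenever any memtable is actually deleted.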
+Status DBImpl::TrimMemtableHistory(WriteContext* context) {
+ autovector<ColumnFamilyData*> cfds;
+ ColumnFamilyData* tmp_cfd;
+ while ((tmp_cfd = trim_history_scheduler_.TakeNextColumnFamily()) !=
+ nullptr) {
+ cfds.push_back(tmp_cfd);
+ }
+ for (auto& cfd : cfds) {
+ autovector<MemTable*> to_delete;
+ cfd->imm()->TrimHistory(&to_delete, cfd->mem()->ApproximateMemoryUsage());
+ if (!to_delete.empty()) {
+ for (auto m : to_delete) {
+ delete m;
+ }
+ context->superversion_context.NewSuperVersion();
+ assert(context->superversion_context.new_superversion.get() != nullptr);
+ cfd->InstallSuperVersion(&context->superversion_context, &mutex_);
+ }
+
+ if (cfd->UnrefAndTryDelete()) {
+ cfd = nullptr;
+ }
+ }
+ return Status::OK();
+}
+
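+// Switches memtables for the column families picked up by the flush scheduler
+// (or for all eligible column families when atomic_flush is set) and schedules
+// the corresponding flush requests.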
+Status DBImpl::ScheduleFlushes(WriteContext* context) {
+ autovector<ColumnFamilyData*> cfds;
+ if (immutable_db_options_.atomic_flush) {
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ for (auto cfd : cfds) {
+ cfd->Ref();
+ }
+ flush_scheduler_.Clear();
+ } else {
+ ColumnFamilyData* tmp_cfd;
+ while ((tmp_cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) {
+ cfds.push_back(tmp_cfd);
+ }
+ MaybeFlushStatsCF(&cfds);
+ }
+ Status status;
+ WriteThread::Writer nonmem_w;
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+
+ for (auto& cfd : cfds) {
+ if (!cfd->mem()->IsEmpty()) {
+ status = SwitchMemtable(cfd, context);
+ }
+ if (cfd->UnrefAndTryDelete()) {
+ cfd = nullptr;
+ }
+ if (!status.ok()) {
+ break;
+ }
+ }
+
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+
+ if (status.ok()) {
+ if (immutable_db_options_.atomic_flush) {
+ AssignAtomicFlushSeq(cfds);
+ }
+ FlushRequest flush_req;
+ GenerateFlushRequest(cfds, &flush_req);
+ SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull);
+ MaybeScheduleFlushOrCompaction();
+ }
+ return status;
+}
+
+#ifndef ROCKSDB_LITE
+void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* /*cfd*/,
+ const MemTableInfo& mem_table_info) {
+ if (immutable_db_options_.listeners.size() == 0U) {
+ return;
+ }
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return;
+ }
+
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnMemTableSealed(mem_table_info);
+ }
+}
+#endif // ROCKSDB_LITE
+
+// REQUIRES: mutex_ is held
+// REQUIRES: this thread is currently at the front of the writer queue
+// REQUIRES: this thread is currently at the front of the 2nd writer queue if
+// two_write_queues_ is true (This is to simplify the reasoning.)
+Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
+ mutex_.AssertHeld();
+ WriteThread::Writer nonmem_w;
+ std::unique_ptr<WritableFile> lfile;
+ log::Writer* new_log = nullptr;
+ MemTable* new_mem = nullptr;
+
+ // Recoverable state is persisted in WAL. After memtable switch, WAL might
+ // be deleted, so we write the state to memtable to be persisted as well.
+ Status s = WriteRecoverableState();
+ if (!s.ok()) {
+ return s;
+ }
+
+  // Attempt to switch to a new memtable and trigger a flush of the old one.
+  // Do this without holding the DB mutex.
+ assert(versions_->prev_log_number() == 0);
+ if (two_write_queues_) {
+ log_write_mutex_.Lock();
+ }
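+  // Only create a new WAL if the current one already contains data; an empty
+  // WAL can simply be reused.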
+ bool creating_new_log = !log_empty_;
+ if (two_write_queues_) {
+ log_write_mutex_.Unlock();
+ }
+ uint64_t recycle_log_number = 0;
+ if (creating_new_log && immutable_db_options_.recycle_log_file_num &&
+ !log_recycle_files_.empty()) {
+ recycle_log_number = log_recycle_files_.front();
+ }
+ uint64_t new_log_number =
+ creating_new_log ? versions_->NewFileNumber() : logfile_number_;
+ const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+
+ // Set memtable_info for memtable sealed callback
+#ifndef ROCKSDB_LITE
+ MemTableInfo memtable_info;
+ memtable_info.cf_name = cfd->GetName();
+ memtable_info.first_seqno = cfd->mem()->GetFirstSequenceNumber();
+ memtable_info.earliest_seqno = cfd->mem()->GetEarliestSequenceNumber();
+ memtable_info.num_entries = cfd->mem()->num_entries();
+ memtable_info.num_deletes = cfd->mem()->num_deletes();
+#endif // ROCKSDB_LITE
+ // Log this later after lock release. It may be outdated, e.g., if background
+ // flush happens before logging, but that should be ok.
+ int num_imm_unflushed = cfd->imm()->NumNotFlushed();
+ const auto preallocate_block_size =
+ GetWalPreallocateBlockSize(mutable_cf_options.write_buffer_size);
+ mutex_.Unlock();
+ if (creating_new_log) {
+ // TODO: Write buffer size passed in should be max of all CF's instead
+ // of mutable_cf_options.write_buffer_size.
+ s = CreateWAL(new_log_number, recycle_log_number, preallocate_block_size,
+ &new_log);
+ }
+ if (s.ok()) {
+ SequenceNumber seq = versions_->LastSequence();
+ new_mem = cfd->ConstructNewMemtable(mutable_cf_options, seq);
+ context->superversion_context.NewSuperVersion();
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[%s] New memtable created with log file: #%" PRIu64
+ ". Immutable memtables: %d.\n",
+ cfd->GetName().c_str(), new_log_number, num_imm_unflushed);
+ mutex_.Lock();
+ if (recycle_log_number != 0) {
+ // Since renaming the file is done outside DB mutex, we need to ensure
+ // concurrent full purges don't delete the file while we're recycling it.
+ // To achieve that we hold the old log number in the recyclable list until
+ // after it has been renamed.
+ assert(log_recycle_files_.front() == recycle_log_number);
+ log_recycle_files_.pop_front();
+ }
+ if (s.ok() && creating_new_log) {
+ log_write_mutex_.Lock();
+ assert(new_log != nullptr);
+ if (!logs_.empty()) {
+      // Always flush the buffer of the last log before switching to a new one
+ log::Writer* cur_log_writer = logs_.back().writer;
+ s = cur_log_writer->WriteBuffer();
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "[%s] Failed to switch from #%" PRIu64 " to #%" PRIu64
+ " WAL file\n",
+ cfd->GetName().c_str(), cur_log_writer->get_log_number(),
+ new_log_number);
+ }
+ }
+ if (s.ok()) {
+ logfile_number_ = new_log_number;
+ log_empty_ = true;
+ log_dir_synced_ = false;
+ logs_.emplace_back(logfile_number_, new_log);
+ alive_log_files_.push_back(LogFileNumberSize(logfile_number_));
+ }
+ log_write_mutex_.Unlock();
+ }
+
+ if (!s.ok()) {
+    // How could we fail if we're not creating a new log?
+ assert(creating_new_log);
+ if (new_mem) {
+ delete new_mem;
+ }
+ if (new_log) {
+ delete new_log;
+ }
+ SuperVersion* new_superversion =
+ context->superversion_context.new_superversion.release();
+ if (new_superversion != nullptr) {
+ delete new_superversion;
+ }
+ // We may have lost data from the WritableFileBuffer in-memory buffer for
+ // the current log, so treat it as a fatal error and set bg_error
+ error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable);
+ // Read back bg_error in order to get the right severity
+ s = error_handler_.GetBGError();
+ return s;
+ }
+
+ for (auto loop_cfd : *versions_->GetColumnFamilySet()) {
+    // All this is just an optimization to delete logs that are no longer
+    // needed: if a CF is empty, it doesn't need that particular log to stay
+    // alive, so we just advance its log number. No need to persist this in
+    // the manifest.
+ if (loop_cfd->mem()->GetFirstSequenceNumber() == 0 &&
+ loop_cfd->imm()->NumNotFlushed() == 0) {
+ if (creating_new_log) {
+ loop_cfd->SetLogNumber(logfile_number_);
+ }
+ loop_cfd->mem()->SetCreationSeq(versions_->LastSequence());
+ }
+ }
+
+ cfd->mem()->SetNextLogNumber(logfile_number_);
+ cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_);
+ new_mem->Ref();
+ cfd->SetMemtable(new_mem);
+ InstallSuperVersionAndScheduleWork(cfd, &context->superversion_context,
+ mutable_cf_options);
+#ifndef ROCKSDB_LITE
+ mutex_.Unlock();
+ // Notify client that memtable is sealed, now that we have successfully
+ // installed a new memtable
+ NotifyOnMemTableSealed(cfd, memtable_info);
+ mutex_.Lock();
+#endif // ROCKSDB_LITE
+ return s;
+}
+
+size_t DBImpl::GetWalPreallocateBlockSize(uint64_t write_buffer_size) const {
+ mutex_.AssertHeld();
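+  // Start with roughly 1.1x the write buffer size, then cap it with the WAL
+  // and database-wide write buffer limits checked below.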
+ size_t bsize =
+ static_cast<size_t>(write_buffer_size / 10 + write_buffer_size);
+ // Some users might set very high write_buffer_size and rely on
+ // max_total_wal_size or other parameters to control the WAL size.
+ if (mutable_db_options_.max_total_wal_size > 0) {
+ bsize = std::min<size_t>(
+ bsize, static_cast<size_t>(mutable_db_options_.max_total_wal_size));
+ }
+ if (immutable_db_options_.db_write_buffer_size > 0) {
+ bsize = std::min<size_t>(bsize, immutable_db_options_.db_write_buffer_size);
+ }
+ if (immutable_db_options_.write_buffer_manager &&
+ immutable_db_options_.write_buffer_manager->enabled()) {
+ bsize = std::min<size_t>(
+ bsize, immutable_db_options_.write_buffer_manager->buffer_size());
+ }
+
+ return bsize;
+}
+
+// Default implementations of convenience methods that subclasses of DB
+// can call if they wish
+Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value) {
+ if (nullptr == opt.timestamp) {
+    // Pre-allocate the size of the write batch conservatively.
+    // 8 bytes are taken by the header, 4 bytes by the count, 1 byte by the
+    // type, and we allocate 11 extra bytes for the key length as well as the
+    // value length.
+ WriteBatch batch(key.size() + value.size() + 24);
+ Status s = batch.Put(column_family, key, value);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+ }
+ const Slice* ts = opt.timestamp;
+ assert(nullptr != ts);
+ size_t ts_sz = ts->size();
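+  // Size the batch to also hold the user timestamp alongside the key and
+  // value.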
+ WriteBatch batch(key.size() + ts_sz + value.size() + 24, /*max_bytes=*/0,
+ ts_sz);
+ Status s = batch.Put(column_family, key, value);
+ if (!s.ok()) {
+ return s;
+ }
+ s = batch.AssignTimestamp(*ts);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+}
+
+Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+ const Slice& key) {
+ WriteBatch batch;
+ batch.Delete(column_family, key);
+ return Write(opt, &batch);
+}
+
+Status DB::SingleDelete(const WriteOptions& opt,
+ ColumnFamilyHandle* column_family, const Slice& key) {
+ WriteBatch batch;
+ batch.SingleDelete(column_family, key);
+ return Write(opt, &batch);
+}
+
+Status DB::DeleteRange(const WriteOptions& opt,
+ ColumnFamilyHandle* column_family,
+ const Slice& begin_key, const Slice& end_key) {
+ WriteBatch batch;
+ batch.DeleteRange(column_family, begin_key, end_key);
+ return Write(opt, &batch);
+}
+
+Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value) {
+ WriteBatch batch;
+ Status s = batch.Merge(column_family, key, value);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_secondary_test.cc b/src/rocksdb/db/db_impl/db_secondary_test.cc
new file mode 100644
index 000000000..0b34181de
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_secondary_test.cc
@@ -0,0 +1,869 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl/db_impl_secondary.h"
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "test_util/fault_injection_test_env.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
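+// Test fixture that, in addition to the primary DB opened by DBTestBase,
+// manages a read-only secondary instance opened in its own directory.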
+class DBSecondaryTest : public DBTestBase {
+ public:
+ DBSecondaryTest()
+ : DBTestBase("/db_secondary_test"),
+ secondary_path_(),
+ handles_secondary_(),
+ db_secondary_(nullptr) {
+ secondary_path_ =
+ test::PerThreadDBPath(env_, "/db_secondary_test_secondary");
+ }
+
+ ~DBSecondaryTest() override {
+ CloseSecondary();
+ if (getenv("KEEP_DB") != nullptr) {
+ fprintf(stdout, "Secondary DB is still at %s\n", secondary_path_.c_str());
+ } else {
+ Options options;
+ options.env = env_;
+ EXPECT_OK(DestroyDB(secondary_path_, options));
+ }
+ }
+
+ protected:
+ Status ReopenAsSecondary(const Options& options) {
+ return DB::OpenAsSecondary(options, dbname_, secondary_path_, &db_);
+ }
+
+ void OpenSecondary(const Options& options);
+
+ void OpenSecondaryWithColumnFamilies(
+ const std::vector<std::string>& column_families, const Options& options);
+
+ void CloseSecondary() {
+ for (auto h : handles_secondary_) {
+ db_secondary_->DestroyColumnFamilyHandle(h);
+ }
+ handles_secondary_.clear();
+ delete db_secondary_;
+ db_secondary_ = nullptr;
+ }
+
+ DBImplSecondary* db_secondary_full() {
+ return static_cast<DBImplSecondary*>(db_secondary_);
+ }
+
+ void CheckFileTypeCounts(const std::string& dir, int expected_log,
+ int expected_sst, int expected_manifest) const;
+
+ std::string secondary_path_;
+ std::vector<ColumnFamilyHandle*> handles_secondary_;
+ DB* db_secondary_;
+};
+
+void DBSecondaryTest::OpenSecondary(const Options& options) {
+ Status s =
+ DB::OpenAsSecondary(options, dbname_, secondary_path_, &db_secondary_);
+ ASSERT_OK(s);
+}
+
+void DBSecondaryTest::OpenSecondaryWithColumnFamilies(
+ const std::vector<std::string>& column_families, const Options& options) {
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+ for (const auto& cf_name : column_families) {
+ cf_descs.emplace_back(cf_name, options);
+ }
+ Status s = DB::OpenAsSecondary(options, dbname_, secondary_path_, cf_descs,
+ &handles_secondary_, &db_secondary_);
+ ASSERT_OK(s);
+}
+
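+// Counts WAL, SST and MANIFEST files under `dir` and checks the counts
+// against the expected values.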
+void DBSecondaryTest::CheckFileTypeCounts(const std::string& dir,
+ int expected_log, int expected_sst,
+ int expected_manifest) const {
+ std::vector<std::string> filenames;
+ env_->GetChildren(dir, &filenames);
+
+ int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0;
+ for (auto file : filenames) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(file, &number, &type)) {
+ log_cnt += (type == kLogFile);
+ sst_cnt += (type == kTableFile);
+ manifest_cnt += (type == kDescriptorFile);
+ }
+ }
+ ASSERT_EQ(expected_log, log_cnt);
+ ASSERT_EQ(expected_sst, sst_cnt);
+ ASSERT_EQ(expected_manifest, manifest_cnt);
+}
+
+TEST_F(DBSecondaryTest, ReopenAsSecondary) {
+ Options options;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_OK(Put("foo", "foo_value"));
+ ASSERT_OK(Put("bar", "bar_value"));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ Close();
+
+ ASSERT_OK(ReopenAsSecondary(options));
+ ASSERT_EQ("foo_value", Get("foo"));
+ ASSERT_EQ("bar_value", Get("bar"));
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ auto db1 = static_cast<DBImplSecondary*>(db_);
+ ASSERT_NE(nullptr, db1);
+ Iterator* iter = db1->NewIterator(ropts);
+ ASSERT_NE(nullptr, iter);
+ size_t count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ if (0 == count) {
+ ASSERT_EQ("bar", iter->key().ToString());
+ ASSERT_EQ("bar_value", iter->value().ToString());
+ } else if (1 == count) {
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ("foo_value", iter->value().ToString());
+ }
+ ++count;
+ }
+ delete iter;
+ ASSERT_EQ(2, count);
+}
+
+TEST_F(DBSecondaryTest, OpenAsSecondary) {
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ Reopen(options);
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+ ASSERT_OK(Flush());
+ }
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ const auto verify_db_func = [&](const std::string& foo_val,
+ const std::string& bar_val) {
+ std::string value;
+ ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_EQ(foo_val, value);
+ ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
+ ASSERT_EQ(bar_val, value);
+ Iterator* iter = db_secondary_->NewIterator(ropts);
+ ASSERT_NE(nullptr, iter);
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ(foo_val, iter->value().ToString());
+ iter->Seek("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar", iter->key().ToString());
+ ASSERT_EQ(bar_val, iter->value().ToString());
+ size_t count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ++count;
+ }
+ ASSERT_EQ(2, count);
+ delete iter;
+ };
+
+ verify_db_func("foo_value2", "bar_value2");
+
+ ASSERT_OK(Put("foo", "new_foo_value"));
+ ASSERT_OK(Put("bar", "new_bar_value"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db_func("new_foo_value", "new_bar_value");
+}
+
+namespace {
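+// Env wrapper that counts how many of the random-access files it opened have
+// been closed; used below to verify that the secondary closes table files
+// that become obsolete after the primary compacts.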
+class TraceFileEnv : public EnvWrapper {
+ public:
+ explicit TraceFileEnv(Env* _target) : EnvWrapper(_target) {}
+ Status NewRandomAccessFile(const std::string& f,
+ std::unique_ptr<RandomAccessFile>* r,
+ const EnvOptions& env_options) override {
+ class TracedRandomAccessFile : public RandomAccessFile {
+ public:
+ TracedRandomAccessFile(std::unique_ptr<RandomAccessFile>&& target,
+ std::atomic<int>& counter)
+ : target_(std::move(target)), files_closed_(counter) {}
+ ~TracedRandomAccessFile() override {
+ files_closed_.fetch_add(1, std::memory_order_relaxed);
+ }
+ Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const override {
+ return target_->Read(offset, n, result, scratch);
+ }
+
+ private:
+ std::unique_ptr<RandomAccessFile> target_;
+ std::atomic<int>& files_closed_;
+ };
+ Status s = target()->NewRandomAccessFile(f, r, env_options);
+ if (s.ok()) {
+ r->reset(new TracedRandomAccessFile(std::move(*r), files_closed_));
+ }
+ return s;
+ }
+
+ int files_closed() const {
+ return files_closed_.load(std::memory_order_relaxed);
+ }
+
+ private:
+ std::atomic<int> files_closed_{0};
+};
+} // namespace
+
+TEST_F(DBSecondaryTest, SecondaryCloseFiles) {
+ Options options;
+ options.env = env_;
+ options.max_open_files = 1;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+ Options options1;
+ std::unique_ptr<Env> traced_env(new TraceFileEnv(env_));
+ options1.env = traced_env.get();
+ OpenSecondary(options1);
+
+ static const auto verify_db = [&]() {
+ std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
+ std::unique_ptr<Iterator> iter2(db_secondary_->NewIterator(ReadOptions()));
+ for (iter1->SeekToFirst(), iter2->SeekToFirst();
+ iter1->Valid() && iter2->Valid(); iter1->Next(), iter2->Next()) {
+ ASSERT_EQ(iter1->key(), iter2->key());
+ ASSERT_EQ(iter1->value(), iter2->value());
+ }
+ ASSERT_FALSE(iter1->Valid());
+ ASSERT_FALSE(iter2->Valid());
+ };
+
+ ASSERT_OK(Put("a", "value"));
+ ASSERT_OK(Put("c", "value"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db();
+
+ ASSERT_OK(Put("b", "value"));
+ ASSERT_OK(Put("d", "value"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db();
+
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ ASSERT_EQ(2, static_cast<TraceFileEnv*>(traced_env.get())->files_closed());
+
+ Status s = db_secondary_->SetDBOptions({{"max_open_files", "-1"}});
+ ASSERT_TRUE(s.IsNotSupported());
+ CloseSecondary();
+}
+
+TEST_F(DBSecondaryTest, OpenAsSecondaryWALTailing) {
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ Reopen(options);
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+ }
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ const auto verify_db_func = [&](const std::string& foo_val,
+ const std::string& bar_val) {
+ std::string value;
+ ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_EQ(foo_val, value);
+ ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
+ ASSERT_EQ(bar_val, value);
+ Iterator* iter = db_secondary_->NewIterator(ropts);
+ ASSERT_NE(nullptr, iter);
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ(foo_val, iter->value().ToString());
+ iter->Seek("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar", iter->key().ToString());
+ ASSERT_EQ(bar_val, iter->value().ToString());
+ size_t count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ++count;
+ }
+ ASSERT_EQ(2, count);
+ delete iter;
+ };
+
+ verify_db_func("foo_value2", "bar_value2");
+
+ ASSERT_OK(Put("foo", "new_foo_value"));
+ ASSERT_OK(Put("bar", "new_bar_value"));
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db_func("new_foo_value", "new_bar_value");
+
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", "new_foo_value_1"));
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db_func("new_foo_value_1", "new_bar_value");
+}
+
+TEST_F(DBSecondaryTest, OpenWithNonExistColumnFamily) {
+ Options options;
+ options.env = env_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options1);
+ cf_descs.emplace_back("pikachu", options1);
+ cf_descs.emplace_back("eevee", options1);
+ Status s = DB::OpenAsSecondary(options1, dbname_, secondary_path_, cf_descs,
+ &handles_secondary_, &db_secondary_);
+ ASSERT_NOK(s);
+}
+
+TEST_F(DBSecondaryTest, OpenWithSubsetOfColumnFamilies) {
+ Options options;
+ options.env = env_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+ ASSERT_EQ(0, handles_secondary_.size());
+ ASSERT_NE(nullptr, db_secondary_);
+
+ ASSERT_OK(Put(0 /*cf*/, "foo", "foo_value"));
+ ASSERT_OK(Put(1 /*cf*/, "foo", "foo_value"));
+ ASSERT_OK(Flush(0 /*cf*/));
+ ASSERT_OK(Flush(1 /*cf*/));
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ std::string value;
+ ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_EQ("foo_value", value);
+}
+
+TEST_F(DBSecondaryTest, SwitchToNewManifestDuringOpen) {
+ Options options;
+ options.env = env_;
+ Reopen(options);
+ Close();
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:0",
+ "VersionSet::ProcessManifestWrites:BeforeNewManifest"},
+ {"VersionSet::ProcessManifestWrites:AfterNewManifest",
+ "ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:"
+ "1"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // Make sure db calls RecoverLogFiles so as to trigger a manifest write,
+ // which causes the db to switch to a new MANIFEST upon start.
+ port::Thread ro_db_thread([&]() {
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+ CloseSecondary();
+ });
+ Reopen(options);
+ ro_db_thread.join();
+}
+
+TEST_F(DBSecondaryTest, MissingTableFileDuringOpen) {
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ Reopen(options);
+ for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ std::string value;
+ ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_EQ("foo_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ value);
+ ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
+ ASSERT_EQ("bar_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ value);
+ Iterator* iter = db_secondary_->NewIterator(ropts);
+ ASSERT_NE(nullptr, iter);
+ iter->Seek("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar", iter->key().ToString());
+ ASSERT_EQ("bar_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ iter->value().ToString());
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ("foo_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ iter->value().ToString());
+ size_t count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ++count;
+ }
+ ASSERT_EQ(2, count);
+ delete iter;
+}
+
+TEST_F(DBSecondaryTest, MissingTableFile) {
+ int table_files_not_exist = 0;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "ReactiveVersionSet::ApplyOneVersionEditToBuilder:AfterLoadTableHandlers",
+ [&](void* arg) {
+ Status s = *reinterpret_cast<Status*>(arg);
+ if (s.IsPathNotFound()) {
+ ++table_files_not_exist;
+ } else if (!s.ok()) {
+ assert(false); // Should not reach here
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ Reopen(options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
+ for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_NE(nullptr, db_secondary_full());
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ std::string value;
+ ASSERT_NOK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_NOK(db_secondary_->Get(ropts, "bar", &value));
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ ASSERT_EQ(options.level0_file_num_compaction_trigger, table_files_not_exist);
+ ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_EQ("foo_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ value);
+ ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
+ ASSERT_EQ("bar_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ value);
+ Iterator* iter = db_secondary_->NewIterator(ropts);
+ ASSERT_NE(nullptr, iter);
+ iter->Seek("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar", iter->key().ToString());
+ ASSERT_EQ("bar_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ iter->value().ToString());
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ("foo_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ iter->value().ToString());
+ size_t count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ++count;
+ }
+ ASSERT_EQ(2, count);
+ delete iter;
+}
+
+TEST_F(DBSecondaryTest, PrimaryDropColumnFamily) {
+ Options options;
+ options.env = env_;
+ const std::string kCfName1 = "pikachu";
+ CreateAndReopenWithCF({kCfName1}, options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondaryWithColumnFamilies({kCfName1}, options1);
+ ASSERT_EQ(2, handles_secondary_.size());
+
+ ASSERT_OK(Put(1 /*cf*/, "foo", "foo_val_1"));
+ ASSERT_OK(Flush(1 /*cf*/));
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ std::string value;
+ ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value));
+ ASSERT_EQ("foo_val_1", value);
+
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ Close();
+ CheckFileTypeCounts(dbname_, 1, 0, 1);
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ value.clear();
+ ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value));
+ ASSERT_EQ("foo_val_1", value);
+}
+
+TEST_F(DBSecondaryTest, SwitchManifest) {
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ Reopen(options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
+ const int kNumFiles = options.level0_file_num_compaction_trigger - 1;
+ // Keep it smaller than 10 so that key0, key1, ..., key9 are sorted as 0, 1,
+ // ..., 9.
+ const int kNumKeys = 10;
+  // Create kNumFiles SST files
+ for (int i = 0; i != kNumFiles; ++i) {
+ for (int j = 0; j != kNumKeys; ++j) {
+ ASSERT_OK(Put("key" + std::to_string(j), "value_" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ const auto& range_scan_db = [&]() {
+ ReadOptions tmp_ropts;
+ tmp_ropts.total_order_seek = true;
+ tmp_ropts.verify_checksums = true;
+ std::unique_ptr<Iterator> iter(db_secondary_->NewIterator(tmp_ropts));
+ int cnt = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++cnt) {
+ ASSERT_EQ("key" + std::to_string(cnt), iter->key().ToString());
+ ASSERT_EQ("value_" + std::to_string(kNumFiles - 1),
+ iter->value().ToString());
+ }
+ };
+
+ range_scan_db();
+
+  // While the secondary instance still keeps the old MANIFEST open, we close
+  // the primary, restart it, perform a full compaction, close it again, and
+  // restart it once more, so that the next time the secondary tries to catch
+  // up with the primary, it will skip the MANIFEST in the middle.
+ Reopen(options);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ Reopen(options);
+ ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ range_scan_db();
+}
+
+// Here, "Snapshot" refers to the version edits written by
+// VersionSet::WriteSnapshot() at the beginning of the new MANIFEST after
+// switching from the old one.
+TEST_F(DBSecondaryTest, SkipSnapshotAfterManifestSwitch) {
+ Options options;
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
+ ASSERT_OK(Put("0", "value0"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ std::string value;
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ ASSERT_OK(db_secondary_->Get(ropts, "0", &value));
+ ASSERT_EQ("value0", value);
+
+ Reopen(options);
+ ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+}
+
+TEST_F(DBSecondaryTest, SwitchWAL) {
+ const int kNumKeysPerMemtable = 1;
+ Options options;
+ options.env = env_;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 2;
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(kNumKeysPerMemtable));
+ Reopen(options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
+ const auto& verify_db = [](DB* db1, DB* db2) {
+ ASSERT_NE(nullptr, db1);
+ ASSERT_NE(nullptr, db2);
+ ReadOptions read_opts;
+ read_opts.verify_checksums = true;
+ std::unique_ptr<Iterator> it1(db1->NewIterator(read_opts));
+ std::unique_ptr<Iterator> it2(db2->NewIterator(read_opts));
+ it1->SeekToFirst();
+ it2->SeekToFirst();
+ for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) {
+ ASSERT_EQ(it1->key(), it2->key());
+ ASSERT_EQ(it1->value(), it2->value());
+ }
+ ASSERT_FALSE(it1->Valid());
+ ASSERT_FALSE(it2->Valid());
+
+ for (it1->SeekToFirst(); it1->Valid(); it1->Next()) {
+ std::string value;
+ ASSERT_OK(db2->Get(read_opts, it1->key(), &value));
+ ASSERT_EQ(it1->value(), value);
+ }
+ for (it2->SeekToFirst(); it2->Valid(); it2->Next()) {
+ std::string value;
+ ASSERT_OK(db1->Get(read_opts, it2->key(), &value));
+ ASSERT_EQ(it2->value(), value);
+ }
+ };
+ for (int k = 0; k != 16; ++k) {
+ ASSERT_OK(Put("key" + std::to_string(k), "value" + std::to_string(k)));
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db(dbfull(), db_secondary_);
+ }
+}
+
+TEST_F(DBSecondaryTest, SwitchWALMultiColumnFamilies) {
+ const int kNumKeysPerMemtable = 1;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BackgroundCallFlush:ContextCleanedUp",
+ "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+ const std::string kCFName1 = "pikachu";
+ Options options;
+ options.env = env_;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 2;
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(kNumKeysPerMemtable));
+ CreateAndReopenWithCF({kCFName1}, options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondaryWithColumnFamilies({kCFName1}, options1);
+ ASSERT_EQ(2, handles_secondary_.size());
+
+ const auto& verify_db = [](DB* db1,
+ const std::vector<ColumnFamilyHandle*>& handles1,
+ DB* db2,
+ const std::vector<ColumnFamilyHandle*>& handles2) {
+ ASSERT_NE(nullptr, db1);
+ ASSERT_NE(nullptr, db2);
+ ReadOptions read_opts;
+ read_opts.verify_checksums = true;
+ ASSERT_EQ(handles1.size(), handles2.size());
+ for (size_t i = 0; i != handles1.size(); ++i) {
+ std::unique_ptr<Iterator> it1(db1->NewIterator(read_opts, handles1[i]));
+ std::unique_ptr<Iterator> it2(db2->NewIterator(read_opts, handles2[i]));
+ it1->SeekToFirst();
+ it2->SeekToFirst();
+ for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) {
+ ASSERT_EQ(it1->key(), it2->key());
+ ASSERT_EQ(it1->value(), it2->value());
+ }
+ ASSERT_FALSE(it1->Valid());
+ ASSERT_FALSE(it2->Valid());
+
+ for (it1->SeekToFirst(); it1->Valid(); it1->Next()) {
+ std::string value;
+ ASSERT_OK(db2->Get(read_opts, handles2[i], it1->key(), &value));
+ ASSERT_EQ(it1->value(), value);
+ }
+ for (it2->SeekToFirst(); it2->Valid(); it2->Next()) {
+ std::string value;
+ ASSERT_OK(db1->Get(read_opts, handles1[i], it2->key(), &value));
+ ASSERT_EQ(it2->value(), value);
+ }
+ }
+ };
+ for (int k = 0; k != 8; ++k) {
+ ASSERT_OK(
+ Put(0 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k)));
+ ASSERT_OK(
+ Put(1 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k)));
+ TEST_SYNC_POINT(
+ "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp");
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db(dbfull(), handles_, db_secondary_, handles_secondary_);
+ SyncPoint::GetInstance()->ClearTrace();
+ }
+}
+
+TEST_F(DBSecondaryTest, CatchUpAfterFlush) {
+ const int kNumKeysPerMemtable = 16;
+ Options options;
+ options.env = env_;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 2;
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(kNumKeysPerMemtable));
+ Reopen(options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
+ WriteOptions write_opts;
+ WriteBatch wb;
+ wb.Put("key0", "value0");
+ wb.Put("key1", "value1");
+ ASSERT_OK(dbfull()->Write(write_opts, &wb));
+ ReadOptions read_opts;
+ std::unique_ptr<Iterator> iter1(db_secondary_->NewIterator(read_opts));
+ iter1->Seek("key0");
+ ASSERT_FALSE(iter1->Valid());
+ iter1->Seek("key1");
+ ASSERT_FALSE(iter1->Valid());
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ iter1->Seek("key0");
+ ASSERT_FALSE(iter1->Valid());
+ iter1->Seek("key1");
+ ASSERT_FALSE(iter1->Valid());
+ std::unique_ptr<Iterator> iter2(db_secondary_->NewIterator(read_opts));
+ iter2->Seek("key0");
+ ASSERT_TRUE(iter2->Valid());
+ ASSERT_EQ("value0", iter2->value());
+ iter2->Seek("key1");
+ ASSERT_TRUE(iter2->Valid());
+ ASSERT_EQ("value1", iter2->value());
+
+ {
+ WriteBatch wb1;
+ wb1.Put("key0", "value01");
+ wb1.Put("key1", "value11");
+ ASSERT_OK(dbfull()->Write(write_opts, &wb1));
+ }
+
+ {
+ WriteBatch wb2;
+ wb2.Put("key0", "new_value0");
+ wb2.Delete("key1");
+ ASSERT_OK(dbfull()->Write(write_opts, &wb2));
+ }
+
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ std::unique_ptr<Iterator> iter3(db_secondary_->NewIterator(read_opts));
+ // iter3 should not see value01 and value11 at all.
+ iter3->Seek("key0");
+ ASSERT_TRUE(iter3->Valid());
+ ASSERT_EQ("new_value0", iter3->value());
+ iter3->Seek("key1");
+ ASSERT_FALSE(iter3->Valid());
+}
+
+TEST_F(DBSecondaryTest, CheckConsistencyWhenOpen) {
+ bool called = false;
+ Options options;
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImplSecondary::CheckConsistency:AfterFirstAttempt", [&](void* arg) {
+ ASSERT_NE(nullptr, arg);
+ called = true;
+ auto* s = reinterpret_cast<Status*>(arg);
+ ASSERT_NOK(*s);
+ });
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::CheckConsistency:AfterGetLiveFilesMetaData",
+ "BackgroundCallCompaction:0"},
+ {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
+ "DBImpl::CheckConsistency:BeforeGetFileSize"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put("a", "value0"));
+ ASSERT_OK(Put("c", "value0"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("b", "value1"));
+ ASSERT_OK(Put("d", "value1"));
+ ASSERT_OK(Flush());
+ port::Thread thread([this]() {
+ Options opts;
+ opts.env = env_;
+ opts.max_open_files = -1;
+ OpenSecondary(opts);
+ });
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ thread.join();
+ ASSERT_TRUE(called);
+}
+#endif //! ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_info_dumper.cc b/src/rocksdb/db/db_info_dumper.cc
new file mode 100644
index 000000000..7008ca6ff
--- /dev/null
+++ b/src/rocksdb/db/db_info_dumper.cc
@@ -0,0 +1,123 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_info_dumper.h"
+
+#include <stdio.h>
+#include <algorithm>
+#include <cinttypes>
+#include <string>
+#include <vector>
+
+#include "file/filename.h"
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void DumpDBFileSummary(const ImmutableDBOptions& options,
+ const std::string& dbname) {
+ if (options.info_log == nullptr) {
+ return;
+ }
+
+ auto* env = options.env;
+ uint64_t number = 0;
+ FileType type = kInfoLogFile;
+
+ std::vector<std::string> files;
+ uint64_t file_num = 0;
+ uint64_t file_size;
+ std::string file_info, wal_info;
+
+ Header(options.info_log, "DB SUMMARY\n");
+ // Get files in dbname dir
+ if (!env->GetChildren(dbname, &files).ok()) {
+ Error(options.info_log,
+ "Error when reading %s dir\n", dbname.c_str());
+ }
+ std::sort(files.begin(), files.end());
+ for (const std::string& file : files) {
+ if (!ParseFileName(file, &number, &type)) {
+ continue;
+ }
+ switch (type) {
+ case kCurrentFile:
+ Header(options.info_log, "CURRENT file: %s\n", file.c_str());
+ break;
+ case kIdentityFile:
+ Header(options.info_log, "IDENTITY file: %s\n", file.c_str());
+ break;
+ case kDescriptorFile:
+ env->GetFileSize(dbname + "/" + file, &file_size);
+ Header(options.info_log, "MANIFEST file: %s size: %" PRIu64 " Bytes\n",
+ file.c_str(), file_size);
+ break;
+ case kLogFile:
+ env->GetFileSize(dbname + "/" + file, &file_size);
+ char str[16];
+ snprintf(str, sizeof(str), "%" PRIu64, file_size);
+ wal_info.append(file).append(" size: ").
+ append(str).append(" ; ");
+ break;
+ case kTableFile:
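+        // Only record the names of the first few SST files so the summary
+        // stays short; the total count is still reported below.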
+ if (++file_num < 10) {
+ file_info.append(file).append(" ");
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+  // Get SST files in each db_path dir
+ for (auto& db_path : options.db_paths) {
+ if (dbname.compare(db_path.path) != 0) {
+ if (!env->GetChildren(db_path.path, &files).ok()) {
+ Error(options.info_log,
+ "Error when reading %s dir\n",
+ db_path.path.c_str());
+ continue;
+ }
+ std::sort(files.begin(), files.end());
+ for (const std::string& file : files) {
+ if (ParseFileName(file, &number, &type)) {
+ if (type == kTableFile && ++file_num < 10) {
+ file_info.append(file).append(" ");
+ }
+ }
+ }
+ }
+ Header(options.info_log,
+ "SST files in %s dir, Total Num: %" PRIu64 ", files: %s\n",
+ db_path.path.c_str(), file_num, file_info.c_str());
+ file_num = 0;
+ file_info.clear();
+ }
+
+  // Get WAL files in wal_dir
+ if (dbname.compare(options.wal_dir) != 0) {
+ if (!env->GetChildren(options.wal_dir, &files).ok()) {
+ Error(options.info_log,
+ "Error when reading %s dir\n",
+ options.wal_dir.c_str());
+ return;
+ }
+ wal_info.clear();
+ for (const std::string& file : files) {
+ if (ParseFileName(file, &number, &type)) {
+ if (type == kLogFile) {
+ env->GetFileSize(options.wal_dir + "/" + file, &file_size);
+ char str[16];
+ snprintf(str, sizeof(str), "%" PRIu64, file_size);
+ wal_info.append(file).append(" size: ").
+ append(str).append(" ; ");
+ }
+ }
+ }
+ }
+ Header(options.info_log, "Write Ahead Log file in %s: %s\n",
+ options.wal_dir.c_str(), wal_info.c_str());
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_info_dumper.h b/src/rocksdb/db/db_info_dumper.h
new file mode 100644
index 000000000..91404cbd7
--- /dev/null
+++ b/src/rocksdb/db/db_info_dumper.h
@@ -0,0 +1,14 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <string>
+
+#include "options/db_options.h"
+
+namespace ROCKSDB_NAMESPACE {
+void DumpDBFileSummary(const ImmutableDBOptions& options,
+ const std::string& dbname);
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_inplace_update_test.cc b/src/rocksdb/db/db_inplace_update_test.cc
new file mode 100644
index 000000000..26405864e
--- /dev/null
+++ b/src/rocksdb/db/db_inplace_update_test.cc
@@ -0,0 +1,177 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBTestInPlaceUpdate : public DBTestBase {
+ public:
+ DBTestInPlaceUpdate() : DBTestBase("/db_inplace_update_test") {}
+};
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdate) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.allow_concurrent_memtable_write = false;
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Update key with values of smaller size
+ int numValues = 10;
+ for (int i = numValues; i > 0; i--) {
+ std::string value = DummyString(i, 'a');
+ ASSERT_OK(Put(1, "key", value));
+ ASSERT_EQ(value, Get(1, "key"));
+ }
+
+ // Only 1 instance for that key.
+ validateNumberOfEntries(1, 1);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateLargeNewValue) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.allow_concurrent_memtable_write = false;
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Update key with values of larger size
+ int numValues = 10;
+ for (int i = 0; i < numValues; i++) {
+ std::string value = DummyString(i, 'a');
+ ASSERT_OK(Put(1, "key", value));
+ ASSERT_EQ(value, Get(1, "key"));
+ }
+
+ // All 10 updates exist in the internal iterator
+ validateNumberOfEntries(numValues, 1);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackSmallerSize) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.inplace_callback =
+ ROCKSDB_NAMESPACE::DBTestInPlaceUpdate::updateInPlaceSmallerSize;
+ options.allow_concurrent_memtable_write = false;
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Update key with values of smaller size
+ int numValues = 10;
+ ASSERT_OK(Put(1, "key", DummyString(numValues, 'a')));
+ ASSERT_EQ(DummyString(numValues, 'c'), Get(1, "key"));
+
+ for (int i = numValues; i > 0; i--) {
+ ASSERT_OK(Put(1, "key", DummyString(i, 'a')));
+ ASSERT_EQ(DummyString(i - 1, 'b'), Get(1, "key"));
+ }
+
+ // Only 1 instance for that key.
+ validateNumberOfEntries(1, 1);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackSmallerVarintSize) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.inplace_callback =
+ ROCKSDB_NAMESPACE::DBTestInPlaceUpdate::updateInPlaceSmallerVarintSize;
+ options.allow_concurrent_memtable_write = false;
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Update key with values of smaller varint size
+ int numValues = 265;
+ ASSERT_OK(Put(1, "key", DummyString(numValues, 'a')));
+ ASSERT_EQ(DummyString(numValues, 'c'), Get(1, "key"));
+
+ for (int i = numValues; i > 0; i--) {
+ ASSERT_OK(Put(1, "key", DummyString(i, 'a')));
+ ASSERT_EQ(DummyString(1, 'b'), Get(1, "key"));
+ }
+
+ // Only 1 instance for that key.
+ validateNumberOfEntries(1, 1);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackLargeNewValue) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.inplace_callback =
+ ROCKSDB_NAMESPACE::DBTestInPlaceUpdate::updateInPlaceLargerSize;
+ options.allow_concurrent_memtable_write = false;
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Update key with values of larger size
+ int numValues = 10;
+ for (int i = 0; i < numValues; i++) {
+ ASSERT_OK(Put(1, "key", DummyString(i, 'a')));
+ ASSERT_EQ(DummyString(i, 'c'), Get(1, "key"));
+ }
+
+    // No in-place updates. All updates are puts with a new sequence number.
+    // All 10 updates exist in the internal iterator.
+ validateNumberOfEntries(numValues, 1);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackNoAction) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.inplace_callback =
+ ROCKSDB_NAMESPACE::DBTestInPlaceUpdate::updateInPlaceNoAction;
+ options.allow_concurrent_memtable_write = false;
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Callback function requests no actions from db
+ ASSERT_OK(Put(1, "key", DummyString(1, 'a')));
+ ASSERT_EQ(Get(1, "key"), "NOT_FOUND");
+ } while (ChangeCompactOptions());
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_io_failure_test.cc b/src/rocksdb/db/db_io_failure_test.cc
new file mode 100644
index 000000000..f8d562447
--- /dev/null
+++ b/src/rocksdb/db/db_io_failure_test.cc
@@ -0,0 +1,568 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBIOFailureTest : public DBTestBase {
+ public:
+ DBIOFailureTest() : DBTestBase("/db_io_failure_test") {}
+};
+
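+// These tests drive I/O failures through the fault-injection hooks exposed by
+// SpecialEnv in db_test_util.h (for example drop_writes_, no_space_,
+// non_writeable_rate_, manifest_sync_error_ and log_write_error_), so no real
+// filesystem faults are needed.
+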
+#ifndef ROCKSDB_LITE
+// Check that number of files does not grow when writes are dropped
+TEST_F(DBIOFailureTest, DropWrites) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.paranoid_checks = false;
+ Reopen(options);
+
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_EQ("v1", Get("foo"));
+ Compact("a", "z");
+ const size_t num_files = CountFiles();
+ // Force out-of-space errors
+ env_->drop_writes_.store(true, std::memory_order_release);
+ env_->sleep_counter_.Reset();
+ env_->no_slowdown_ = true;
+ for (int i = 0; i < 5; i++) {
+ if (option_config_ != kUniversalCompactionMultiLevel &&
+ option_config_ != kUniversalSubcompactions) {
+ for (int level = 0; level < dbfull()->NumberLevels(); level++) {
+ if (level > 0 && level == dbfull()->NumberLevels() - 1) {
+ break;
+ }
+ dbfull()->TEST_CompactRange(level, nullptr, nullptr, nullptr,
+ true /* disallow trivial move */);
+ }
+ } else {
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ }
+ }
+
+ std::string property_value;
+ ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
+ ASSERT_EQ("5", property_value);
+
+ env_->drop_writes_.store(false, std::memory_order_release);
+ ASSERT_LT(CountFiles(), num_files + 3);
+
+ // Check that compaction attempts slept after errors
+ // TODO @krad: Figure out why ASSERT_EQ 5 keeps failing in certain compiler
+ // versions
+ ASSERT_GE(env_->sleep_counter_.Read(), 4);
+ } while (ChangeCompactOptions());
+}
+
+// Check background error counter bumped on flush failures.
+TEST_F(DBIOFailureTest, DropWritesFlush) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.max_background_flushes = 1;
+ Reopen(options);
+
+ ASSERT_OK(Put("foo", "v1"));
+ // Force out-of-space errors
+ env_->drop_writes_.store(true, std::memory_order_release);
+
+ std::string property_value;
+ // Background error count is 0 now.
+ ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
+ ASSERT_EQ("0", property_value);
+
+ dbfull()->TEST_FlushMemTable(true);
+
+ ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
+ ASSERT_EQ("1", property_value);
+
+ env_->drop_writes_.store(false, std::memory_order_release);
+ } while (ChangeCompactOptions());
+}
+
+// Check that CompactRange() returns failure if there is not enough space left
+// on device
+TEST_F(DBIOFailureTest, NoSpaceCompactRange) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+
+ // generate 5 tables
+ for (int i = 0; i < 5; ++i) {
+ ASSERT_OK(Put(Key(i), Key(i) + "v"));
+ ASSERT_OK(Flush());
+ }
+
+ // Force out-of-space errors
+ env_->no_space_.store(true, std::memory_order_release);
+
+ Status s = dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow trivial move */);
+ ASSERT_TRUE(s.IsIOError());
+ ASSERT_TRUE(s.IsNoSpace());
+
+ env_->no_space_.store(false, std::memory_order_release);
+ } while (ChangeCompactOptions());
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBIOFailureTest, NonWritableFileSystem) {
+ do {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 4096;
+ options.arena_block_size = 4096;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_OK(Put("foo", "v1"));
+ env_->non_writeable_rate_.store(100);
+ std::string big(100000, 'x');
+ int errors = 0;
+ for (int i = 0; i < 20; i++) {
+ if (!Put("foo", big).ok()) {
+ errors++;
+ env_->SleepForMicroseconds(100000);
+ }
+ }
+ ASSERT_GT(errors, 0);
+ env_->non_writeable_rate_.store(0);
+ } while (ChangeCompactOptions());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBIOFailureTest, ManifestWriteError) {
+ // Test for the following problem:
+ // (a) Compaction produces file F
+ // (b) Log record containing F is written to MANIFEST file, but Sync() fails
+ // (c) GC deletes F
+ // (d) After reopening DB, reads fail since deleted F is named in log record
+
+ // We iterate twice. In the second iteration, everything is the
+ // same except the log record never makes it to the MANIFEST file.
+ for (int iter = 0; iter < 2; iter++) {
+ std::atomic<bool>* error_type = (iter == 0) ? &env_->manifest_sync_error_
+ : &env_->manifest_write_error_;
+
+ // Insert foo=>bar mapping
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_EQ("bar", Get("foo"));
+
+ // Memtable compaction (will succeed)
+ Flush();
+ ASSERT_EQ("bar", Get("foo"));
+ const int last = 2;
+ MoveFilesToLevel(2);
+ ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo=>bar is now in last level
+
+ // Merging compaction (will fail)
+ error_type->store(true, std::memory_order_release);
+ dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail
+ ASSERT_EQ("bar", Get("foo"));
+
+ error_type->store(false, std::memory_order_release);
+
+ // Since paranoid_checks=true, writes should fail
+ ASSERT_NOK(Put("foo2", "bar2"));
+
+ // Recovery: should not lose data
+ ASSERT_EQ("bar", Get("foo"));
+
+ // Try again with paranoid_checks=false
+ Close();
+ options.paranoid_checks = false;
+ Reopen(options);
+
+ // Merging compaction (will fail)
+ error_type->store(true, std::memory_order_release);
+ dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail
+ ASSERT_EQ("bar", Get("foo"));
+
+ // Recovery: should not lose data
+ error_type->store(false, std::memory_order_release);
+ Reopen(options);
+ ASSERT_EQ("bar", Get("foo"));
+
+ // Since paranoid_checks=false, writes should succeed
+ ASSERT_OK(Put("foo2", "bar2"));
+ ASSERT_EQ("bar", Get("foo"));
+ ASSERT_EQ("bar2", Get("foo2"));
+ }
+}
+
+TEST_F(DBIOFailureTest, PutFailsParanoid) {
+ // Test the following:
+ // (a) A random put fails in paranoid mode (simulated by a sync failure)
+ // (b) All other puts have to fail, even if writes would succeed
+ // (c) All of that should happen ONLY if paranoid_checks = true
+
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Status s;
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "foo1", "bar1"));
+ // simulate error
+ env_->log_write_error_.store(true, std::memory_order_release);
+ s = Put(1, "foo2", "bar2");
+ ASSERT_TRUE(!s.ok());
+ env_->log_write_error_.store(false, std::memory_order_release);
+ s = Put(1, "foo3", "bar3");
+ // the next put should fail, too
+ ASSERT_TRUE(!s.ok());
+ // but we're still able to read
+ ASSERT_EQ("bar", Get(1, "foo"));
+
+ // do the same thing with paranoid checks off
+ options.paranoid_checks = false;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "foo1", "bar1"));
+ // simulate error
+ env_->log_write_error_.store(true, std::memory_order_release);
+ s = Put(1, "foo2", "bar2");
+ ASSERT_TRUE(!s.ok());
+ env_->log_write_error_.store(false, std::memory_order_release);
+ s = Put(1, "foo3", "bar3");
+ // the next put should NOT fail
+ ASSERT_TRUE(s.ok());
+}
+#if !(defined NDEBUG) || !defined(OS_WIN)
+TEST_F(DBIOFailureTest, FlushSstRangeSyncError) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ options.write_buffer_size = 256 * 1024 * 1024;
+ options.writable_file_max_buffer_size = 128 * 1024;
+ options.bytes_per_sync = 128 * 1024;
+ options.level0_file_num_compaction_trigger = 4;
+ options.memtable_factory.reset(new SpecialSkipListFactory(10));
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Status s;
+
+ std::atomic<int> range_sync_called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SpecialEnv::SStableFile::RangeSync", [&](void* arg) {
+ if (range_sync_called.fetch_add(1) == 0) {
+ Status* st = static_cast<Status*>(arg);
+ *st = Status::IOError("range sync dummy error");
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ std::string rnd_str =
+ RandomString(&rnd, static_cast<int>(options.bytes_per_sync / 2));
+ std::string rnd_str_512kb = RandomString(&rnd, 512 * 1024);
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ // First 1MB doesn't get range synced
+ ASSERT_OK(Put(1, "foo0_0", rnd_str_512kb));
+ ASSERT_OK(Put(1, "foo0_1", rnd_str_512kb));
+ ASSERT_OK(Put(1, "foo1_1", rnd_str));
+ ASSERT_OK(Put(1, "foo1_2", rnd_str));
+ ASSERT_OK(Put(1, "foo1_3", rnd_str));
+ ASSERT_OK(Put(1, "foo2", "bar"));
+ ASSERT_OK(Put(1, "foo3_1", rnd_str));
+ ASSERT_OK(Put(1, "foo3_2", rnd_str));
+ ASSERT_OK(Put(1, "foo3_3", rnd_str));
+ ASSERT_OK(Put(1, "foo4", "bar"));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+
+ // Following writes should fail as flush failed.
+ ASSERT_NOK(Put(1, "foo2", "bar3"));
+ ASSERT_EQ("bar", Get(1, "foo"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_GE(1, range_sync_called.load());
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("bar", Get(1, "foo"));
+}
+
+TEST_F(DBIOFailureTest, CompactSstRangeSyncError) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ options.write_buffer_size = 256 * 1024 * 1024;
+ options.writable_file_max_buffer_size = 128 * 1024;
+ options.bytes_per_sync = 128 * 1024;
+ options.level0_file_num_compaction_trigger = 2;
+ options.target_file_size_base = 256 * 1024 * 1024;
+ options.disable_auto_compactions = true;
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Status s;
+
+ Random rnd(301);
+ std::string rnd_str =
+ RandomString(&rnd, static_cast<int>(options.bytes_per_sync / 2));
+ std::string rnd_str_512kb = RandomString(&rnd, 512 * 1024);
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ // First 1MB doesn't get range synced
+ ASSERT_OK(Put(1, "foo0_0", rnd_str_512kb));
+ ASSERT_OK(Put(1, "foo0_1", rnd_str_512kb));
+ ASSERT_OK(Put(1, "foo1_1", rnd_str));
+ ASSERT_OK(Put(1, "foo1_2", rnd_str));
+ ASSERT_OK(Put(1, "foo1_3", rnd_str));
+ Flush(1);
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "foo3_1", rnd_str));
+ ASSERT_OK(Put(1, "foo3_2", rnd_str));
+ ASSERT_OK(Put(1, "foo3_3", rnd_str));
+ ASSERT_OK(Put(1, "foo4", "bar"));
+ Flush(1);
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+
+ std::atomic<int> range_sync_called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SpecialEnv::SStableFile::RangeSync", [&](void* arg) {
+ if (range_sync_called.fetch_add(1) == 0) {
+ Status* st = static_cast<Status*>(arg);
+ *st = Status::IOError("range sync dummy error");
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(dbfull()->SetOptions(handles_[1],
+ {
+ {"disable_auto_compactions", "false"},
+ }));
+ dbfull()->TEST_WaitForCompact();
+
+ // Following writes should fail as the compaction failed.
+ ASSERT_NOK(Put(1, "foo2", "bar3"));
+ ASSERT_EQ("bar", Get(1, "foo"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_GE(1, range_sync_called.load());
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("bar", Get(1, "foo"));
+}
+
+TEST_F(DBIOFailureTest, FlushSstCloseError) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ options.level0_file_num_compaction_trigger = 4;
+ options.memtable_factory.reset(new SpecialSkipListFactory(2));
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Status s;
+ std::atomic<int> close_called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SpecialEnv::SStableFile::Close", [&](void* arg) {
+ if (close_called.fetch_add(1) == 0) {
+ Status* st = static_cast<Status*>(arg);
+ *st = Status::IOError("close dummy error");
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "foo1", "bar1"));
+ ASSERT_OK(Put(1, "foo", "bar2"));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+
+ // Following writes should fail as flush failed.
+ ASSERT_NOK(Put(1, "foo2", "bar3"));
+ ASSERT_EQ("bar2", Get(1, "foo"));
+ ASSERT_EQ("bar1", Get(1, "foo1"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("bar2", Get(1, "foo"));
+ ASSERT_EQ("bar1", Get(1, "foo1"));
+}
+
+TEST_F(DBIOFailureTest, CompactionSstCloseError) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.disable_auto_compactions = true;
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Status s;
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "foo2", "bar"));
+ Flush(1);
+ ASSERT_OK(Put(1, "foo", "bar2"));
+ ASSERT_OK(Put(1, "foo2", "bar"));
+ Flush(1);
+ ASSERT_OK(Put(1, "foo", "bar3"));
+ ASSERT_OK(Put(1, "foo2", "bar"));
+ Flush(1);
+ dbfull()->TEST_WaitForCompact();
+
+ std::atomic<int> close_called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SpecialEnv::SStableFile::Close", [&](void* arg) {
+ if (close_called.fetch_add(1) == 0) {
+ Status* st = static_cast<Status*>(arg);
+ *st = Status::IOError("close dummy error");
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(dbfull()->SetOptions(handles_[1],
+ {
+ {"disable_auto_compactions", "false"},
+ }));
+ dbfull()->TEST_WaitForCompact();
+
+ // Following writes should fail as compaction failed.
+ ASSERT_NOK(Put(1, "foo2", "bar3"));
+ ASSERT_EQ("bar3", Get(1, "foo"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("bar3", Get(1, "foo"));
+}
+
+TEST_F(DBIOFailureTest, FlushSstSyncError) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ options.use_fsync = false;
+ options.level0_file_num_compaction_trigger = 4;
+ options.memtable_factory.reset(new SpecialSkipListFactory(2));
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Status s;
+ std::atomic<int> sync_called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SpecialEnv::SStableFile::Sync", [&](void* arg) {
+ if (sync_called.fetch_add(1) == 0) {
+ Status* st = static_cast<Status*>(arg);
+ *st = Status::IOError("sync dummy error");
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "foo1", "bar1"));
+ ASSERT_OK(Put(1, "foo", "bar2"));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+
+ // Following writes should fail as flush failed.
+ ASSERT_NOK(Put(1, "foo2", "bar3"));
+ ASSERT_EQ("bar2", Get(1, "foo"));
+ ASSERT_EQ("bar1", Get(1, "foo1"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("bar2", Get(1, "foo"));
+ ASSERT_EQ("bar1", Get(1, "foo1"));
+}
+
+TEST_F(DBIOFailureTest, CompactionSstSyncError) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.disable_auto_compactions = true;
+ options.use_fsync = false;
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Status s;
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "foo2", "bar"));
+ Flush(1);
+ ASSERT_OK(Put(1, "foo", "bar2"));
+ ASSERT_OK(Put(1, "foo2", "bar"));
+ Flush(1);
+ ASSERT_OK(Put(1, "foo", "bar3"));
+ ASSERT_OK(Put(1, "foo2", "bar"));
+ Flush(1);
+ dbfull()->TEST_WaitForCompact();
+
+ std::atomic<int> sync_called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SpecialEnv::SStableFile::Sync", [&](void* arg) {
+ if (sync_called.fetch_add(1) == 0) {
+ Status* st = static_cast<Status*>(arg);
+ *st = Status::IOError("sync dummy error");
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(dbfull()->SetOptions(handles_[1],
+ {
+ {"disable_auto_compactions", "false"},
+ }));
+ dbfull()->TEST_WaitForCompact();
+
+ // Following writes should fail as compaction failed.
+ ASSERT_NOK(Put(1, "foo2", "bar3"));
+ ASSERT_EQ("bar3", Get(1, "foo"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("bar3", Get(1, "foo"));
+}
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_iter.cc b/src/rocksdb/db/db_iter.cc
new file mode 100644
index 000000000..e5d402948
--- /dev/null
+++ b/src/rocksdb/db/db_iter.cc
@@ -0,0 +1,1310 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_iter.h"
+#include <string>
+#include <iostream>
+#include <limits>
+
+#include "db/dbformat.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "memory/arena.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/options.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+#include "trace_replay/trace_replay.h"
+#include "util/mutexlock.h"
+#include "util/string_util.h"
+#include "util/user_comparator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+DBIter::DBIter(Env* _env, const ReadOptions& read_options,
+ const ImmutableCFOptions& cf_options,
+ const MutableCFOptions& mutable_cf_options,
+ const Comparator* cmp, InternalIterator* iter, SequenceNumber s,
+ bool arena_mode, uint64_t max_sequential_skip_in_iterations,
+ ReadCallback* read_callback, DBImpl* db_impl,
+ ColumnFamilyData* cfd, bool allow_blob)
+ : prefix_extractor_(mutable_cf_options.prefix_extractor.get()),
+ env_(_env),
+ logger_(cf_options.info_log),
+ user_comparator_(cmp),
+ merge_operator_(cf_options.merge_operator),
+ iter_(iter),
+ read_callback_(read_callback),
+ sequence_(s),
+ statistics_(cf_options.statistics),
+ num_internal_keys_skipped_(0),
+ iterate_lower_bound_(read_options.iterate_lower_bound),
+ iterate_upper_bound_(read_options.iterate_upper_bound),
+ direction_(kForward),
+ valid_(false),
+ current_entry_is_merged_(false),
+ is_key_seqnum_zero_(false),
+ prefix_same_as_start_(mutable_cf_options.prefix_extractor
+ ? read_options.prefix_same_as_start
+ : false),
+ pin_thru_lifetime_(read_options.pin_data),
+ expect_total_order_inner_iter_(prefix_extractor_ == nullptr ||
+ read_options.total_order_seek ||
+ read_options.auto_prefix_mode),
+ allow_blob_(allow_blob),
+ is_blob_(false),
+ arena_mode_(arena_mode),
+ range_del_agg_(&cf_options.internal_comparator, s),
+ db_impl_(db_impl),
+ cfd_(cfd),
+ start_seqnum_(read_options.iter_start_seqnum) {
+ RecordTick(statistics_, NO_ITERATOR_CREATED);
+ max_skip_ = max_sequential_skip_in_iterations;
+ max_skippable_internal_keys_ = read_options.max_skippable_internal_keys;
+ if (pin_thru_lifetime_) {
+ pinned_iters_mgr_.StartPinning();
+ }
+ if (iter_.iter()) {
+ iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_);
+ }
+}
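+
+// Note: a DBIter is normally not constructed directly. It is created either
+// through NewDBIterator() at the bottom of this file (arena_mode == false) or
+// through the arena-backed wrapper in arena_wrapped_db_iter.cc
+// (arena_mode == true).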
+
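+// Illustrative use of the property interface from client code (a sketch only;
+// "it" is assumed to be an Iterator obtained from DB::NewIterator()):
+//
+//   std::string is_pinned;
+//   if (it->GetProperty("rocksdb.iterator.is-key-pinned", &is_pinned).ok()) {
+//     // "1" means pin_data was requested and the current key is pinned.
+//   }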
+Status DBIter::GetProperty(std::string prop_name, std::string* prop) {
+ if (prop == nullptr) {
+ return Status::InvalidArgument("prop is nullptr");
+ }
+ if (prop_name == "rocksdb.iterator.super-version-number") {
+ // First try to pass the value returned from inner iterator.
+ return iter_.iter()->GetProperty(prop_name, prop);
+ } else if (prop_name == "rocksdb.iterator.is-key-pinned") {
+ if (valid_) {
+ *prop = (pin_thru_lifetime_ && saved_key_.IsKeyPinned()) ? "1" : "0";
+ } else {
+ *prop = "Iterator is not valid.";
+ }
+ return Status::OK();
+ } else if (prop_name == "rocksdb.iterator.internal-key") {
+ *prop = saved_key_.GetUserKey().ToString();
+ return Status::OK();
+ }
+ return Status::InvalidArgument("Unidentified property.");
+}
+
+bool DBIter::ParseKey(ParsedInternalKey* ikey) {
+ if (!ParseInternalKey(iter_.key(), ikey)) {
+ status_ = Status::Corruption("corrupted internal key in DBIter");
+ valid_ = false;
+ ROCKS_LOG_ERROR(logger_, "corrupted internal key in DBIter: %s",
+ iter_.key().ToString(true).c_str());
+ return false;
+ } else {
+ return true;
+ }
+}
+
+void DBIter::Next() {
+ assert(valid_);
+ assert(status_.ok());
+
+ PERF_CPU_TIMER_GUARD(iter_next_cpu_nanos, env_);
+ // Release temporarily pinned blocks from last operation
+ ReleaseTempPinnedData();
+ local_stats_.skip_count_ += num_internal_keys_skipped_;
+ local_stats_.skip_count_--;
+ num_internal_keys_skipped_ = 0;
+ bool ok = true;
+ if (direction_ == kReverse) {
+ is_key_seqnum_zero_ = false;
+ if (!ReverseToForward()) {
+ ok = false;
+ }
+ } else if (!current_entry_is_merged_) {
+ // If the current value is not a merge, the iter position is the
+ // current key, which has already been returned. We can safely issue a
+ // Next() without checking the current key.
+ // If the current key is a merge, iter very likely already points
+ // to the next internal position.
+ assert(iter_.Valid());
+ iter_.Next();
+ PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+ }
+
+ local_stats_.next_count_++;
+ if (ok && iter_.Valid()) {
+ Slice prefix;
+ if (prefix_same_as_start_) {
+ assert(prefix_extractor_ != nullptr);
+ prefix = prefix_.GetUserKey();
+ }
+ FindNextUserEntry(true /* skipping the current user key */,
+ prefix_same_as_start_ ? &prefix : nullptr);
+ } else {
+ is_key_seqnum_zero_ = false;
+ valid_ = false;
+ }
+ if (statistics_ != nullptr && valid_) {
+ local_stats_.next_found_count_++;
+ local_stats_.bytes_read_ += (key().size() + value().size());
+ }
+}
+
+// PRE: saved_key_ has the current user key if skipping_saved_key
+// POST: saved_key_ should have the next user key if valid_,
+// if the current entry is a result of merge
+// current_entry_is_merged_ => true
+// saved_value_ => the merged value
+//
+// NOTE: In between, saved_key_ can point to a user key that has
+// a delete marker or a sequence number higher than sequence_
+// saved_key_ MUST have a proper user_key before calling this function
+//
+// The prefix parameter, if not null, indicates that we need to iterate
+// within the prefix, and the iterator needs to be made invalid if no
+// more entries for the prefix can be found.
+bool DBIter::FindNextUserEntry(bool skipping_saved_key, const Slice* prefix) {
+ PERF_TIMER_GUARD(find_next_user_entry_time);
+ return FindNextUserEntryInternal(skipping_saved_key, prefix);
+}
+
+// Actual implementation of DBIter::FindNextUserEntry()
+bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
+ const Slice* prefix) {
+ // Loop until we hit an acceptable entry to yield
+ assert(iter_.Valid());
+ assert(status_.ok());
+ assert(direction_ == kForward);
+ current_entry_is_merged_ = false;
+
+ // How many times in a row we have skipped an entry with user key less than
+ // or equal to saved_key_. We could skip these entries either because
+ // sequence numbers were too high or because skipping_saved_key = true.
+ // What saved_key_ contains throughout this method:
+ // - if skipping_saved_key : saved_key_ contains the key that we need
+ // to skip, and we haven't seen any keys greater
+ // than that,
+ // - if num_skipped > 0 : saved_key_ contains the key that we have skipped
+ // num_skipped times, and we haven't seen any keys
+ // greater than that,
+ // - none of the above : saved_key_ can contain anything, it doesn't
+ // matter.
+ uint64_t num_skipped = 0;
+ // For write unprepared, the target sequence number in reseek could be larger
+ // than the snapshot, and thus needs to be skipped again. This could result in
+ // an infinite loop of reseeks. To avoid that, we limit the number of reseeks
+ // to one.
+ bool reseek_done = false;
+
+ is_blob_ = false;
+
+ do {
+ // Will update is_key_seqnum_zero_ as soon as we parse the current key,
+ // but we need to save the previous value to be used in the loop.
+ bool is_prev_key_seqnum_zero = is_key_seqnum_zero_;
+ if (!ParseKey(&ikey_)) {
+ is_key_seqnum_zero_ = false;
+ return false;
+ }
+
+ is_key_seqnum_zero_ = (ikey_.sequence == 0);
+
+ assert(iterate_upper_bound_ == nullptr || iter_.MayBeOutOfUpperBound() ||
+ user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) < 0);
+ if (iterate_upper_bound_ != nullptr && iter_.MayBeOutOfUpperBound() &&
+ user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) >= 0) {
+ break;
+ }
+
+ assert(prefix == nullptr || prefix_extractor_ != nullptr);
+ if (prefix != nullptr &&
+ prefix_extractor_->Transform(ikey_.user_key).compare(*prefix) != 0) {
+ assert(prefix_same_as_start_);
+ break;
+ }
+
+ if (TooManyInternalKeysSkipped()) {
+ return false;
+ }
+
+ if (IsVisible(ikey_.sequence)) {
+ // If the previous entry has seqnum 0, the current entry cannot
+ // possibly be skipped. This condition could potentially be relaxed to
+ // prev_key.seq <= ikey_.sequence, but we are cautious because relaxing it
+ // is more prone to bugs where the same user key appears with the same
+ // sequence number.
+ if (!is_prev_key_seqnum_zero && skipping_saved_key &&
+ user_comparator_.Compare(ikey_.user_key, saved_key_.GetUserKey()) <=
+ 0) {
+ num_skipped++; // skip this entry
+ PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+ } else {
+ assert(!skipping_saved_key ||
+ user_comparator_.Compare(ikey_.user_key,
+ saved_key_.GetUserKey()) > 0);
+ num_skipped = 0;
+ reseek_done = false;
+ switch (ikey_.type) {
+ case kTypeDeletion:
+ case kTypeSingleDeletion:
+ // Arrange to skip all upcoming entries for this key since
+ // they are hidden by this deletion.
+ // If the iterator specified start_seqnum, we
+ // 1) return the internal key, including the type,
+ // 2) return ikey only if ikey.seqnum >= start_seqnum_.
+ // Note that if the deletion seqnum is < start_seqnum_ we
+ // just skip it like a normal iterator would.
+ if (start_seqnum_ > 0 && ikey_.sequence >= start_seqnum_) {
+ saved_key_.SetInternalKey(ikey_);
+ valid_ = true;
+ return true;
+ } else {
+ saved_key_.SetUserKey(
+ ikey_.user_key, !pin_thru_lifetime_ ||
+ !iter_.iter()->IsKeyPinned() /* copy */);
+ skipping_saved_key = true;
+ PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+ }
+ break;
+ case kTypeValue:
+ case kTypeBlobIndex:
+ if (start_seqnum_ > 0) {
+ // we are taking incremental snapshot here
+ // incremental snapshots aren't supported on DB with range deletes
+ assert(ikey_.type != kTypeBlobIndex);
+ if (ikey_.sequence >= start_seqnum_) {
+ saved_key_.SetInternalKey(ikey_);
+ valid_ = true;
+ return true;
+ } else {
+ // This key and all its previous versions shouldn't be included,
+ // so set skipping_saved_key.
+ saved_key_.SetUserKey(
+ ikey_.user_key,
+ !pin_thru_lifetime_ ||
+ !iter_.iter()->IsKeyPinned() /* copy */);
+ skipping_saved_key = true;
+ }
+ } else {
+ saved_key_.SetUserKey(
+ ikey_.user_key, !pin_thru_lifetime_ ||
+ !iter_.iter()->IsKeyPinned() /* copy */);
+ if (range_del_agg_.ShouldDelete(
+ ikey_, RangeDelPositioningMode::kForwardTraversal)) {
+ // Arrange to skip all upcoming entries for this key since
+ // they are hidden by this deletion.
+ skipping_saved_key = true;
+ num_skipped = 0;
+ reseek_done = false;
+ PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+ } else if (ikey_.type == kTypeBlobIndex) {
+ if (!allow_blob_) {
+ ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index.");
+ status_ = Status::NotSupported(
+ "Encounter unexpected blob index. Please open DB with "
+ "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
+ valid_ = false;
+ return false;
+ }
+
+ is_blob_ = true;
+ valid_ = true;
+ return true;
+ } else {
+ valid_ = true;
+ return true;
+ }
+ }
+ break;
+ case kTypeMerge:
+ saved_key_.SetUserKey(
+ ikey_.user_key,
+ !pin_thru_lifetime_ || !iter_.iter()->IsKeyPinned() /* copy */);
+ if (range_del_agg_.ShouldDelete(
+ ikey_, RangeDelPositioningMode::kForwardTraversal)) {
+ // Arrange to skip all upcoming entries for this key since
+ // they are hidden by this deletion.
+ skipping_saved_key = true;
+ num_skipped = 0;
+ reseek_done = false;
+ PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+ } else {
+ // By now, we are sure the current ikey is going to yield a
+ // value
+ current_entry_is_merged_ = true;
+ valid_ = true;
+ return MergeValuesNewToOld(); // Go to a different state machine
+ }
+ break;
+ default:
+ assert(false);
+ break;
+ }
+ }
+ } else {
+ PERF_COUNTER_ADD(internal_recent_skipped_count, 1);
+
+ // This key was inserted after our snapshot was taken.
+ // If this happens too many times in a row for the same user key, we want
+ // to seek to the target sequence number.
+ int cmp =
+ user_comparator_.Compare(ikey_.user_key, saved_key_.GetUserKey());
+ if (cmp == 0 || (skipping_saved_key && cmp < 0)) {
+ num_skipped++;
+ } else {
+ saved_key_.SetUserKey(
+ ikey_.user_key,
+ !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
+ skipping_saved_key = false;
+ num_skipped = 0;
+ reseek_done = false;
+ }
+ }
+
+ // If we have sequentially iterated via numerous equal keys, then it's
+ // better to seek so that we can avoid too many key comparisons.
+ //
+ // To avoid infinite loops, do not reseek if we have already attempted to
+ // reseek previously.
+ //
+ // TODO(lth): If we reseek to sequence number greater than ikey_.sequence,
+ // then it does not make sense to reseek as we would actually land further
+ // away from the desired key. There is opportunity for optimization here.
+ if (num_skipped > max_skip_ && !reseek_done) {
+ is_key_seqnum_zero_ = false;
+ num_skipped = 0;
+ reseek_done = true;
+ std::string last_key;
+ if (skipping_saved_key) {
+ // We're looking for the next user-key but all we see is the same
+ // user-key with decreasing sequence numbers. Fast forward to
+ // sequence number 0 and type deletion (the smallest type).
+ AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetUserKey(),
+ 0, kTypeDeletion));
+ // Don't set skipping_saved_key = false because we may still see more
+ // user-keys equal to saved_key_.
+ } else {
+ // We saw multiple entries with this user key and sequence numbers
+ // higher than sequence_. Fast forward to sequence_.
+ // Note that this only covers the case where this key was overwritten
+ // many times since our snapshot was taken, not the case where a lot of
+ // different keys were inserted after our snapshot was taken.
+ AppendInternalKey(&last_key,
+ ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
+ kValueTypeForSeek));
+ }
+ iter_.Seek(last_key);
+ RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+ } else {
+ iter_.Next();
+ }
+ } while (iter_.Valid());
+
+ valid_ = false;
+ return iter_.status().ok();
+}
+
+// Merge values of the same user key starting from the current iter_ position
+// Scan from the newer entries to older entries.
+// PRE: iter_.key() points to the first merge type entry
+// saved_key_ stores the user key
+// POST: saved_value_ has the merged value for the user key
+// iter_ points to the next entry (or invalid)
+bool DBIter::MergeValuesNewToOld() {
+ if (!merge_operator_) {
+ ROCKS_LOG_ERROR(logger_, "Options::merge_operator is null.");
+ status_ = Status::InvalidArgument("merge_operator_ must be set.");
+ valid_ = false;
+ return false;
+ }
+
+ // Temporarily pin the blocks that hold merge operands
+ TempPinData();
+ merge_context_.Clear();
+ // Start the merge process by pushing the first operand
+ merge_context_.PushOperand(
+ iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+ TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:PushedFirstOperand");
+
+ ParsedInternalKey ikey;
+ Status s;
+ for (iter_.Next(); iter_.Valid(); iter_.Next()) {
+ TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:SteppedToNextOperand");
+ if (!ParseKey(&ikey)) {
+ return false;
+ }
+
+ if (!user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) {
+ // hit the next user key, stop right here
+ break;
+ } else if (kTypeDeletion == ikey.type || kTypeSingleDeletion == ikey.type ||
+ range_del_agg_.ShouldDelete(
+ ikey, RangeDelPositioningMode::kForwardTraversal)) {
+ // hit a delete with the same user key, stop right here
+ // iter_ is positioned after delete
+ iter_.Next();
+ break;
+ } else if (kTypeValue == ikey.type) {
+ // hit a put, merge the put value with operands and store the
+ // final result in saved_value_. We are done!
+ const Slice val = iter_.value();
+ s = MergeHelper::TimedFullMerge(
+ merge_operator_, ikey.user_key, &val, merge_context_.GetOperands(),
+ &saved_value_, logger_, statistics_, env_, &pinned_value_, true);
+ if (!s.ok()) {
+ valid_ = false;
+ status_ = s;
+ return false;
+ }
+ // iter_ is positioned after put
+ iter_.Next();
+ if (!iter_.status().ok()) {
+ valid_ = false;
+ return false;
+ }
+ return true;
+ } else if (kTypeMerge == ikey.type) {
+ // hit a merge, add the value as an operand and run associative merge.
+ // when complete, add result to operands and continue.
+ merge_context_.PushOperand(
+ iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+ PERF_COUNTER_ADD(internal_merge_count, 1);
+ } else if (kTypeBlobIndex == ikey.type) {
+ if (!allow_blob_) {
+ ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index.");
+ status_ = Status::NotSupported(
+ "Encounter unexpected blob index. Please open DB with "
+ "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
+ } else {
+ status_ =
+ Status::NotSupported("Blob DB does not support merge operator.");
+ }
+ valid_ = false;
+ return false;
+ } else {
+ assert(false);
+ }
+ }
+
+ if (!iter_.status().ok()) {
+ valid_ = false;
+ return false;
+ }
+
+ // We either exhausted all internal keys under this user key, or hit
+ // a deletion marker.
+ // Feed null as the existing value to the merge operator, so that the
+ // client can differentiate this scenario and act accordingly.
+ s = MergeHelper::TimedFullMerge(merge_operator_, saved_key_.GetUserKey(),
+ nullptr, merge_context_.GetOperands(),
+ &saved_value_, logger_, statistics_, env_,
+ &pinned_value_, true);
+ if (!s.ok()) {
+ valid_ = false;
+ status_ = s;
+ return false;
+ }
+
+ assert(status_.ok());
+ return true;
+}
+
+void DBIter::Prev() {
+ assert(valid_);
+ assert(status_.ok());
+
+ PERF_CPU_TIMER_GUARD(iter_prev_cpu_nanos, env_);
+ ReleaseTempPinnedData();
+ ResetInternalKeysSkippedCounter();
+ bool ok = true;
+ if (direction_ == kForward) {
+ if (!ReverseToBackward()) {
+ ok = false;
+ }
+ }
+ if (ok) {
+ Slice prefix;
+ if (prefix_same_as_start_) {
+ assert(prefix_extractor_ != nullptr);
+ prefix = prefix_.GetUserKey();
+ }
+ PrevInternal(prefix_same_as_start_ ? &prefix : nullptr);
+ }
+
+ if (statistics_ != nullptr) {
+ local_stats_.prev_count_++;
+ if (valid_) {
+ local_stats_.prev_found_count_++;
+ local_stats_.bytes_read_ += (key().size() + value().size());
+ }
+ }
+}
+
+bool DBIter::ReverseToForward() {
+ assert(iter_.status().ok());
+
+ // When moving backwards, iter_ is positioned on the _previous_ key, which
+ // may not exist or may have a different prefix than the current key().
+ // If that's the case, seek iter_ to the current key.
+ if (!expect_total_order_inner_iter() || !iter_.Valid()) {
+ IterKey last_key;
+ last_key.SetInternalKey(ParsedInternalKey(
+ saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek));
+ iter_.Seek(last_key.GetInternalKey());
+ }
+
+ direction_ = kForward;
+ // Skip keys less than the current key() (a.k.a. saved_key_).
+ while (iter_.Valid()) {
+ ParsedInternalKey ikey;
+ if (!ParseKey(&ikey)) {
+ return false;
+ }
+ if (user_comparator_.Compare(ikey.user_key, saved_key_.GetUserKey()) >= 0) {
+ return true;
+ }
+ iter_.Next();
+ }
+
+ if (!iter_.status().ok()) {
+ valid_ = false;
+ return false;
+ }
+
+ return true;
+}
+
+// Move iter_ to the key before saved_key_.
+bool DBIter::ReverseToBackward() {
+ assert(iter_.status().ok());
+
+ // When current_entry_is_merged_ is true, iter_ may be positioned on the
+ // next key, which may not exist or may have a prefix different from the
+ // current one. If that's the case, seek to saved_key_.
+ if (current_entry_is_merged_ &&
+ (!expect_total_order_inner_iter() || !iter_.Valid())) {
+ IterKey last_key;
+ // Using kMaxSequenceNumber and kValueTypeForSeek
+ // (not kValueTypeForSeekForPrev) to seek to a key strictly smaller
+ // than saved_key_.
+ last_key.SetInternalKey(ParsedInternalKey(
+ saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek));
+ if (!expect_total_order_inner_iter()) {
+ iter_.SeekForPrev(last_key.GetInternalKey());
+ } else {
+ // Some iterators may not support SeekForPrev(), so we avoid using it
+ // when prefix seek mode is disabled. This is somewhat expensive
+ // (an extra Prev(), as well as an extra change of direction of iter_),
+ // so we may need to reconsider it later.
+ iter_.Seek(last_key.GetInternalKey());
+ if (!iter_.Valid() && iter_.status().ok()) {
+ iter_.SeekToLast();
+ }
+ }
+ }
+
+ direction_ = kReverse;
+ return FindUserKeyBeforeSavedKey();
+}
+
+void DBIter::PrevInternal(const Slice* prefix) {
+ while (iter_.Valid()) {
+ saved_key_.SetUserKey(
+ ExtractUserKey(iter_.key()),
+ !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
+
+ assert(prefix == nullptr || prefix_extractor_ != nullptr);
+ if (prefix != nullptr &&
+ prefix_extractor_->Transform(saved_key_.GetUserKey())
+ .compare(*prefix) != 0) {
+ assert(prefix_same_as_start_);
+ // Current key does not have the same prefix as start
+ valid_ = false;
+ return;
+ }
+
+ assert(iterate_lower_bound_ == nullptr || iter_.MayBeOutOfLowerBound() ||
+ user_comparator_.Compare(saved_key_.GetUserKey(),
+ *iterate_lower_bound_) >= 0);
+ if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() &&
+ user_comparator_.Compare(saved_key_.GetUserKey(),
+ *iterate_lower_bound_) < 0) {
+ // We've iterated earlier than the user-specified lower bound.
+ valid_ = false;
+ return;
+ }
+
+ if (!FindValueForCurrentKey()) { // assigns valid_
+ return;
+ }
+
+ // Whether or not we found a value for current key, we need iter_ to end up
+ // on a smaller key.
+ if (!FindUserKeyBeforeSavedKey()) {
+ return;
+ }
+
+ if (valid_) {
+ // Found the value.
+ return;
+ }
+
+ if (TooManyInternalKeysSkipped(false)) {
+ return;
+ }
+ }
+
+ // We haven't found any key - iterator is not valid
+ valid_ = false;
+}
+
+// Used for backwards iteration.
+// Looks at the entries with user key saved_key_ and finds the most up-to-date
+// value for it, or executes a merge, or determines that the value was deleted.
+// Sets valid_ to true if the value is found and is ready to be presented to
+// the user through value().
+// Sets valid_ to false if the value was deleted, and we should try another key.
+// Returns false if an error occurred, and !status().ok() and !valid_.
+//
+// PRE: iter_ is positioned on the last entry with user key equal to saved_key_.
+// POST: iter_ is positioned on one of the entries equal to saved_key_, or on
+// the entry just before them, or on the entry just after them.
+bool DBIter::FindValueForCurrentKey() {
+ assert(iter_.Valid());
+ merge_context_.Clear();
+ current_entry_is_merged_ = false;
+ // last entry before merge (could be kTypeDeletion, kTypeSingleDeletion or
+ // kTypeValue)
+ ValueType last_not_merge_type = kTypeDeletion;
+ ValueType last_key_entry_type = kTypeDeletion;
+
+ // Temporarily pin blocks that hold (merge operands / the value)
+ ReleaseTempPinnedData();
+ TempPinData();
+ size_t num_skipped = 0;
+ while (iter_.Valid()) {
+ ParsedInternalKey ikey;
+ if (!ParseKey(&ikey)) {
+ return false;
+ }
+
+ if (!IsVisible(ikey.sequence) ||
+ !user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) {
+ break;
+ }
+ if (TooManyInternalKeysSkipped()) {
+ return false;
+ }
+
+ // This user key has lots of entries.
+ // We're going from old to new, and it's taking too long. Let's do a Seek()
+ // and go from new to old. This helps when a key was overwritten many times.
+ if (num_skipped >= max_skip_) {
+ return FindValueForCurrentKeyUsingSeek();
+ }
+
+ last_key_entry_type = ikey.type;
+ switch (last_key_entry_type) {
+ case kTypeValue:
+ case kTypeBlobIndex:
+ if (range_del_agg_.ShouldDelete(
+ ikey, RangeDelPositioningMode::kBackwardTraversal)) {
+ last_key_entry_type = kTypeRangeDeletion;
+ PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+ } else {
+ assert(iter_.iter()->IsValuePinned());
+ pinned_value_ = iter_.value();
+ }
+ merge_context_.Clear();
+ last_not_merge_type = last_key_entry_type;
+ break;
+ case kTypeDeletion:
+ case kTypeSingleDeletion:
+ merge_context_.Clear();
+ last_not_merge_type = last_key_entry_type;
+ PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+ break;
+ case kTypeMerge:
+ if (range_del_agg_.ShouldDelete(
+ ikey, RangeDelPositioningMode::kBackwardTraversal)) {
+ merge_context_.Clear();
+ last_key_entry_type = kTypeRangeDeletion;
+ last_not_merge_type = last_key_entry_type;
+ PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+ } else {
+ assert(merge_operator_ != nullptr);
+ merge_context_.PushOperandBack(
+ iter_.value(),
+ iter_.iter()->IsValuePinned() /* operand_pinned */);
+ PERF_COUNTER_ADD(internal_merge_count, 1);
+ }
+ break;
+ default:
+ assert(false);
+ }
+
+ PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+ iter_.Prev();
+ ++num_skipped;
+ }
+
+ if (!iter_.status().ok()) {
+ valid_ = false;
+ return false;
+ }
+
+ Status s;
+ is_blob_ = false;
+ switch (last_key_entry_type) {
+ case kTypeDeletion:
+ case kTypeSingleDeletion:
+ case kTypeRangeDeletion:
+ valid_ = false;
+ return true;
+ case kTypeMerge:
+ current_entry_is_merged_ = true;
+ if (last_not_merge_type == kTypeDeletion ||
+ last_not_merge_type == kTypeSingleDeletion ||
+ last_not_merge_type == kTypeRangeDeletion) {
+ s = MergeHelper::TimedFullMerge(
+ merge_operator_, saved_key_.GetUserKey(), nullptr,
+ merge_context_.GetOperands(), &saved_value_, logger_, statistics_,
+ env_, &pinned_value_, true);
+ } else if (last_not_merge_type == kTypeBlobIndex) {
+ if (!allow_blob_) {
+ ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index.");
+ status_ = Status::NotSupported(
+ "Encounter unexpected blob index. Please open DB with "
+ "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
+ } else {
+ status_ =
+ Status::NotSupported("Blob DB does not support merge operator.");
+ }
+ valid_ = false;
+ return false;
+ } else {
+ assert(last_not_merge_type == kTypeValue);
+ s = MergeHelper::TimedFullMerge(
+ merge_operator_, saved_key_.GetUserKey(), &pinned_value_,
+ merge_context_.GetOperands(), &saved_value_, logger_, statistics_,
+ env_, &pinned_value_, true);
+ }
+ break;
+ case kTypeValue:
+ // do nothing - we already have the value in pinned_value_
+ break;
+ case kTypeBlobIndex:
+ if (!allow_blob_) {
+ ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index.");
+ status_ = Status::NotSupported(
+ "Encounter unexpected blob index. Please open DB with "
+ "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
+ valid_ = false;
+ return false;
+ }
+ is_blob_ = true;
+ break;
+ default:
+ assert(false);
+ break;
+ }
+ if (!s.ok()) {
+ valid_ = false;
+ status_ = s;
+ return false;
+ }
+ valid_ = true;
+ return true;
+}
+
+// This function is used in FindValueForCurrentKey.
+// We use the Seek() function instead of Prev() to find the necessary value.
+// TODO: This is very similar to FindNextUserEntry() and MergeValuesNewToOld().
+// Would be nice to reuse some code.
+bool DBIter::FindValueForCurrentKeyUsingSeek() {
+ // FindValueForCurrentKey will enable pinning before calling
+ // FindValueForCurrentKeyUsingSeek()
+ assert(pinned_iters_mgr_.PinningEnabled());
+ std::string last_key;
+ AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetUserKey(),
+ sequence_, kValueTypeForSeek));
+ iter_.Seek(last_key);
+ RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+
+ // If a read_callback is present, the value we seek to may not be visible.
+ // Find the next value that's visible.
+ ParsedInternalKey ikey;
+ is_blob_ = false;
+ while (true) {
+ if (!iter_.Valid()) {
+ valid_ = false;
+ return iter_.status().ok();
+ }
+
+ if (!ParseKey(&ikey)) {
+ return false;
+ }
+ if (!user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) {
+ // No visible values for this key, even though FindValueForCurrentKey()
+ // has seen some. This is possible if we're using a tailing iterator, and
+ // the entries were discarded in a compaction.
+ valid_ = false;
+ return true;
+ }
+
+ if (IsVisible(ikey.sequence)) {
+ break;
+ }
+
+ iter_.Next();
+ }
+
+ if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion ||
+ range_del_agg_.ShouldDelete(
+ ikey, RangeDelPositioningMode::kBackwardTraversal)) {
+ valid_ = false;
+ return true;
+ }
+ if (ikey.type == kTypeBlobIndex && !allow_blob_) {
+ ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index.");
+ status_ = Status::NotSupported(
+ "Encounter unexpected blob index. Please open DB with "
+ "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
+ valid_ = false;
+ return false;
+ }
+ if (ikey.type == kTypeValue || ikey.type == kTypeBlobIndex) {
+ assert(iter_.iter()->IsValuePinned());
+ pinned_value_ = iter_.value();
+ is_blob_ = (ikey.type == kTypeBlobIndex);
+ valid_ = true;
+ return true;
+ }
+
+ // kTypeMerge. We need to collect all kTypeMerge values and save them
+ // in operands
+ assert(ikey.type == kTypeMerge);
+ current_entry_is_merged_ = true;
+ merge_context_.Clear();
+ merge_context_.PushOperand(
+ iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+ while (true) {
+ iter_.Next();
+
+ if (!iter_.Valid()) {
+ if (!iter_.status().ok()) {
+ valid_ = false;
+ return false;
+ }
+ break;
+ }
+ if (!ParseKey(&ikey)) {
+ return false;
+ }
+ if (!user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) {
+ break;
+ }
+
+ if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion ||
+ range_del_agg_.ShouldDelete(
+ ikey, RangeDelPositioningMode::kForwardTraversal)) {
+ break;
+ } else if (ikey.type == kTypeValue) {
+ const Slice val = iter_.value();
+ Status s = MergeHelper::TimedFullMerge(
+ merge_operator_, saved_key_.GetUserKey(), &val,
+ merge_context_.GetOperands(), &saved_value_, logger_, statistics_,
+ env_, &pinned_value_, true);
+ if (!s.ok()) {
+ valid_ = false;
+ status_ = s;
+ return false;
+ }
+ valid_ = true;
+ return true;
+ } else if (ikey.type == kTypeMerge) {
+ merge_context_.PushOperand(
+ iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+ PERF_COUNTER_ADD(internal_merge_count, 1);
+ } else if (ikey.type == kTypeBlobIndex) {
+ if (!allow_blob_) {
+ ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index.");
+ status_ = Status::NotSupported(
+ "Encounter unexpected blob index. Please open DB with "
+ "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
+ } else {
+ status_ =
+ Status::NotSupported("Blob DB does not support merge operator.");
+ }
+ valid_ = false;
+ return false;
+ } else {
+ assert(false);
+ }
+ }
+
+ Status s = MergeHelper::TimedFullMerge(
+ merge_operator_, saved_key_.GetUserKey(), nullptr,
+ merge_context_.GetOperands(), &saved_value_, logger_, statistics_, env_,
+ &pinned_value_, true);
+ if (!s.ok()) {
+ valid_ = false;
+ status_ = s;
+ return false;
+ }
+
+ // Make sure we leave iter_ in a good state. If it's valid and we don't care
+ // about prefixes, that's already good enough. Otherwise it needs to be
+ // seeked to the current key.
+ if (!expect_total_order_inner_iter() || !iter_.Valid()) {
+ if (!expect_total_order_inner_iter()) {
+ iter_.SeekForPrev(last_key);
+ } else {
+ iter_.Seek(last_key);
+ if (!iter_.Valid() && iter_.status().ok()) {
+ iter_.SeekToLast();
+ }
+ }
+ RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+ }
+
+ valid_ = true;
+ return true;
+}
+
+// Move backwards until we reach a key smaller than saved_key_.
+// Changes valid_ only if return value is false.
+bool DBIter::FindUserKeyBeforeSavedKey() {
+ assert(status_.ok());
+ size_t num_skipped = 0;
+ while (iter_.Valid()) {
+ ParsedInternalKey ikey;
+ if (!ParseKey(&ikey)) {
+ return false;
+ }
+
+ if (user_comparator_.Compare(ikey.user_key, saved_key_.GetUserKey()) < 0) {
+ return true;
+ }
+
+ if (TooManyInternalKeysSkipped()) {
+ return false;
+ }
+
+ assert(ikey.sequence != kMaxSequenceNumber);
+ if (!IsVisible(ikey.sequence)) {
+ PERF_COUNTER_ADD(internal_recent_skipped_count, 1);
+ } else {
+ PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+ }
+
+ if (num_skipped >= max_skip_) {
+ num_skipped = 0;
+ IterKey last_key;
+ last_key.SetInternalKey(ParsedInternalKey(
+ saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek));
+ // It would be more efficient to use SeekForPrev() here, but some
+ // iterators may not support it.
+ iter_.Seek(last_key.GetInternalKey());
+ RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+ if (!iter_.Valid()) {
+ break;
+ }
+ } else {
+ ++num_skipped;
+ }
+
+ iter_.Prev();
+ }
+
+ if (!iter_.status().ok()) {
+ valid_ = false;
+ return false;
+ }
+
+ return true;
+}
+
+bool DBIter::TooManyInternalKeysSkipped(bool increment) {
+ if ((max_skippable_internal_keys_ > 0) &&
+ (num_internal_keys_skipped_ > max_skippable_internal_keys_)) {
+ valid_ = false;
+ status_ = Status::Incomplete("Too many internal keys skipped.");
+ return true;
+ } else if (increment) {
+ num_internal_keys_skipped_++;
+ }
+ return false;
+}
+
+bool DBIter::IsVisible(SequenceNumber sequence) {
+ if (read_callback_ == nullptr) {
+ return sequence <= sequence_;
+ } else {
+ return read_callback_->IsVisible(sequence);
+ }
+}
+
+void DBIter::SetSavedKeyToSeekTarget(const Slice& target) {
+ is_key_seqnum_zero_ = false;
+ SequenceNumber seq = sequence_;
+ saved_key_.Clear();
+ saved_key_.SetInternalKey(target, seq);
+
+ if (iterate_lower_bound_ != nullptr &&
+ user_comparator_.Compare(saved_key_.GetUserKey(), *iterate_lower_bound_) <
+ 0) {
+ // Seek key is smaller than the lower bound.
+ saved_key_.Clear();
+ saved_key_.SetInternalKey(*iterate_lower_bound_, seq);
+ }
+}
+
+void DBIter::SetSavedKeyToSeekForPrevTarget(const Slice& target) {
+ is_key_seqnum_zero_ = false;
+ saved_key_.Clear();
+ // Now saved_key_ is used to store the internal key.
+ saved_key_.SetInternalKey(target, 0 /* sequence_number */,
+ kValueTypeForSeekForPrev);
+
+ if (iterate_upper_bound_ != nullptr &&
+ user_comparator_.Compare(saved_key_.GetUserKey(),
+ *iterate_upper_bound_) >= 0) {
+ saved_key_.Clear();
+ saved_key_.SetInternalKey(*iterate_upper_bound_, kMaxSequenceNumber);
+ }
+}
+
+void DBIter::Seek(const Slice& target) {
+ PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_);
+ StopWatch sw(env_, statistics_, DB_SEEK);
+
+#ifndef ROCKSDB_LITE
+ if (db_impl_ != nullptr && cfd_ != nullptr) {
+ db_impl_->TraceIteratorSeek(cfd_->GetID(), target);
+ }
+#endif // ROCKSDB_LITE
+
+ status_ = Status::OK();
+ ReleaseTempPinnedData();
+ ResetInternalKeysSkippedCounter();
+
+ // Seek the inner iterator based on the target key.
+ {
+ PERF_TIMER_GUARD(seek_internal_seek_time);
+
+ SetSavedKeyToSeekTarget(target);
+ iter_.Seek(saved_key_.GetInternalKey());
+
+ range_del_agg_.InvalidateRangeDelMapPositions();
+ RecordTick(statistics_, NUMBER_DB_SEEK);
+ }
+ if (!iter_.Valid()) {
+ valid_ = false;
+ return;
+ }
+ direction_ = kForward;
+
+ // Now the inner iterator is placed at the target position. From there,
+ // we need to find the next key that is visible to the user.
+ ClearSavedValue();
+ if (prefix_same_as_start_) {
+ // The case where the iterator needs to be invalidated if it has exhausted
+ // keys within the same prefix as the seek key.
+ assert(prefix_extractor_ != nullptr);
+ Slice target_prefix = prefix_extractor_->Transform(target);
+ FindNextUserEntry(false /* not skipping saved_key */,
+ &target_prefix /* prefix */);
+ if (valid_) {
+ // Remember the prefix of the seek key for the future Next() call to
+ // check.
+ prefix_.SetUserKey(target_prefix);
+ }
+ } else {
+ FindNextUserEntry(false /* not skipping saved_key */, nullptr);
+ }
+ if (!valid_) {
+ return;
+ }
+
+ // Updating stats and perf context counters.
+ if (statistics_ != nullptr) {
+ // The seek landed on a visible entry; record it as found.
+ RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+ RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+ }
+ PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
+}
+
+void DBIter::SeekForPrev(const Slice& target) {
+ PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_);
+ StopWatch sw(env_, statistics_, DB_SEEK);
+
+#ifndef ROCKSDB_LITE
+ if (db_impl_ != nullptr && cfd_ != nullptr) {
+ db_impl_->TraceIteratorSeekForPrev(cfd_->GetID(), target);
+ }
+#endif // ROCKSDB_LITE
+
+ status_ = Status::OK();
+ ReleaseTempPinnedData();
+ ResetInternalKeysSkippedCounter();
+
+ // Seek the inner iterator based on the target key.
+ {
+ PERF_TIMER_GUARD(seek_internal_seek_time);
+ SetSavedKeyToSeekForPrevTarget(target);
+ iter_.SeekForPrev(saved_key_.GetInternalKey());
+ range_del_agg_.InvalidateRangeDelMapPositions();
+ RecordTick(statistics_, NUMBER_DB_SEEK);
+ }
+ if (!iter_.Valid()) {
+ valid_ = false;
+ return;
+ }
+ direction_ = kReverse;
+
+ // Now the inner iterator is placed at the target position. From there,
+ // we need to find the first key that is visible to the user in the
+ // backward direction.
+ ClearSavedValue();
+ if (prefix_same_as_start_) {
+ // The case where the iterator needs to be invalidated if it has exhausted
+ // keys within the same prefix as the seek key.
+ assert(prefix_extractor_ != nullptr);
+ Slice target_prefix = prefix_extractor_->Transform(target);
+ PrevInternal(&target_prefix);
+ if (valid_) {
+ // Remember the prefix of the seek key for the future Prev() call to
+ // check.
+ prefix_.SetUserKey(target_prefix);
+ }
+ } else {
+ PrevInternal(nullptr);
+ }
+
+ // Report stats and perf context.
+ if (statistics_ != nullptr && valid_) {
+ RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+ RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+ PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
+ }
+}
+
+void DBIter::SeekToFirst() {
+ if (iterate_lower_bound_ != nullptr) {
+ Seek(*iterate_lower_bound_);
+ return;
+ }
+ PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_);
+ // Disable the max_skip_-based reseek optimization if a prefix extractor
+ // is set, because prefix seek will be used.
+ if (!expect_total_order_inner_iter()) {
+ max_skip_ = std::numeric_limits<uint64_t>::max();
+ }
+ status_ = Status::OK();
+ direction_ = kForward;
+ ReleaseTempPinnedData();
+ ResetInternalKeysSkippedCounter();
+ ClearSavedValue();
+ is_key_seqnum_zero_ = false;
+
+ {
+ PERF_TIMER_GUARD(seek_internal_seek_time);
+ iter_.SeekToFirst();
+ range_del_agg_.InvalidateRangeDelMapPositions();
+ }
+
+ RecordTick(statistics_, NUMBER_DB_SEEK);
+ if (iter_.Valid()) {
+ saved_key_.SetUserKey(
+ ExtractUserKey(iter_.key()),
+ !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
+ FindNextUserEntry(false /* not skipping saved_key */,
+ nullptr /* no prefix check */);
+ if (statistics_ != nullptr) {
+ if (valid_) {
+ RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+ RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+ PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
+ }
+ }
+ } else {
+ valid_ = false;
+ }
+ if (valid_ && prefix_same_as_start_) {
+ assert(prefix_extractor_ != nullptr);
+ prefix_.SetUserKey(prefix_extractor_->Transform(saved_key_.GetUserKey()));
+ }
+}
+
+void DBIter::SeekToLast() {
+ if (iterate_upper_bound_ != nullptr) {
+ // Seek to last key strictly less than ReadOptions.iterate_upper_bound.
+ SeekForPrev(*iterate_upper_bound_);
+ if (Valid() && user_comparator_.Equal(*iterate_upper_bound_, key())) {
+ ReleaseTempPinnedData();
+ PrevInternal(nullptr);
+ }
+ return;
+ }
+
+ PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_);
+  // Don't use the skip-based reseek optimization (iter_.Seek()) if a prefix
+  // extractor is set, because prefix seek will be used.
+ if (!expect_total_order_inner_iter()) {
+ max_skip_ = std::numeric_limits<uint64_t>::max();
+ }
+ status_ = Status::OK();
+ direction_ = kReverse;
+ ReleaseTempPinnedData();
+ ResetInternalKeysSkippedCounter();
+ ClearSavedValue();
+ is_key_seqnum_zero_ = false;
+
+ {
+ PERF_TIMER_GUARD(seek_internal_seek_time);
+ iter_.SeekToLast();
+ range_del_agg_.InvalidateRangeDelMapPositions();
+ }
+ PrevInternal(nullptr);
+ if (statistics_ != nullptr) {
+ RecordTick(statistics_, NUMBER_DB_SEEK);
+ if (valid_) {
+ RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+ RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+ PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
+ }
+ }
+ if (valid_ && prefix_same_as_start_) {
+ assert(prefix_extractor_ != nullptr);
+ prefix_.SetUserKey(prefix_extractor_->Transform(saved_key_.GetUserKey()));
+ }
+}
+
+Iterator* NewDBIterator(Env* env, const ReadOptions& read_options,
+ const ImmutableCFOptions& cf_options,
+ const MutableCFOptions& mutable_cf_options,
+ const Comparator* user_key_comparator,
+ InternalIterator* internal_iter,
+ const SequenceNumber& sequence,
+ uint64_t max_sequential_skip_in_iterations,
+ ReadCallback* read_callback, DBImpl* db_impl,
+ ColumnFamilyData* cfd, bool allow_blob) {
+ DBIter* db_iter = new DBIter(
+ env, read_options, cf_options, mutable_cf_options, user_key_comparator,
+ internal_iter, sequence, false, max_sequential_skip_in_iterations,
+ read_callback, db_impl, cfd, allow_blob);
+ return db_iter;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_iter.h b/src/rocksdb/db/db_iter.h
new file mode 100644
index 000000000..32704e4d5
--- /dev/null
+++ b/src/rocksdb/db/db_iter.h
@@ -0,0 +1,344 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+#include <string>
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/range_del_aggregator.h"
+#include "memory/arena.h"
+#include "options/cf_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "table/iterator_wrapper.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This file declares the factory function of DBIter, as well as class DBIter
+// itself, whose implementation lives in db_iter.cc. A wrapped form, class
+// ArenaWrappedDBIter, is declared in arena_wrapped_db_iter.h.
+// DBIter is an iterator that converts internal keys (yielded by an
+// InternalIterator) that were live at the specified sequence number into
+// appropriate user keys.
+// Each internal key consists of a user key, a sequence number, and a value
+// type. DBIter deals with multiple key versions, tombstones, merge operands,
+// etc., and exposes an Iterator.
+// For example, DBIter may wrap the following InternalIterator:
+// user key: AAA value: v3 seqno: 100 type: Put
+// user key: AAA value: v2 seqno: 97 type: Put
+// user key: AAA value: v1 seqno: 95 type: Put
+// user key: BBB value: v1 seqno: 90 type: Put
+// user key: BBC value: N/A seqno: 98 type: Delete
+// user key: BBC value: v1 seqno: 95 type: Put
+// If the snapshot passed in is 102, then the DBIter is expected to
+// expose the following iterator:
+// key: AAA value: v3
+// key: BBB value: v1
+// If the snapshot passed in is 96, then it should expose:
+// key: AAA value: v1
+// key: BBB value: v1
+// key: BBC value: v1
+//
+
+// Memtables and sstables that make up the DB representation contain
+// (userkey,seq,type) => uservalue entries. DBIter
+// combines multiple entries for the same userkey found in the DB
+// representation into a single entry while accounting for sequence
+// numbers, deletion markers, overwrites, etc.
+class DBIter final : public Iterator {
+ public:
+ // The following is grossly complicated. TODO: clean it up
+ // Which direction is the iterator currently moving?
+ // (1) When moving forward:
+ // (1a) if current_entry_is_merged_ = false, the internal iterator is
+ // positioned at the exact entry that yields this->key(), this->value()
+ // (1b) if current_entry_is_merged_ = true, the internal iterator is
+ // positioned immediately after the last entry that contributed to the
+ // current this->value(). That entry may or may not have key equal to
+ // this->key().
+ // (2) When moving backwards, the internal iterator is positioned
+ // just before all entries whose user key == this->key().
+ enum Direction { kForward, kReverse };
+
+ // LocalStatistics contain Statistics counters that will be aggregated per
+ // each iterator instance and then will be sent to the global statistics when
+ // the iterator is destroyed.
+ //
+  // The purpose of this approach is to avoid the perf regression that
+  // happens when multiple threads bump the atomic counters from
+  // DBIter::Next().
+ struct LocalStatistics {
+ explicit LocalStatistics() { ResetCounters(); }
+
+ void ResetCounters() {
+ next_count_ = 0;
+ next_found_count_ = 0;
+ prev_count_ = 0;
+ prev_found_count_ = 0;
+ bytes_read_ = 0;
+ skip_count_ = 0;
+ }
+
+ void BumpGlobalStatistics(Statistics* global_statistics) {
+ RecordTick(global_statistics, NUMBER_DB_NEXT, next_count_);
+ RecordTick(global_statistics, NUMBER_DB_NEXT_FOUND, next_found_count_);
+ RecordTick(global_statistics, NUMBER_DB_PREV, prev_count_);
+ RecordTick(global_statistics, NUMBER_DB_PREV_FOUND, prev_found_count_);
+ RecordTick(global_statistics, ITER_BYTES_READ, bytes_read_);
+ RecordTick(global_statistics, NUMBER_ITER_SKIP, skip_count_);
+ PERF_COUNTER_ADD(iter_read_bytes, bytes_read_);
+ ResetCounters();
+ }
+
+ // Map to Tickers::NUMBER_DB_NEXT
+ uint64_t next_count_;
+ // Map to Tickers::NUMBER_DB_NEXT_FOUND
+ uint64_t next_found_count_;
+ // Map to Tickers::NUMBER_DB_PREV
+ uint64_t prev_count_;
+ // Map to Tickers::NUMBER_DB_PREV_FOUND
+ uint64_t prev_found_count_;
+ // Map to Tickers::ITER_BYTES_READ
+ uint64_t bytes_read_;
+ // Map to Tickers::NUMBER_ITER_SKIP
+ uint64_t skip_count_;
+ };
+
+ DBIter(Env* _env, const ReadOptions& read_options,
+ const ImmutableCFOptions& cf_options,
+ const MutableCFOptions& mutable_cf_options, const Comparator* cmp,
+ InternalIterator* iter, SequenceNumber s, bool arena_mode,
+ uint64_t max_sequential_skip_in_iterations,
+ ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd,
+ bool allow_blob);
+
+ // No copying allowed
+ DBIter(const DBIter&) = delete;
+ void operator=(const DBIter&) = delete;
+
+ ~DBIter() override {
+ // Release pinned data if any
+ if (pinned_iters_mgr_.PinningEnabled()) {
+ pinned_iters_mgr_.ReleasePinnedData();
+ }
+ RecordTick(statistics_, NO_ITERATOR_DELETED);
+ ResetInternalKeysSkippedCounter();
+ local_stats_.BumpGlobalStatistics(statistics_);
+ iter_.DeleteIter(arena_mode_);
+ }
+ void SetIter(InternalIterator* iter) {
+ assert(iter_.iter() == nullptr);
+ iter_.Set(iter);
+ iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_);
+ }
+ ReadRangeDelAggregator* GetRangeDelAggregator() { return &range_del_agg_; }
+
+ bool Valid() const override { return valid_; }
+ Slice key() const override {
+ assert(valid_);
+ if (start_seqnum_ > 0) {
+ return saved_key_.GetInternalKey();
+ } else {
+ return saved_key_.GetUserKey();
+ }
+ }
+ Slice value() const override {
+ assert(valid_);
+ if (current_entry_is_merged_) {
+ // If pinned_value_ is set then the result of merge operator is one of
+ // the merge operands and we should return it.
+ return pinned_value_.data() ? pinned_value_ : saved_value_;
+ } else if (direction_ == kReverse) {
+ return pinned_value_;
+ } else {
+ return iter_.value();
+ }
+ }
+ Status status() const override {
+ if (status_.ok()) {
+ return iter_.status();
+ } else {
+ assert(!valid_);
+ return status_;
+ }
+ }
+ bool IsBlob() const {
+ assert(valid_ && (allow_blob_ || !is_blob_));
+ return is_blob_;
+ }
+
+ Status GetProperty(std::string prop_name, std::string* prop) override;
+
+ void Next() final override;
+ void Prev() final override;
+ void Seek(const Slice& target) final override;
+ void SeekForPrev(const Slice& target) final override;
+ void SeekToFirst() final override;
+ void SeekToLast() final override;
+ Env* env() const { return env_; }
+ void set_sequence(uint64_t s) {
+ sequence_ = s;
+ if (read_callback_) {
+ read_callback_->Refresh(s);
+ }
+ }
+ void set_valid(bool v) { valid_ = v; }
+
+ private:
+ // For all methods in this block:
+ // PRE: iter_->Valid() && status_.ok()
+  // Return false if there was an error; in that case status() is non-ok and
+  // valid_ is false. Callers would usually stop what they were doing and
+  // return.
+ bool ReverseToForward();
+ bool ReverseToBackward();
+  // Set saved_key_ to the target seek key, with the proper sequence number
+  // set. It might get adjusted if the seek key is smaller than the
+  // iterator's lower bound.
+ void SetSavedKeyToSeekTarget(const Slice& target);
+  // Set saved_key_ to the target seek key, with the proper sequence number
+  // set. It might get adjusted if the seek key is larger than the iterator's
+  // upper bound.
+ void SetSavedKeyToSeekForPrevTarget(const Slice& target);
+ bool FindValueForCurrentKey();
+ bool FindValueForCurrentKeyUsingSeek();
+ bool FindUserKeyBeforeSavedKey();
+ // If `skipping_saved_key` is true, the function will keep iterating until it
+ // finds a user key that is larger than `saved_key_`.
+  // If `prefix` is not null, the iterator stops once all keys for the prefix
+  // are exhausted, and the iterator is set to invalid.
+ bool FindNextUserEntry(bool skipping_saved_key, const Slice* prefix);
+ // Internal implementation of FindNextUserEntry().
+ bool FindNextUserEntryInternal(bool skipping_saved_key, const Slice* prefix);
+ bool ParseKey(ParsedInternalKey* key);
+ bool MergeValuesNewToOld();
+
+ // If prefix is not null, we need to set the iterator to invalid if no more
+ // entry can be found within the prefix.
+ void PrevInternal(const Slice* prefix);
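+  // Returns true once more than max_skippable_internal_keys_ internal keys
+  // have been skipped (the iterator is then invalidated with a non-ok
+  // status); otherwise bumps num_internal_keys_skipped_ unless `increment`
+  // is false.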
+ bool TooManyInternalKeysSkipped(bool increment = true);
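+  // Whether an internal entry with the given sequence number is visible at
+  // this iterator's sequence_ (and, if read_callback_ is set, also accepted
+  // by it).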
+ bool IsVisible(SequenceNumber sequence);
+
+ // Temporarily pin the blocks that we encounter until ReleaseTempPinnedData()
+ // is called
+ void TempPinData() {
+ if (!pin_thru_lifetime_) {
+ pinned_iters_mgr_.StartPinning();
+ }
+ }
+
+ // Release blocks pinned by TempPinData()
+ void ReleaseTempPinnedData() {
+ if (!pin_thru_lifetime_ && pinned_iters_mgr_.PinningEnabled()) {
+ pinned_iters_mgr_.ReleasePinnedData();
+ }
+ }
+
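+  // Reset saved_value_. If its buffer has grown beyond ~1 MiB (1048576
+  // bytes), release the memory entirely instead of keeping the capacity
+  // around for reuse.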
+ inline void ClearSavedValue() {
+ if (saved_value_.capacity() > 1048576) {
+ std::string empty;
+ swap(empty, saved_value_);
+ } else {
+ saved_value_.clear();
+ }
+ }
+
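+  // Fold num_internal_keys_skipped_ into the locally aggregated skip count
+  // and reset it. The currently valid key, if any, is not counted as
+  // skipped.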
+ inline void ResetInternalKeysSkippedCounter() {
+ local_stats_.skip_count_ += num_internal_keys_skipped_;
+ if (valid_) {
+ local_stats_.skip_count_--;
+ }
+ num_internal_keys_skipped_ = 0;
+ }
+
+ bool expect_total_order_inner_iter() {
+ assert(expect_total_order_inner_iter_ || prefix_extractor_ != nullptr);
+ return expect_total_order_inner_iter_;
+ }
+
+ const SliceTransform* prefix_extractor_;
+ Env* const env_;
+ Logger* logger_;
+ UserComparatorWrapper user_comparator_;
+ const MergeOperator* const merge_operator_;
+ IteratorWrapper iter_;
+ ReadCallback* read_callback_;
+ // Max visible sequence number. It is normally the snapshot seq unless we have
+ // uncommitted data in db as in WriteUnCommitted.
+ SequenceNumber sequence_;
+
+ IterKey saved_key_;
+  // Reusable internal key data structure. This is only used inside one
+  // function and should not be used across functions. Reusing this object
+  // avoids the overhead of constructing it on every call.
+ ParsedInternalKey ikey_;
+ std::string saved_value_;
+ Slice pinned_value_;
+ // for prefix seek mode to support prev()
+ Statistics* statistics_;
+ uint64_t max_skip_;
+ uint64_t max_skippable_internal_keys_;
+ uint64_t num_internal_keys_skipped_;
+ const Slice* iterate_lower_bound_;
+ const Slice* iterate_upper_bound_;
+
+ // The prefix of the seek key. It is only used when prefix_same_as_start_
+ // is true and prefix extractor is not null. In Next() or Prev(), current keys
+ // will be checked against this prefix, so that the iterator can be
+  // invalidated if the keys in this prefix have been exhausted. Set it using
+  // SetUserKey() and read it using GetUserKey().
+ IterKey prefix_;
+
+ Status status_;
+ Direction direction_;
+ bool valid_;
+ bool current_entry_is_merged_;
+ // True if we know that the current entry's seqnum is 0.
+  // This tells us that the next entry will be for a different user key.
+ bool is_key_seqnum_zero_;
+ const bool prefix_same_as_start_;
+  // Means that we will pin all data blocks we read as long as the Iterator
+  // is not deleted; will be true if ReadOptions::pin_data is true.
+ const bool pin_thru_lifetime_;
+ // Expect the inner iterator to maintain a total order.
+ // prefix_extractor_ must be non-NULL if the value is false.
+ const bool expect_total_order_inner_iter_;
+ bool allow_blob_;
+ bool is_blob_;
+ bool arena_mode_;
+ // List of operands for merge operator.
+ MergeContext merge_context_;
+ ReadRangeDelAggregator range_del_agg_;
+ LocalStatistics local_stats_;
+ PinnedIteratorsManager pinned_iters_mgr_;
+#ifdef ROCKSDB_LITE
+ ROCKSDB_FIELD_UNUSED
+#endif
+ DBImpl* db_impl_;
+#ifdef ROCKSDB_LITE
+ ROCKSDB_FIELD_UNUSED
+#endif
+ ColumnFamilyData* cfd_;
+  // For diff snapshots we want a lower bound on the seqnum;
+  // if this value is > 0, the iterator will return internal keys.
+ SequenceNumber start_seqnum_;
+};
+
+// Return a new iterator that converts internal keys (yielded by
+// "*internal_iter") that were live at the specified `sequence` number
+// into appropriate user keys.
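+//
+// A minimal usage sketch (illustrative only; it mirrors how the unit tests in
+// db_iter_test.cc drive this factory, and assumes `options` and a
+// heap-allocated `internal_iter` already exist):
+//
+//   ReadOptions ro;
+//   std::unique_ptr<Iterator> db_iter(NewDBIterator(
+//       Env::Default(), ro, ImmutableCFOptions(options),
+//       MutableCFOptions(options), BytewiseComparator(), internal_iter,
+//       10 /* sequence */, options.max_sequential_skip_in_iterations,
+//       nullptr /* read_callback */));
+//   for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+//     // db_iter->key()/value() expose only the newest version of each user
+//     // key that is visible at the given sequence number.
+//   }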
+extern Iterator* NewDBIterator(
+ Env* env, const ReadOptions& read_options,
+ const ImmutableCFOptions& cf_options,
+ const MutableCFOptions& mutable_cf_options,
+ const Comparator* user_key_comparator, InternalIterator* internal_iter,
+ const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations,
+ ReadCallback* read_callback, DBImpl* db_impl = nullptr,
+ ColumnFamilyData* cfd = nullptr, bool allow_blob = false);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_iter_stress_test.cc b/src/rocksdb/db/db_iter_stress_test.cc
new file mode 100644
index 000000000..57cd9866e
--- /dev/null
+++ b/src/rocksdb/db/db_iter_stress_test.cc
@@ -0,0 +1,654 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "test_util/testharness.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+#ifdef GFLAGS
+
+#include "util/gflags_compat.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+DEFINE_bool(verbose, false,
+ "Print huge, detailed trace. Intended for debugging failures.");
+
+#else
+
+void ParseCommandLineFlags(int*, char***, bool) {}
+bool FLAGS_verbose = false;
+
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBIteratorStressTest : public testing::Test {
+ public:
+ Env* env_;
+
+ DBIteratorStressTest() : env_(Env::Default()) {}
+};
+
+namespace {
+
+struct Entry {
+ std::string key;
+ ValueType type; // kTypeValue, kTypeDeletion, kTypeMerge
+ uint64_t sequence;
+ std::string ikey; // internal key, made from `key`, `sequence` and `type`
+ std::string value;
+ // If false, we'll pretend that this entry doesn't exist.
+ bool visible = true;
+
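+  // Order by user key ascending and, within the same user key, by
+  // (sequence, type) descending, matching the DB's internal key ordering.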
+ bool operator<(const Entry& e) const {
+ if (key != e.key) return key < e.key;
+ return std::tie(sequence, type) > std::tie(e.sequence, e.type);
+ }
+};
+
+struct Data {
+ std::vector<Entry> entries;
+
+ // Indices in `entries` with `visible` = false.
+ std::vector<size_t> hidden;
+ // Keys of entries whose `visible` changed since the last seek of iterators.
+ std::set<std::string> recently_touched_keys;
+};
+
+struct StressTestIterator : public InternalIterator {
+ Data* data;
+ Random64* rnd;
+ InternalKeyComparator cmp;
+
+  // Each operation will return an error with this probability...
+ double error_probability = 0;
+ // ... and add/remove entries with this probability.
+ double mutation_probability = 0;
+  // The probability of hiding vs unhiding entries will be chosen so that the
+  // fraction of hidden entries stays somewhat close to this number.
+ double target_hidden_fraction = 0;
+ // If true, print all mutations to stdout for debugging.
+ bool trace = false;
+
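+  // Index of the current entry in data->entries; any out-of-range value
+  // (e.g. -1) means the iterator is invalid.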
+ int iter = -1;
+ Status status_;
+
+ StressTestIterator(Data* _data, Random64* _rnd, const Comparator* _cmp)
+ : data(_data), rnd(_rnd), cmp(_cmp) {}
+
+ bool Valid() const override {
+ if (iter >= 0 && iter < (int)data->entries.size()) {
+ assert(status_.ok());
+ return true;
+ }
+ return false;
+ }
+
+ Status status() const override { return status_; }
+
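+  // With probability error_probability, put the iterator into an error state
+  // (randomly Incomplete or IOError) and return true so the caller bails out
+  // of the current operation.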
+ bool MaybeFail() {
+ if (rnd->Next() >=
+ std::numeric_limits<uint64_t>::max() * error_probability) {
+ return false;
+ }
+ if (rnd->Next() % 2) {
+ status_ = Status::Incomplete("test");
+ } else {
+ status_ = Status::IOError("test");
+ }
+ if (trace) {
+ std::cout << "injecting " << status_.ToString() << std::endl;
+ }
+ iter = -1;
+ return true;
+ }
+
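+  // With probability mutation_probability, hide and/or unhide a few random
+  // entries (3 on average), steering the hidden fraction toward
+  // target_hidden_fraction and recording the keys that were touched.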
+ void MaybeMutate() {
+ if (rnd->Next() >=
+ std::numeric_limits<uint64_t>::max() * mutation_probability) {
+ return;
+ }
+ do {
+      // If too many entries are hidden, hide less; otherwise hide more.
+ double hide_probability =
+ data->hidden.size() > data->entries.size() * target_hidden_fraction
+ ? 1. / 3
+ : 2. / 3;
+ if (data->hidden.empty()) {
+ hide_probability = 1;
+ }
+ bool do_hide =
+ rnd->Next() < std::numeric_limits<uint64_t>::max() * hide_probability;
+ if (do_hide) {
+ // Hide a random entry.
+ size_t idx = rnd->Next() % data->entries.size();
+ Entry& e = data->entries[idx];
+ if (e.visible) {
+ if (trace) {
+ std::cout << "hiding idx " << idx << std::endl;
+ }
+ e.visible = false;
+ data->hidden.push_back(idx);
+ data->recently_touched_keys.insert(e.key);
+ } else {
+ // Already hidden. Let's go unhide something instead, just because
+ // it's easy and it doesn't really matter what we do.
+ do_hide = false;
+ }
+ }
+ if (!do_hide) {
+ // Unhide a random entry.
+ size_t hi = rnd->Next() % data->hidden.size();
+ size_t idx = data->hidden[hi];
+ if (trace) {
+ std::cout << "unhiding idx " << idx << std::endl;
+ }
+ Entry& e = data->entries[idx];
+ assert(!e.visible);
+ e.visible = true;
+ data->hidden[hi] = data->hidden.back();
+ data->hidden.pop_back();
+ data->recently_touched_keys.insert(e.key);
+ }
+ } while (rnd->Next() % 3 != 0); // do 3 mutations on average
+ }
+
+ void SkipForward() {
+ while (iter < (int)data->entries.size() && !data->entries[iter].visible) {
+ ++iter;
+ }
+ }
+ void SkipBackward() {
+ while (iter >= 0 && !data->entries[iter].visible) {
+ --iter;
+ }
+ }
+
+ void SeekToFirst() override {
+ if (MaybeFail()) return;
+ MaybeMutate();
+
+ status_ = Status::OK();
+ iter = 0;
+ SkipForward();
+ }
+ void SeekToLast() override {
+ if (MaybeFail()) return;
+ MaybeMutate();
+
+ status_ = Status::OK();
+ iter = (int)data->entries.size() - 1;
+ SkipBackward();
+ }
+
+ void Seek(const Slice& target) override {
+ if (MaybeFail()) return;
+ MaybeMutate();
+
+ status_ = Status::OK();
+ // Binary search.
+ auto it = std::partition_point(
+ data->entries.begin(), data->entries.end(),
+ [&](const Entry& e) { return cmp.Compare(e.ikey, target) < 0; });
+ iter = (int)(it - data->entries.begin());
+ SkipForward();
+ }
+ void SeekForPrev(const Slice& target) override {
+ if (MaybeFail()) return;
+ MaybeMutate();
+
+ status_ = Status::OK();
+ // Binary search.
+ auto it = std::partition_point(
+ data->entries.begin(), data->entries.end(),
+ [&](const Entry& e) { return cmp.Compare(e.ikey, target) <= 0; });
+ iter = (int)(it - data->entries.begin());
+ --iter;
+ SkipBackward();
+ }
+
+ void Next() override {
+ assert(Valid());
+ if (MaybeFail()) return;
+ MaybeMutate();
+ ++iter;
+ SkipForward();
+ }
+ void Prev() override {
+ assert(Valid());
+ if (MaybeFail()) return;
+ MaybeMutate();
+ --iter;
+ SkipBackward();
+ }
+
+ Slice key() const override {
+ assert(Valid());
+ return data->entries[iter].ikey;
+ }
+ Slice value() const override {
+ assert(Valid());
+ return data->entries[iter].value;
+ }
+
+ bool IsKeyPinned() const override { return true; }
+ bool IsValuePinned() const override { return true; }
+};
+
+// A small reimplementation of DBIter, supporting only some of the features,
+// and doing everything in O(log n).
+// Skips all keys that are in recently_touched_keys.
+struct ReferenceIterator {
+ Data* data;
+  uint64_t sequence;  // ignore entries with sequence number above this
+
+ bool valid = false;
+ std::string key;
+ std::string value;
+
+ ReferenceIterator(Data* _data, uint64_t _sequence)
+ : data(_data), sequence(_sequence) {}
+
+ bool Valid() const { return valid; }
+
+ // Finds the first entry with key
+ // greater/less/greater-or-equal/less-or-equal than `key`, depending on
+ // arguments: if `skip`, inequality is strict; if `forward`, it's
+ // greater/greater-or-equal, otherwise less/less-or-equal.
+ // Sets `key` to the result.
+ // If no such key exists, returns false. Doesn't check `visible`.
+ bool FindNextKey(bool skip, bool forward) {
+ valid = false;
+ auto it = std::partition_point(data->entries.begin(), data->entries.end(),
+ [&](const Entry& e) {
+ if (forward != skip) {
+ return e.key < key;
+ } else {
+ return e.key <= key;
+ }
+ });
+ if (forward) {
+ if (it != data->entries.end()) {
+ key = it->key;
+ return true;
+ }
+ } else {
+ if (it != data->entries.begin()) {
+ --it;
+ key = it->key;
+ return true;
+ }
+ }
+ return false;
+ }
+
+ bool FindValueForCurrentKey() {
+ if (data->recently_touched_keys.count(key)) {
+ return false;
+ }
+
+ // Find the first entry for the key. The caller promises that it exists.
+ auto it = std::partition_point(data->entries.begin(), data->entries.end(),
+ [&](const Entry& e) {
+ if (e.key != key) {
+ return e.key < key;
+ }
+ return e.sequence > sequence;
+ });
+
+ // Find the first visible entry.
+ for (;; ++it) {
+ if (it == data->entries.end()) {
+ return false;
+ }
+ Entry& e = *it;
+ if (e.key != key) {
+ return false;
+ }
+ assert(e.sequence <= sequence);
+ if (!e.visible) continue;
+ if (e.type == kTypeDeletion) {
+ return false;
+ }
+ if (e.type == kTypeValue) {
+ value = e.value;
+ valid = true;
+ return true;
+ }
+ assert(e.type == kTypeMerge);
+ break;
+ }
+
+ // Collect merge operands.
+ std::vector<Slice> operands;
+ for (; it != data->entries.end(); ++it) {
+ Entry& e = *it;
+ if (e.key != key) {
+ break;
+ }
+ assert(e.sequence <= sequence);
+ if (!e.visible) continue;
+ if (e.type == kTypeDeletion) {
+ break;
+ }
+ operands.push_back(e.value);
+ if (e.type == kTypeValue) {
+ break;
+ }
+ }
+
+ // Do a merge.
+ value = operands.back().ToString();
+ for (int i = (int)operands.size() - 2; i >= 0; --i) {
+ value.append(",");
+ value.append(operands[i].data(), operands[i].size());
+ }
+
+ valid = true;
+ return true;
+ }
+
+ // Start at `key` and move until we encounter a valid value.
+ // `forward` defines the direction of movement.
+ // If `skip` is true, we're looking for key not equal to `key`.
+ void DoTheThing(bool skip, bool forward) {
+ while (FindNextKey(skip, forward) && !FindValueForCurrentKey()) {
+ skip = true;
+ }
+ }
+
+ void Seek(const Slice& target) {
+ key = target.ToString();
+ DoTheThing(false, true);
+ }
+ void SeekForPrev(const Slice& target) {
+ key = target.ToString();
+ DoTheThing(false, false);
+ }
+ void SeekToFirst() { Seek(""); }
+ void SeekToLast() {
+ key = data->entries.back().key;
+ DoTheThing(false, false);
+ }
+ void Next() {
+ assert(Valid());
+ DoTheThing(true, true);
+ }
+ void Prev() {
+ assert(Valid());
+ DoTheThing(true, false);
+ }
+};
+
+} // namespace
+
+// Use an internal iterator that sometimes returns errors and sometimes
+// adds/removes entries on the fly. Do random operations on a DBIter and
+// check results.
+// TODO: can be improved for more coverage:
+// * Override IsKeyPinned() and IsValuePinned() to actually use
+//   PinnedIteratorsManager and check that there's no use-after-free.
+// * Try different combinations of prefix_extractor, total_order_seek,
+// prefix_same_as_start, iterate_lower_bound, iterate_upper_bound.
+TEST_F(DBIteratorStressTest, StressTest) {
+ // We use a deterministic RNG, and everything happens in a single thread.
+ Random64 rnd(826909345792864532ll);
+
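+  // Generates a random key in [0, max_key), zero-padded to the decimal width
+  // of max_key so that lexicographic order matches numeric order.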
+ auto gen_key = [&](int max_key) {
+ assert(max_key > 0);
+ int len = 0;
+ int a = max_key;
+ while (a) {
+ a /= 10;
+ ++len;
+ }
+ std::string s = ToString(rnd.Next() % static_cast<uint64_t>(max_key));
+ s.insert(0, len - (int)s.size(), '0');
+ return s;
+ };
+
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+ ReadOptions ropt;
+
+ size_t num_matching = 0;
+ size_t num_at_end = 0;
+ size_t num_not_ok = 0;
+ size_t num_recently_removed = 0;
+
+ // Number of iterations for each combination of parameters
+ // (there are ~250 of those).
+ // Tweak this to change the test run time.
+  // As of the time of writing, the test takes ~4 seconds for a value of 5000.
+ const int num_iterations = 5000;
+ // Enable this to print all the operations for debugging.
+ bool trace = FLAGS_verbose;
+
+ for (int num_entries : {5, 10, 100}) {
+ for (double key_space : {0.1, 1.0, 3.0}) {
+ for (ValueType prevalent_entry_type :
+ {kTypeValue, kTypeDeletion, kTypeMerge}) {
+ for (double error_probability : {0.01, 0.1}) {
+ for (double mutation_probability : {0.01, 0.5}) {
+ for (double target_hidden_fraction : {0.1, 0.5}) {
+ std::string trace_str =
+ "entries: " + ToString(num_entries) +
+ ", key_space: " + ToString(key_space) +
+ ", error_probability: " + ToString(error_probability) +
+ ", mutation_probability: " + ToString(mutation_probability) +
+ ", target_hidden_fraction: " +
+ ToString(target_hidden_fraction);
+ SCOPED_TRACE(trace_str);
+ if (trace) {
+ std::cout << trace_str << std::endl;
+ }
+
+ // Generate data.
+ Data data;
+ int max_key = (int)(num_entries * key_space) + 1;
+ for (int i = 0; i < num_entries; ++i) {
+ Entry e;
+ e.key = gen_key(max_key);
+ if (rnd.Next() % 10 != 0) {
+ e.type = prevalent_entry_type;
+ } else {
+ const ValueType types[] = {kTypeValue, kTypeDeletion,
+ kTypeMerge};
+ e.type =
+ types[rnd.Next() % (sizeof(types) / sizeof(types[0]))];
+ }
+ e.sequence = i;
+ e.value = "v" + ToString(i);
+ ParsedInternalKey internal_key(e.key, e.sequence, e.type);
+ AppendInternalKey(&e.ikey, internal_key);
+
+ data.entries.push_back(e);
+ }
+ std::sort(data.entries.begin(), data.entries.end());
+ if (trace) {
+ std::cout << "entries:";
+ for (size_t i = 0; i < data.entries.size(); ++i) {
+ Entry& e = data.entries[i];
+ std::cout
+ << "\n idx " << i << ": \"" << e.key << "\": \""
+ << e.value << "\" seq: " << e.sequence << " type: "
+ << (e.type == kTypeValue
+ ? "val"
+ : e.type == kTypeDeletion ? "del" : "merge");
+ }
+ std::cout << std::endl;
+ }
+
+ std::unique_ptr<Iterator> db_iter;
+ std::unique_ptr<ReferenceIterator> ref_iter;
+ for (int iteration = 0; iteration < num_iterations; ++iteration) {
+ SCOPED_TRACE(iteration);
+ // Create a new iterator every ~30 operations.
+ if (db_iter == nullptr || rnd.Next() % 30 == 0) {
+ uint64_t sequence = rnd.Next() % (data.entries.size() + 2);
+ ref_iter.reset(new ReferenceIterator(&data, sequence));
+ if (trace) {
+ std::cout << "new iterator, seq: " << sequence << std::endl;
+ }
+
+ auto internal_iter =
+ new StressTestIterator(&data, &rnd, BytewiseComparator());
+ internal_iter->error_probability = error_probability;
+ internal_iter->mutation_probability = mutation_probability;
+ internal_iter->target_hidden_fraction =
+ target_hidden_fraction;
+ internal_iter->trace = trace;
+ db_iter.reset(NewDBIterator(
+ env_, ropt, ImmutableCFOptions(options),
+ MutableCFOptions(options), BytewiseComparator(),
+ internal_iter, sequence,
+ options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ }
+
+            // Do a random operation. It's important to do it on ref_iter
+            // after doing it on db_iter, to make sure ref_iter sees the
+            // correct recently_touched_keys.
+ std::string old_key;
+ bool forward = rnd.Next() % 2 > 0;
+ // Do Next()/Prev() ~90% of the time.
+ bool seek = !ref_iter->Valid() || rnd.Next() % 10 == 0;
+ if (trace) {
+ std::cout << iteration << ": ";
+ }
+
+ if (!seek) {
+ assert(db_iter->Valid());
+ old_key = ref_iter->key;
+ if (trace) {
+ std::cout << (forward ? "Next" : "Prev") << std::endl;
+ }
+
+ if (forward) {
+ db_iter->Next();
+ ref_iter->Next();
+ } else {
+ db_iter->Prev();
+ ref_iter->Prev();
+ }
+ } else {
+ data.recently_touched_keys.clear();
+ // Do SeekToFirst less often than Seek.
+ if (rnd.Next() % 4 == 0) {
+ if (trace) {
+ std::cout << (forward ? "SeekToFirst" : "SeekToLast")
+ << std::endl;
+ }
+
+ if (forward) {
+ old_key = "";
+ db_iter->SeekToFirst();
+ ref_iter->SeekToFirst();
+ } else {
+ old_key = data.entries.back().key;
+ db_iter->SeekToLast();
+ ref_iter->SeekToLast();
+ }
+ } else {
+ old_key = gen_key(max_key);
+ if (trace) {
+ std::cout << (forward ? "Seek" : "SeekForPrev") << " \""
+ << old_key << '"' << std::endl;
+ }
+ if (forward) {
+ db_iter->Seek(old_key);
+ ref_iter->Seek(old_key);
+ } else {
+ db_iter->SeekForPrev(old_key);
+ ref_iter->SeekForPrev(old_key);
+ }
+ }
+ }
+
+ // Check the result.
+ if (db_iter->Valid()) {
+ ASSERT_TRUE(db_iter->status().ok());
+ if (data.recently_touched_keys.count(
+ db_iter->key().ToString())) {
+ // Ended on a key that may have been mutated during the
+ // operation. Reference iterator skips such keys, so we
+ // can't check the exact result.
+
+ // Check that the key moved in the right direction.
+ if (forward) {
+ if (seek)
+ ASSERT_GE(db_iter->key().ToString(), old_key);
+ else
+ ASSERT_GT(db_iter->key().ToString(), old_key);
+ } else {
+ if (seek)
+ ASSERT_LE(db_iter->key().ToString(), old_key);
+ else
+ ASSERT_LT(db_iter->key().ToString(), old_key);
+ }
+
+ if (ref_iter->Valid()) {
+ // Check that DBIter didn't miss any non-mutated key.
+ if (forward) {
+ ASSERT_LT(db_iter->key().ToString(), ref_iter->key);
+ } else {
+ ASSERT_GT(db_iter->key().ToString(), ref_iter->key);
+ }
+ }
+ // Tell the next iteration of the loop to reseek the
+ // iterators.
+ ref_iter->valid = false;
+
+ ++num_recently_removed;
+ } else {
+ ASSERT_TRUE(ref_iter->Valid());
+ ASSERT_EQ(ref_iter->key, db_iter->key().ToString());
+ ASSERT_EQ(ref_iter->value, db_iter->value());
+ ++num_matching;
+ }
+ } else if (db_iter->status().ok()) {
+ ASSERT_FALSE(ref_iter->Valid());
+ ++num_at_end;
+ } else {
+ // Non-ok status. Nothing to check here.
+ // Tell the next iteration of the loop to reseek the
+ // iterators.
+ ref_iter->valid = false;
+ ++num_not_ok;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // Check that all cases were hit many times.
+ EXPECT_GT(num_matching, 10000);
+ EXPECT_GT(num_at_end, 10000);
+ EXPECT_GT(num_not_ok, 10000);
+ EXPECT_GT(num_recently_removed, 10000);
+
+ std::cout << "stats:\n exact matches: " << num_matching
+ << "\n end reached: " << num_at_end
+ << "\n non-ok status: " << num_not_ok
+ << "\n mutated on the fly: " << num_recently_removed << std::endl;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ ParseCommandLineFlags(&argc, &argv, true);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_iter_test.cc b/src/rocksdb/db/db_iter_test.cc
new file mode 100644
index 000000000..ddbea8d17
--- /dev/null
+++ b/src/rocksdb/db/db_iter_test.cc
@@ -0,0 +1,3175 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <string>
+#include <vector>
+#include <algorithm>
+#include <utility>
+
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include "table/iterator_wrapper.h"
+#include "table/merging_iterator.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static uint64_t TestGetTickerCount(const Options& options,
+ Tickers ticker_type) {
+ return options.statistics->getTickerCount(ticker_type);
+}
+
+class TestIterator : public InternalIterator {
+ public:
+ explicit TestIterator(const Comparator* comparator)
+ : initialized_(false),
+ valid_(false),
+ sequence_number_(0),
+ iter_(0),
+ cmp(comparator) {
+ data_.reserve(16);
+ }
+
+ void AddPut(std::string argkey, std::string argvalue) {
+ Add(argkey, kTypeValue, argvalue);
+ }
+
+ void AddDeletion(std::string argkey) {
+ Add(argkey, kTypeDeletion, std::string());
+ }
+
+ void AddSingleDeletion(std::string argkey) {
+ Add(argkey, kTypeSingleDeletion, std::string());
+ }
+
+ void AddMerge(std::string argkey, std::string argvalue) {
+ Add(argkey, kTypeMerge, argvalue);
+ }
+
+ void Add(std::string argkey, ValueType type, std::string argvalue) {
+ Add(argkey, type, argvalue, sequence_number_++);
+ }
+
+ void Add(std::string argkey, ValueType type, std::string argvalue,
+ size_t seq_num, bool update_iter = false) {
+ valid_ = true;
+ ParsedInternalKey internal_key(argkey, seq_num, type);
+ data_.push_back(
+ std::pair<std::string, std::string>(std::string(), argvalue));
+ AppendInternalKey(&data_.back().first, internal_key);
+ if (update_iter && valid_ && cmp.Compare(data_.back().first, key()) < 0) {
+ // insert a key smaller than current key
+ Finish();
+      // data_[iter_] is no longer the current element of the iterator.
+      // Increment iter_ so that it points at the current element again.
+ iter_++;
+ }
+ }
+
+  // Should be called before any operations on the iterator.
+ void Finish() {
+ initialized_ = true;
+ std::sort(data_.begin(), data_.end(),
+ [this](std::pair<std::string, std::string> a,
+ std::pair<std::string, std::string> b) {
+ return (cmp.Compare(a.first, b.first) < 0);
+ });
+ }
+
+ // Removes the key from the set of keys over which this iterator iterates.
+ // Not to be confused with AddDeletion().
+ // If the iterator is currently positioned on this key, the deletion will
+ // apply next time the iterator moves.
+ // Used for simulating ForwardIterator updating to a new version that doesn't
+ // have some of the keys (e.g. after compaction with a filter).
+ void Vanish(std::string _key) {
+ if (valid_ && data_[iter_].first == _key) {
+ delete_current_ = true;
+ return;
+ }
+ for (auto it = data_.begin(); it != data_.end(); ++it) {
+ ParsedInternalKey ikey;
+ bool ok __attribute__((__unused__)) = ParseInternalKey(it->first, &ikey);
+ assert(ok);
+ if (ikey.user_key != _key) {
+ continue;
+ }
+ if (valid_ && data_.begin() + iter_ > it) {
+ --iter_;
+ }
+ data_.erase(it);
+ return;
+ }
+ assert(false);
+ }
+
+ // Number of operations done on this iterator since construction.
+ size_t steps() const { return steps_; }
+
+ bool Valid() const override {
+ assert(initialized_);
+ return valid_;
+ }
+
+ void SeekToFirst() override {
+ assert(initialized_);
+ ++steps_;
+ DeleteCurrentIfNeeded();
+ valid_ = (data_.size() > 0);
+ iter_ = 0;
+ }
+
+ void SeekToLast() override {
+ assert(initialized_);
+ ++steps_;
+ DeleteCurrentIfNeeded();
+ valid_ = (data_.size() > 0);
+ iter_ = data_.size() - 1;
+ }
+
+ void Seek(const Slice& target) override {
+ assert(initialized_);
+ SeekToFirst();
+ ++steps_;
+ if (!valid_) {
+ return;
+ }
+ while (iter_ < data_.size() &&
+ (cmp.Compare(data_[iter_].first, target) < 0)) {
+ ++iter_;
+ }
+
+ if (iter_ == data_.size()) {
+ valid_ = false;
+ }
+ }
+
+ void SeekForPrev(const Slice& target) override {
+ assert(initialized_);
+ DeleteCurrentIfNeeded();
+ SeekForPrevImpl(target, &cmp);
+ }
+
+ void Next() override {
+ assert(initialized_);
+ assert(valid_);
+ assert(iter_ < data_.size());
+
+ ++steps_;
+ if (delete_current_) {
+ DeleteCurrentIfNeeded();
+ } else {
+ ++iter_;
+ }
+ valid_ = iter_ < data_.size();
+ }
+
+ void Prev() override {
+ assert(initialized_);
+ assert(valid_);
+ assert(iter_ < data_.size());
+
+ ++steps_;
+ DeleteCurrentIfNeeded();
+ if (iter_ == 0) {
+ valid_ = false;
+ } else {
+ --iter_;
+ }
+ }
+
+ Slice key() const override {
+ assert(initialized_);
+ return data_[iter_].first;
+ }
+
+ Slice value() const override {
+ assert(initialized_);
+ return data_[iter_].second;
+ }
+
+ Status status() const override {
+ assert(initialized_);
+ return Status::OK();
+ }
+
+ bool IsKeyPinned() const override { return true; }
+ bool IsValuePinned() const override { return true; }
+
+ private:
+ bool initialized_;
+ bool valid_;
+ size_t sequence_number_;
+ size_t iter_;
+ size_t steps_ = 0;
+
+ InternalKeyComparator cmp;
+ std::vector<std::pair<std::string, std::string>> data_;
+ bool delete_current_ = false;
+
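+  // Applies a deferred Vanish() of the entry the iterator was positioned on,
+  // if one is pending.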
+ void DeleteCurrentIfNeeded() {
+ if (!delete_current_) {
+ return;
+ }
+ data_.erase(data_.begin() + iter_);
+ delete_current_ = false;
+ }
+};
+
+class DBIteratorTest : public testing::Test {
+ public:
+ Env* env_;
+
+ DBIteratorTest() : env_(Env::Default()) {}
+};
+
+TEST_F(DBIteratorTest, DBIteratorPrevNext) {
+ Options options;
+ ImmutableCFOptions cf_options = ImmutableCFOptions(options);
+ MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddDeletion("a");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddPut("a", "val_a");
+
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+ // Test to check the SeekToLast() with iterate_upper_bound not set
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ }
+
+ // Test to check the SeekToLast() with iterate_upper_bound set
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("d", "val_d");
+ internal_iter->AddPut("e", "val_e");
+ internal_iter->AddPut("f", "val_f");
+ internal_iter->Finish();
+
+ Slice prefix("d");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ }
+  // Test to check SeekToLast() with iterate_upper_bound set to a key that
+  // has not been Put yet
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("d", "val_d");
+ internal_iter->Finish();
+
+ Slice prefix("z");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ }
+ // Test to check the SeekToLast() with iterate_upper_bound set to the
+ // first key
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->Finish();
+
+ Slice prefix("a");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+ // Test case to check SeekToLast with iterate_upper_bound set
+  // (same key put many times - SeekToLast should start with the
+ // maximum sequence id of the upper bound)
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ Slice prefix("c");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 7, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ SetPerfLevel(kEnableCount);
+ ASSERT_TRUE(GetPerfLevel() == kEnableCount);
+
+ get_perf_context()->Reset();
+ db_iter->SeekToLast();
+
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(static_cast<int>(get_perf_context()->internal_key_skipped_count), 1);
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+
+ SetPerfLevel(kDisable);
+ }
+ // Test to check the SeekToLast() with the iterate_upper_bound set
+ // (Checking the value of the key which has sequence ids greater than
+  // and less than the iterator's sequence id)
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+
+ internal_iter->AddPut("a", "val_a1");
+ internal_iter->AddPut("a", "val_a2");
+ internal_iter->AddPut("b", "val_b1");
+ internal_iter->AddPut("c", "val_c1");
+ internal_iter->AddPut("c", "val_c2");
+ internal_iter->AddPut("c", "val_c3");
+ internal_iter->AddPut("b", "val_b2");
+ internal_iter->AddPut("d", "val_d1");
+ internal_iter->Finish();
+
+ Slice prefix("c");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 4, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val_b1");
+ }
+
+ // Test to check the SeekToLast() with the iterate_upper_bound set to the
+ // key that is deleted
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ Slice prefix("a");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+ // Test to check the SeekToLast() with the iterate_upper_bound set
+ // (Deletion cases)
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ Slice prefix("c");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ }
+ // Test to check the SeekToLast() with iterate_upper_bound set
+  // (Deletion cases - Lots of internal keys after the upper_bound
+  // are deleted)
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddDeletion("c");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddDeletion("e");
+ internal_iter->AddDeletion("f");
+ internal_iter->AddDeletion("g");
+ internal_iter->AddDeletion("h");
+ internal_iter->Finish();
+
+ Slice prefix("c");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 7, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ SetPerfLevel(kEnableCount);
+ ASSERT_TRUE(GetPerfLevel() == kEnableCount);
+
+ get_perf_context()->Reset();
+ db_iter->SeekToLast();
+
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(static_cast<int>(get_perf_context()->internal_delete_skipped_count), 0);
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+
+ SetPerfLevel(kDisable);
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddDeletion("a");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddPut("a", "val_a");
+
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 2, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val_b");
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("a", "val_a");
+
+ internal_iter->AddPut("b", "val_b");
+
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+ }
+}
+
+TEST_F(DBIteratorTest, DBIteratorEmpty) {
+ Options options;
+ ImmutableCFOptions cf_options = ImmutableCFOptions(options);
+ MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+ ReadOptions ro;
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 0, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 0, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+}
+
+TEST_F(DBIteratorTest, DBIteratorUseSkipCountSkips) {
+ ReadOptions ro;
+ Options options;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ for (size_t i = 0; i < 200; ++i) {
+ internal_iter->AddPut("a", "a");
+ internal_iter->AddPut("b", "b");
+ internal_iter->AddPut("c", "c");
+ }
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 2,
+ options.max_sequential_skip_in_iterations, nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "c");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1u);
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "b");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2u);
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "a");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3u);
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3u);
+}
+
+TEST_F(DBIteratorTest, DBIteratorUseSkip) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+ ImmutableCFOptions cf_options = ImmutableCFOptions(options);
+ MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+
+ {
+ for (size_t i = 0; i < 200; ++i) {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("b", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ for (size_t k = 0; k < 200; ++k) {
+ internal_iter->AddPut("c", ToString(k));
+ }
+ internal_iter->Finish();
+
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, i + 2, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), ToString(i));
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+ db_iter->Prev();
+
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+ }
+
+ {
+ for (size_t i = 0; i < 200; ++i) {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("b", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ for (size_t k = 0; k < 200; ++k) {
+ internal_iter->AddDeletion("c");
+ }
+ internal_iter->AddPut("c", "200");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, i + 2, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+ db_iter->Prev();
+
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("b", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ for (size_t i = 0; i < 200; ++i) {
+ internal_iter->AddDeletion("c");
+ }
+ internal_iter->AddPut("c", "200");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 202, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "200");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+ db_iter->Prev();
+
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+ }
+
+ {
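+ // With no other keys present, snapshots at sequence i never reach the final
+ // put of "c", so the iterator is empty in both directions; the follow-up
+ // case at sequence 200 does see the put and returns exactly one entry.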
+ for (size_t i = 0; i < 200; ++i) {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ for (size_t k = 0; k < 200; ++k) {
+ internal_iter->AddDeletion("c");
+ }
+ internal_iter->AddPut("c", "200");
+ internal_iter->Finish();
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, i, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(!db_iter->Valid());
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ for (size_t i = 0; i < 200; ++i) {
+ internal_iter->AddDeletion("c");
+ }
+ internal_iter->AddPut("c", "200");
+ internal_iter->Finish();
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 200, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "200");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "200");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
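+ // Every put of "c" is newer than the snapshot (i + 2), so SeekToLast()
+ // returns the newest visible put of "d" (value ToString(i)) and reverse
+ // iteration then skips "c" entirely, landing on the merges for "b" and "a".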
+ for (size_t i = 0; i < 200; ++i) {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("b", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ for (size_t k = 0; k < 200; ++k) {
+ internal_iter->AddPut("d", ToString(k));
+ }
+
+ for (size_t k = 0; k < 200; ++k) {
+ internal_iter->AddPut("c", ToString(k));
+ }
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, i + 2, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), ToString(i));
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+ db_iter->Prev();
+
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+ }
+
+ {
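+ // The stringappend merge operator folds the visible merge operands of "c"
+ // (0 through i) into one comma-separated value; the loop below rebuilds the
+ // expected string for comparison.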
+ for (size_t i = 0; i < 200; ++i) {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("b", "b");
+ internal_iter->AddMerge("a", "a");
+ for (size_t k = 0; k < 200; ++k) {
+ internal_iter->AddMerge("c", ToString(k));
+ }
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, i + 2, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ std::string merge_result = "0";
+ for (size_t j = 1; j <= i; ++j) {
+ merge_result += "," + ToString(j);
+ }
+ ASSERT_EQ(db_iter->value().ToString(), merge_result);
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "b");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "a");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+ }
+}
+
+TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
+ Options options;
+ ImmutableCFOptions cf_options = ImmutableCFOptions(options);
+ MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+ ReadOptions ro;
+
+ // Basic test case: make sure explicitly passing the default value works.
+ // Skipping internal keys is disabled by default, i.e. when the value is 0.
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddDeletion("c");
+ internal_iter->AddPut("d", "val_d");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = 0;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), "val_d");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().ok());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), "val_d");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().ok());
+ }
+
+ // Test to make sure that the request will *not* fail as incomplete if
+ // num_internal_keys_skipped is *equal* to max_skippable_internal_keys
+ // threshold. (It will fail as incomplete only when the threshold is
+ // exceeded.)
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = 2;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().ok());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().ok());
+ }
+
+ // Fail the request as incomplete when num_internal_keys_skipped >
+ // max_skippable_internal_keys
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = 2;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ }
+
+ // Test that the num_internal_keys_skipped counter resets after a successful
+ // read.
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddPut("e", "val_e");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = 2;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Next(); // num_internal_keys_skipped counter resets here.
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ }
+
+ // Test that the num_internal_keys_skipped counter resets after a successful
+ // read.
+ // Reverse direction
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddPut("e", "val_e");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = 2;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "e");
+ ASSERT_EQ(db_iter->value().ToString(), "val_e");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Prev(); // num_internal_keys_skipped counter resets here.
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ }
+
+ // Test that the limit is also enforced when the skipped internal keys
+ // belong to separate user keys.
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("c");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddPut("e", "val_e");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = 2;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "e");
+ ASSERT_EQ(db_iter->value().ToString(), "val_e");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ }
+
+ // Test if alternating puts and deletes of the same key are handled correctly.
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddDeletion("c");
+ internal_iter->AddPut("d", "val_d");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddPut("e", "val_e");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = 2;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "e");
+ ASSERT_EQ(db_iter->value().ToString(), "val_e");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ }
+
+ // Test for a large number of skippable internal keys with the *default*
+ // max_sequential_skip_in_iterations.
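+ // Which limit trips first decides the outcome: while the per-read threshold
+ // (i) stays within max_sequential_skip_in_iterations + 1, the internal
+ // entries for "b" are skipped one by one and the counter overflows, failing
+ // the read as Incomplete; once i exceeds that bound, the iterator reseeks
+ // past the hidden versions before the counter reaches i and the read
+ // succeeds.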
+ {
+ for (size_t i = 1; i <= 200; ++i) {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ for (size_t j = 1; j <= i; ++j) {
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddDeletion("b");
+ }
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = i;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 2 * i + 1, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ if ((options.max_sequential_skip_in_iterations + 1) >=
+ ro.max_skippable_internal_keys) {
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ } else {
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+ }
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Prev();
+ if ((options.max_sequential_skip_in_iterations + 1) >=
+ ro.max_skippable_internal_keys) {
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ } else {
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+ }
+ }
+ }
+
+ // Test for a large number of skippable internal keys with a *non-default*
+ // max_sequential_skip_in_iterations.
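+ // With the sequential-skip limit raised to 1000, the reseek never kicks in
+ // before the internal-key threshold (i <= 200) is exceeded, so every read
+ // ends as Incomplete in both directions.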
+ {
+ for (size_t i = 1; i <= 200; ++i) {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ for (size_t j = 1; j <= i; ++j) {
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddDeletion("b");
+ }
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ options.max_sequential_skip_in_iterations = 1000;
+ ro.max_skippable_internal_keys = i;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 2 * i + 1, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ }
+ }
+}
+
+TEST_F(DBIteratorTest, DBIterator1) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "0");
+ internal_iter->AddPut("b", "0");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("a", "1");
+ internal_iter->AddMerge("b", "2");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 1,
+ options.max_sequential_skip_in_iterations, nullptr /*read_callback*/));
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "0");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ db_iter->Next();
+ ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator2) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "0");
+ internal_iter->AddPut("b", "0");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("a", "1");
+ internal_iter->AddMerge("b", "2");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 0,
+ options.max_sequential_skip_in_iterations, nullptr /*read_callback*/));
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "0");
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator3) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "0");
+ internal_iter->AddPut("b", "0");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("a", "1");
+ internal_iter->AddMerge("b", "2");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 2,
+ options.max_sequential_skip_in_iterations, nullptr /*read_callback*/));
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "0");
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator4) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "0");
+ internal_iter->AddPut("b", "0");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("a", "1");
+ internal_iter->AddMerge("b", "2");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 4,
+ options.max_sequential_skip_in_iterations, nullptr /*read_callback*/));
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "0,1");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "2");
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator5) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+ ImmutableCFOptions cf_options = ImmutableCFOptions(options);
+ MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+
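+ // Each block below reads key "a" at an increasing snapshot sequence (0
+ // through 6); the expected value shows how merge operands are folded until
+ // a visible put resets the base value. A final block exercises a put
+ // followed by SingleDelete and further merges.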
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddPut("a", "put_1");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 0, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddPut("a", "put_1");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 1, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddPut("a", "put_1");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 2, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2,merge_3");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddPut("a", "put_1");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 3, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "put_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddPut("a", "put_1");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 4, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddPut("a", "put_1");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 5, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4,merge_5");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddPut("a", "put_1");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 6, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4,merge_5,merge_6");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ // put, singledelete, merge
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddSingleDeletion("a");
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->Finish();
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->Seek("b");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ }
+}
+
+TEST_F(DBIteratorTest, DBIterator6) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+ ImmutableCFOptions cf_options = ImmutableCFOptions(options);
+ MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+
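+ // Same layout as DBIterator5, but with a deletion in place of the put: at
+ // snapshot sequence 3 the deletion is the newest visible entry and the key
+ // disappears; above that, the later merges rebuild the value from scratch.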
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 0, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 1, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 2, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2,merge_3");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 3, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 4, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 5, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 6, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5,merge_6");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+}
+
+TEST_F(DBIteratorTest, DBIterator7) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+ ImmutableCFOptions cf_options = ImmutableCFOptions(options);
+ MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+
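+ // The blocks below replay the same mix of puts, merges and deletions on
+ // keys "a", "b" and "c" at snapshot sequences 0, 2, 4, 5, 6, 7, 9, 13 and
+ // 14, checking that reverse iteration folds merge operands correctly around
+ // each deletion.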
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 0, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 2, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val,merge_2");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 4, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 5, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4");
+ db_iter->Prev();
+
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 6, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 7, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 9, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_6,merge_7");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 13, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(),
+ "merge_6,merge_7,merge_8,merge_9,merge_10,merge_11");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 14, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(),
+ "merge_6,merge_7,merge_8,merge_9,merge_10,merge_11");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+}
+
+TEST_F(DBIteratorTest, DBIterator8) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddDeletion("a");
+ internal_iter->AddPut("a", "0");
+ internal_iter->AddPut("b", "0");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 10,
+ options.max_sequential_skip_in_iterations, nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "0");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "0");
+}
+
+ // TODO(3.13): fix the issue of Seek() then Prev(), which might not
+ // necessarily return the largest element smaller than the seek key.
+TEST_F(DBIteratorTest, DBIterator9) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("b", "merge_3");
+ internal_iter->AddMerge("b", "merge_4");
+ internal_iter->AddMerge("d", "merge_5");
+ internal_iter->AddMerge("d", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 10,
+ options.max_sequential_skip_in_iterations, nullptr /*read_callback*/));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+
+ db_iter->Seek("b");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2");
+
+ db_iter->SeekForPrev("b");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+
+ db_iter->Seek("c");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+
+ db_iter->SeekForPrev("c");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+ }
+}
+
+ // TODO(3.13): fix the issue of Seek() then Prev(), which might not
+ // necessarily return the largest element smaller than the seek key.
+TEST_F(DBIteratorTest, DBIterator10) {
+ ReadOptions ro;
+ Options options;
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "1");
+ internal_iter->AddPut("b", "2");
+ internal_iter->AddPut("c", "3");
+ internal_iter->AddPut("d", "4");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 10,
+ options.max_sequential_skip_in_iterations, nullptr /*read_callback*/));
+
+ db_iter->Seek("c");
+ ASSERT_TRUE(db_iter->Valid());
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "2");
+
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "3");
+
+ db_iter->SeekForPrev("c");
+ ASSERT_TRUE(db_iter->Valid());
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), "4");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "3");
+}
+
+TEST_F(DBIteratorTest, SeekToLastOccurrenceSeq0) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = nullptr;
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "1");
+ internal_iter->AddPut("b", "2");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 10, 0 /* force seek */,
+ nullptr /*read_callback*/));
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "1");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "2");
+ db_iter->Next();
+ ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator11) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "0");
+ internal_iter->AddPut("b", "0");
+ internal_iter->AddSingleDeletion("b");
+ internal_iter->AddMerge("a", "1");
+ internal_iter->AddMerge("b", "2");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 1,
+ options.max_sequential_skip_in_iterations, nullptr /*read_callback*/));
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "0");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ db_iter->Next();
+ ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator12) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = nullptr;
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "1");
+ internal_iter->AddPut("b", "2");
+ internal_iter->AddPut("c", "3");
+ internal_iter->AddSingleDeletion("b");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 10, 0, nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "3");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "1");
+ db_iter->Prev();
+ ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator13) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = nullptr;
+
+ std::string key;
+ key.resize(9);
+ key.assign(9, static_cast<char>(0));
+ key[0] = 'b';
+
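+ // The key is 'b' followed by eight zero bytes; with only sequence 2 visible
+ // and a sequential-skip limit of 3, Seek("b") must reseek past the newer
+ // versions of this key and still return value "2".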
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut(key, "0");
+ internal_iter->AddPut(key, "1");
+ internal_iter->AddPut(key, "2");
+ internal_iter->AddPut(key, "3");
+ internal_iter->AddPut(key, "4");
+ internal_iter->AddPut(key, "5");
+ internal_iter->AddPut(key, "6");
+ internal_iter->AddPut(key, "7");
+ internal_iter->AddPut(key, "8");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 2, 3, nullptr /*read_callback*/));
+ db_iter->Seek("b");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), key);
+ ASSERT_EQ(db_iter->value().ToString(), "2");
+}
+
+TEST_F(DBIteratorTest, DBIterator14) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = nullptr;
+
+ std::string key("b");
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("b", "0");
+ internal_iter->AddPut("b", "1");
+ internal_iter->AddPut("b", "2");
+ internal_iter->AddPut("b", "3");
+ internal_iter->AddPut("a", "4");
+ internal_iter->AddPut("a", "5");
+ internal_iter->AddPut("a", "6");
+ internal_iter->AddPut("c", "7");
+ internal_iter->AddPut("c", "8");
+ internal_iter->AddPut("c", "9");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 4, 1, nullptr /*read_callback*/));
+ db_iter->Seek("b");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "3");
+ db_iter->SeekToFirst();
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "4");
+}
+
+TEST_F(DBIteratorTest, DBIteratorTestDifferentialSnapshots) {
+ { // Test that KVs written earlier than iter_start_seqnum are filtered out
+ ReadOptions ro;
+ ro.iter_start_seqnum=5;
+ Options options;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
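+ // Each key gets three versions; with iter_start_seqnum = 5 and the iterator
+ // built at sequence 13, only keys whose newest visible version has a
+ // sequence number >= 5 come back, and they are returned as full internal
+ // keys (ParseFullKey below recovers the type and sequence).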
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ for (size_t i = 0; i < 10; ++i) {
+ internal_iter->AddPut(std::to_string(i), std::to_string(i) + "a");
+ internal_iter->AddPut(std::to_string(i), std::to_string(i) + "b");
+ internal_iter->AddPut(std::to_string(i), std::to_string(i) + "c");
+ }
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 13,
+ options.max_sequential_skip_in_iterations, nullptr));
+ // Expecting internal keys with sequence numbers 5, 8, 11 and 13 and the
+ // correct types
+    int seqnums[4] = {5, 8, 11, 13};
+    std::string user_keys[4] = {"1", "2", "3", "4"};
+    std::string values[4] = {"1c", "2c", "3c", "4b"};
+ int i = 0;
+ for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+ FullKey fkey;
+ ParseFullKey(db_iter->key(), &fkey);
+ ASSERT_EQ(user_keys[i], fkey.user_key.ToString());
+ ASSERT_EQ(EntryType::kEntryPut, fkey.type);
+ ASSERT_EQ(seqnums[i], fkey.sequence);
+ ASSERT_EQ(values[i], db_iter->value().ToString());
+ i++;
+ }
+ ASSERT_EQ(i, 4);
+ }
+
+ { // Test that deletes are returned correctly as internal KVs
+ ReadOptions ro;
+ ro.iter_start_seqnum=5;
+ Options options;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ for (size_t i = 0; i < 10; ++i) {
+ internal_iter->AddPut(std::to_string(i), std::to_string(i) + "a");
+ internal_iter->AddPut(std::to_string(i), std::to_string(i) + "b");
+ internal_iter->AddDeletion(std::to_string(i));
+ }
+ internal_iter->Finish();
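+    // For user keys "1".."3" the newest version inside [5, 13] is a deletion,
+    // which should be surfaced as an internal delete entry; for "4" it is the
+    // put "4b" at sequence 13.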
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 13,
+ options.max_sequential_skip_in_iterations, nullptr));
+    // Expecting internal keys with seqnums in [5,13] and the correct type
+    int seqnums[4] = {5, 8, 11, 13};
+    EntryType key_types[4] = {EntryType::kEntryDelete, EntryType::kEntryDelete,
+                              EntryType::kEntryDelete, EntryType::kEntryPut};
+    std::string user_keys[4] = {"1", "2", "3", "4"};
+    std::string values[4] = {"", "", "", "4b"};
+ int i = 0;
+ for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+ FullKey fkey;
+ ParseFullKey(db_iter->key(), &fkey);
+ ASSERT_EQ(user_keys[i], fkey.user_key.ToString());
+ ASSERT_EQ(key_types[i], fkey.type);
+ ASSERT_EQ(seqnums[i], fkey.sequence);
+ ASSERT_EQ(values[i], db_iter->value().ToString());
+ i++;
+ }
+ ASSERT_EQ(i, 4);
+ }
+}
+
+class DBIterWithMergeIterTest : public testing::Test {
+ public:
+ DBIterWithMergeIterTest()
+ : env_(Env::Default()), icomp_(BytewiseComparator()) {
+ options_.merge_operator = nullptr;
+
+ internal_iter1_ = new TestIterator(BytewiseComparator());
+ internal_iter1_->Add("a", kTypeValue, "1", 3u);
+ internal_iter1_->Add("f", kTypeValue, "2", 5u);
+ internal_iter1_->Add("g", kTypeValue, "3", 7u);
+ internal_iter1_->Finish();
+
+ internal_iter2_ = new TestIterator(BytewiseComparator());
+ internal_iter2_->Add("a", kTypeValue, "4", 6u);
+ internal_iter2_->Add("b", kTypeValue, "5", 1u);
+ internal_iter2_->Add("c", kTypeValue, "6", 2u);
+ internal_iter2_->Add("d", kTypeValue, "7", 3u);
+ internal_iter2_->Finish();
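+    // Merged view visible at sequence 8: a->"4" (seq 6 shadows seq 3),
+    // b->"5", c->"6", d->"7", f->"2", g->"3".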
+
+ std::vector<InternalIterator*> child_iters;
+ child_iters.push_back(internal_iter1_);
+ child_iters.push_back(internal_iter2_);
+ InternalKeyComparator icomp(BytewiseComparator());
+ InternalIterator* merge_iter =
+ NewMergingIterator(&icomp_, &child_iters[0], 2u);
+
+ db_iter_.reset(NewDBIterator(
+ env_, ro_, ImmutableCFOptions(options_), MutableCFOptions(options_),
+ BytewiseComparator(), merge_iter,
+        8 /* read data with sequence number <= 8 */,
+        3 /* max sequential skips before a reseek */, nullptr /*read_callback*/));
+ }
+
+ Env* env_;
+ ReadOptions ro_;
+ Options options_;
+ TestIterator* internal_iter1_;
+ TestIterator* internal_iter2_;
+ InternalKeyComparator icomp_;
+ Iterator* merge_iter_;
+ std::unique_ptr<Iterator> db_iter_;
+};
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIterator1) {
+ db_iter_->SeekToFirst();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+ db_iter_->Next();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Next();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Next();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Next();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+ db_iter_->Next();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "g");
+ ASSERT_EQ(db_iter_->value().ToString(), "3");
+ db_iter_->Next();
+ ASSERT_FALSE(db_iter_->Valid());
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIterator2) {
+ // Test Prev() when one child iterator is at its end.
+ db_iter_->SeekForPrev("g");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "g");
+ ASSERT_EQ(db_iter_->value().ToString(), "3");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace1) {
+ // Test Prev() when one child iterator is at its end but more rows
+ // are added.
+ db_iter_->Seek("f");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+
+  // The test callback inserts a key at the end of the mem table after
+  // MergeIterator::Prev() has realized the mem table iterator is at its end
+  // and before SeekToLast() is called.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev",
+ [&](void* /*arg*/) { internal_iter2_->Add("z", kTypeValue, "7", 12u); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace2) {
+ // Test Prev() when one child iterator is at its end but more rows
+ // are added.
+ db_iter_->Seek("f");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+
+  // The test callback inserts entries that update a key at the end of the
+  // mem table after MergeIterator::Prev() has realized the mem table iterator
+  // is at its end and before SeekToLast() is called.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev", [&](void* /*arg*/) {
+ internal_iter2_->Add("z", kTypeValue, "7", 12u);
+ internal_iter2_->Add("z", kTypeValue, "7", 11u);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace3) {
+ // Test Prev() when one child iterator is at its end but more rows
+ // are added and max_skipped is triggered.
+ db_iter_->Seek("f");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+
+  // The test callback inserts entries that update a key at the end of the
+  // mem table after MergeIterator::Prev() has realized the mem table iterator
+  // is at its end and before SeekToLast() is called.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev", [&](void* /*arg*/) {
+ internal_iter2_->Add("z", kTypeValue, "7", 16u, true);
+ internal_iter2_->Add("z", kTypeValue, "7", 15u, true);
+ internal_iter2_->Add("z", kTypeValue, "7", 14u, true);
+ internal_iter2_->Add("z", kTypeValue, "7", 13u, true);
+ internal_iter2_->Add("z", kTypeValue, "7", 12u, true);
+ internal_iter2_->Add("z", kTypeValue, "7", 11u, true);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace4) {
+ // Test Prev() when one child iterator has more rows inserted
+ // between Seek() and Prev() when changing directions.
+ internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+ db_iter_->Seek("g");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "g");
+ ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts entries that update a key before "z" in the
+  // mem table after MergeIterator::Prev() calls the mem table iterator's
+  // Seek() and before calling Prev().
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+ IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+ if (it->key().starts_with("z")) {
+ internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 15u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 14u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 13u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 12u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 11u, true);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace5) {
+ internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+ // Test Prev() when one child iterator has more rows inserted
+ // between Seek() and Prev() when changing directions.
+ db_iter_->Seek("g");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "g");
+ ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts entries that update a key before "z" in the
+  // mem table after MergeIterator::Prev() calls the mem table iterator's
+  // Seek() and before calling Prev().
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+ IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+ if (it->key().starts_with("z")) {
+ internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 15u, true);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace6) {
+ internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+ // Test Prev() when one child iterator has more rows inserted
+ // between Seek() and Prev() when changing directions.
+ db_iter_->Seek("g");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "g");
+ ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts an entry that updates a key before "z" in the
+  // mem table after MergeIterator::Prev() calls the mem table iterator's
+  // Seek() and before calling Prev().
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+ IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+ if (it->key().starts_with("z")) {
+ internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace7) {
+ internal_iter1_->Add("u", kTypeValue, "10", 4u);
+ internal_iter1_->Add("v", kTypeValue, "11", 4u);
+ internal_iter1_->Add("w", kTypeValue, "12", 4u);
+ internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+ // Test Prev() when one child iterator has more rows inserted
+ // between Seek() and Prev() when changing directions.
+ db_iter_->Seek("g");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "g");
+ ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts entries that update a key before "z" in the
+  // mem table after MergeIterator::Prev() calls the mem table iterator's
+  // Seek() and before calling Prev().
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+ IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+ if (it->key().starts_with("z")) {
+ internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 15u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 14u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 13u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 12u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 11u, true);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace8) {
+ // internal_iter1_: a, f, g
+ // internal_iter2_: a, b, c, d, adding (z)
+ internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+ // Test Prev() when one child iterator has more rows inserted
+ // between Seek() and Prev() when changing directions.
+ db_iter_->Seek("g");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "g");
+ ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts two keys before "z" into the mem table after
+  // MergeIterator::Prev() calls the mem table iterator's Seek() and before
+  // calling Prev().
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+ IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+ if (it->key().starts_with("z")) {
+ internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+ internal_iter2_->Add("y", kTypeValue, "7", 17u, true);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+
+TEST_F(DBIteratorTest, SeekPrefixTombstones) {
+ ReadOptions ro;
+ Options options;
+ options.prefix_extractor.reset(NewNoopTransform());
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("c");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddDeletion("e");
+ internal_iter->AddDeletion("f");
+ internal_iter->AddDeletion("g");
+ internal_iter->Finish();
+
+ ro.prefix_same_as_start = true;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 10,
+ options.max_sequential_skip_in_iterations, nullptr /*read_callback*/));
+
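+  // With the no-op prefix transform each whole key is its own prefix, so with
+  // prefix_same_as_start a seek whose target prefix matches none of the
+  // tombstoned keys should become invalid without skipping internal entries.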
+ int skipped_keys = 0;
+
+ get_perf_context()->Reset();
+ db_iter->SeekForPrev("z");
+ skipped_keys =
+ static_cast<int>(get_perf_context()->internal_key_skipped_count);
+ ASSERT_EQ(skipped_keys, 0);
+
+ get_perf_context()->Reset();
+ db_iter->Seek("a");
+ skipped_keys =
+ static_cast<int>(get_perf_context()->internal_key_skipped_count);
+ ASSERT_EQ(skipped_keys, 0);
+}
+
+TEST_F(DBIteratorTest, SeekToFirstLowerBound) {
+ const int kNumKeys = 3;
+ for (int i = 0; i < kNumKeys + 2; ++i) {
+ // + 2 for two special cases: lower bound before and lower bound after the
+ // internal iterator's keys
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ for (int j = 1; j <= kNumKeys; ++j) {
+ internal_iter->AddPut(std::to_string(j), "val");
+ }
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ auto lower_bound_str = std::to_string(i);
+ Slice lower_bound(lower_bound_str);
+ ro.iterate_lower_bound = &lower_bound;
+ Options options;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToFirst();
+ if (i == kNumKeys + 1) {
+ // lower bound was beyond the last key
+ ASSERT_FALSE(db_iter->Valid());
+ } else {
+ ASSERT_TRUE(db_iter->Valid());
+ int expected;
+ if (i == 0) {
+ // lower bound was before the first key
+ expected = 1;
+ } else {
+ // lower bound was at the ith key
+ expected = i;
+ }
+ ASSERT_EQ(std::to_string(expected), db_iter->key().ToString());
+ }
+ }
+}
+
+TEST_F(DBIteratorTest, PrevLowerBound) {
+ const int kNumKeys = 3;
+ const int kLowerBound = 2;
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ for (int j = 1; j <= kNumKeys; ++j) {
+ internal_iter->AddPut(std::to_string(j), "val");
+ }
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ auto lower_bound_str = std::to_string(kLowerBound);
+ Slice lower_bound(lower_bound_str);
+ ro.iterate_lower_bound = &lower_bound;
+ Options options;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations, nullptr /* read_callback */));
+
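+  // Reverse iteration should stop as soon as it would move below
+  // iterate_lower_bound.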
+ db_iter->SeekToLast();
+ for (int i = kNumKeys; i >= kLowerBound; --i) {
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(std::to_string(i), db_iter->key().ToString());
+ db_iter->Prev();
+ }
+ ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, SeekLessLowerBound) {
+ const int kNumKeys = 3;
+ const int kLowerBound = 2;
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ for (int j = 1; j <= kNumKeys; ++j) {
+ internal_iter->AddPut(std::to_string(j), "val");
+ }
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ auto lower_bound_str = std::to_string(kLowerBound);
+ Slice lower_bound(lower_bound_str);
+ ro.iterate_lower_bound = &lower_bound;
+ Options options;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations, nullptr /* read_callback */));
+
+  auto before_lower_bound_str = std::to_string(kLowerBound - 1);
+  Slice before_lower_bound(before_lower_bound_str);
+
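+  // A Seek target below iterate_lower_bound should be clamped up to the bound.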
+ db_iter->Seek(before_lower_bound);
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(lower_bound_str, db_iter->key().ToString());
+}
+
+TEST_F(DBIteratorTest, ReverseToForwardWithDisappearingKeys) {
+ Options options;
+ options.prefix_extractor.reset(NewCappedPrefixTransform(0));
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "A");
+ internal_iter->AddPut("b", "B");
+ for (int i = 0; i < 100; ++i) {
+ internal_iter->AddPut("c" + ToString(i), "");
+ }
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ReadOptions(), ImmutableCFOptions(options),
+ MutableCFOptions(options), BytewiseComparator(), internal_iter, 10,
+ options.max_sequential_skip_in_iterations, nullptr /*read_callback*/));
+
+ db_iter->SeekForPrev("a");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ ASSERT_EQ("a", db_iter->key().ToString());
+
+ internal_iter->Vanish("a");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ ASSERT_EQ("b", db_iter->key().ToString());
+
+ // A (sort of) bug used to cause DBIter to pointlessly drag the internal
+ // iterator all the way to the end. But this doesn't really matter at the time
+ // of writing because the only iterator that can see disappearing keys is
+ // ForwardIterator, which doesn't support SeekForPrev().
+ EXPECT_LT(internal_iter->steps(), 20);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_iterator_test.cc b/src/rocksdb/db/db_iterator_test.cc
new file mode 100644
index 000000000..99ffb5ce4
--- /dev/null
+++ b/src/rocksdb/db/db_iterator_test.cc
@@ -0,0 +1,2998 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <functional>
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/db_iter.h"
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/iostats_context.h"
+#include "rocksdb/perf_context.h"
+#include "table/block_based/flush_block_policy.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A dummy ReadCallback that reports every key as committed.
+class DummyReadCallback : public ReadCallback {
+ public:
+ DummyReadCallback() : ReadCallback(kMaxSequenceNumber) {}
+ bool IsVisibleFullCheck(SequenceNumber /*seq*/) override { return true; }
+ void SetSnapshot(SequenceNumber seq) { max_visible_seq_ = seq; }
+};
+
+// Test param:
+// bool: whether to pass read_callback to NewIterator().
+class DBIteratorTest : public DBTestBase,
+ public testing::WithParamInterface<bool> {
+ public:
+ DBIteratorTest() : DBTestBase("/db_iterator_test") {}
+
+ Iterator* NewIterator(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family = nullptr) {
+ if (column_family == nullptr) {
+ column_family = db_->DefaultColumnFamily();
+ }
+ auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+ SequenceNumber seq = read_options.snapshot != nullptr
+ ? read_options.snapshot->GetSequenceNumber()
+ : db_->GetLatestSequenceNumber();
+ bool use_read_callback = GetParam();
+ DummyReadCallback* read_callback = nullptr;
+ if (use_read_callback) {
+ read_callback = new DummyReadCallback();
+ read_callback->SetSnapshot(seq);
+ InstrumentedMutexLock lock(&mutex_);
+ read_callbacks_.push_back(
+ std::unique_ptr<DummyReadCallback>(read_callback));
+ }
+ return dbfull()->NewIteratorImpl(read_options, cfd, seq, read_callback);
+ }
+
+ private:
+ InstrumentedMutex mutex_;
+ std::vector<std::unique_ptr<DummyReadCallback>> read_callbacks_;
+};
+
+TEST_P(DBIteratorTest, IteratorProperty) {
+ // The test needs to be changed if kPersistedTier is supported in iterator.
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Put(1, "1", "2");
+ Delete(1, "2");
+ ReadOptions ropt;
+ ropt.pin_data = false;
+ {
+ std::unique_ptr<Iterator> iter(NewIterator(ropt, handles_[1]));
+ iter->SeekToFirst();
+ std::string prop_value;
+ ASSERT_NOK(iter->GetProperty("non_existing.value", &prop_value));
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("0", prop_value);
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ iter->Next();
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("Iterator is not valid.", prop_value);
+
+ // Get internal key at which the iteration stopped (tombstone in this case).
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value));
+ ASSERT_EQ("2", prop_value);
+ }
+ Close();
+}
+
+TEST_P(DBIteratorTest, PersistedTierOnIterator) {
+ // The test needs to be changed if kPersistedTier is supported in iterator.
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ReadOptions ropt;
+ ropt.read_tier = kPersistedTier;
+
+ auto* iter = db_->NewIterator(ropt, handles_[1]);
+ ASSERT_TRUE(iter->status().IsNotSupported());
+ delete iter;
+
+ std::vector<Iterator*> iters;
+ ASSERT_TRUE(db_->NewIterators(ropt, {handles_[1]}, &iters).IsNotSupported());
+ Close();
+}
+
+TEST_P(DBIteratorTest, NonBlockingIteration) {
+ do {
+ ReadOptions non_blocking_opts, regular_opts;
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ non_blocking_opts.read_tier = kBlockCacheTier;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ // write one kv to the database.
+ ASSERT_OK(Put(1, "a", "b"));
+
+ // scan using non-blocking iterator. We should find it because
+ // it is in memtable.
+ Iterator* iter = NewIterator(non_blocking_opts, handles_[1]);
+ int count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ count++;
+ }
+ ASSERT_EQ(count, 1);
+ delete iter;
+
+    // Flush the memtable to storage. Now the key should be neither in the
+    // memtable nor in the block cache.
+ ASSERT_OK(Flush(1));
+
+ // verify that a non-blocking iterator does not find any
+ // kvs. Neither does it do any IOs to storage.
+ uint64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+ uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ iter = NewIterator(non_blocking_opts, handles_[1]);
+ count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ count++;
+ }
+ ASSERT_EQ(count, 0);
+ ASSERT_TRUE(iter->status().IsIncomplete());
+ ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+ ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+ delete iter;
+
+ // read in the specified block via a regular get
+ ASSERT_EQ(Get(1, "a"), "b");
+
+ // verify that we can find it via a non-blocking scan
+ numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+ cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ iter = NewIterator(non_blocking_opts, handles_[1]);
+ count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ count++;
+ }
+ ASSERT_EQ(count, 1);
+ ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+ ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+ delete iter;
+
+    // This test verifies block cache behavior, which the plain table format
+    // does not use.
+ } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | kSkipMmapReads));
+}
+
+TEST_P(DBIteratorTest, IterSeekBeforePrev) {
+ ASSERT_OK(Put("a", "b"));
+ ASSERT_OK(Put("c", "d"));
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Put("0", "f"));
+ ASSERT_OK(Put("1", "h"));
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Put("2", "j"));
+ auto iter = NewIterator(ReadOptions());
+ iter->Seek(Slice("c"));
+ iter->Prev();
+ iter->Seek(Slice("a"));
+ iter->Prev();
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, IterReseekNewUpperBound) {
+ Random rnd(301);
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 1024;
+ table_options.block_size_deviation = 50;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.compression = kNoCompression;
+ Reopen(options);
+
+ ASSERT_OK(Put("a", RandomString(&rnd, 400)));
+ ASSERT_OK(Put("aabb", RandomString(&rnd, 400)));
+ ASSERT_OK(Put("aaef", RandomString(&rnd, 400)));
+ ASSERT_OK(Put("b", RandomString(&rnd, 400)));
+ dbfull()->Flush(FlushOptions());
+ ReadOptions opts;
+ Slice ub = Slice("aa");
+ opts.iterate_upper_bound = &ub;
+ auto iter = NewIterator(opts);
+ iter->Seek(Slice("a"));
+ ub = Slice("b");
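+  // ReadOptions keeps a pointer to the bound Slice, so reassigning `ub` should
+  // raise the effective upper bound for the next Seek, making "aaef" reachable.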
+ iter->Seek(Slice("aabc"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "aaef");
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, IterSeekForPrevBeforeNext) {
+ ASSERT_OK(Put("a", "b"));
+ ASSERT_OK(Put("c", "d"));
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Put("0", "f"));
+ ASSERT_OK(Put("1", "h"));
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Put("2", "j"));
+ auto iter = NewIterator(ReadOptions());
+ iter->SeekForPrev(Slice("0"));
+ iter->Next();
+ iter->SeekForPrev(Slice("1"));
+ iter->Next();
+ delete iter;
+}
+
+namespace {
+std::string MakeLongKey(size_t length, char c) {
+ return std::string(length, c);
+}
+} // namespace
+
+TEST_P(DBIteratorTest, IterLongKeys) {
+ ASSERT_OK(Put(MakeLongKey(20, 0), "0"));
+ ASSERT_OK(Put(MakeLongKey(32, 2), "2"));
+ ASSERT_OK(Put("a", "b"));
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Put(MakeLongKey(50, 1), "1"));
+ ASSERT_OK(Put(MakeLongKey(127, 3), "3"));
+ ASSERT_OK(Put(MakeLongKey(64, 4), "4"));
+ auto iter = NewIterator(ReadOptions());
+
+  // Forward iteration should return the long keys in sorted order.
+ iter->Seek(MakeLongKey(20, 0));
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(20, 0) + "->0");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(64, 4) + "->4");
+
+ iter->SeekForPrev(MakeLongKey(127, 3));
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1");
+ delete iter;
+
+ iter = NewIterator(ReadOptions());
+ iter->Seek(MakeLongKey(50, 1));
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3");
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, IterNextWithNewerSeq) {
+ ASSERT_OK(Put("0", "0"));
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Put("a", "b"));
+ ASSERT_OK(Put("c", "d"));
+ ASSERT_OK(Put("d", "e"));
+ auto iter = NewIterator(ReadOptions());
+
+ // Create a key that needs to be skipped for Seq too new
+ for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1;
+ i++) {
+ ASSERT_OK(Put("b", "f"));
+ }
+
+ iter->Seek(Slice("a"));
+ ASSERT_EQ(IterStatus(iter), "a->b");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->d");
+ iter->SeekForPrev(Slice("b"));
+ ASSERT_EQ(IterStatus(iter), "a->b");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->d");
+
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, IterPrevWithNewerSeq) {
+ ASSERT_OK(Put("0", "0"));
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Put("a", "b"));
+ ASSERT_OK(Put("c", "d"));
+ ASSERT_OK(Put("d", "e"));
+ auto iter = NewIterator(ReadOptions());
+
+ // Create a key that needs to be skipped for Seq too new
+ for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1;
+ i++) {
+ ASSERT_OK(Put("b", "f"));
+ }
+
+ iter->Seek(Slice("d"));
+ ASSERT_EQ(IterStatus(iter), "d->e");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "c->d");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->b");
+ iter->Prev();
+ iter->SeekForPrev(Slice("d"));
+ ASSERT_EQ(IterStatus(iter), "d->e");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "c->d");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->b");
+ iter->Prev();
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, IterPrevWithNewerSeq2) {
+ ASSERT_OK(Put("0", "0"));
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Put("a", "b"));
+ ASSERT_OK(Put("c", "d"));
+ ASSERT_OK(Put("e", "f"));
+ auto iter = NewIterator(ReadOptions());
+ auto iter2 = NewIterator(ReadOptions());
+ iter->Seek(Slice("c"));
+ iter2->SeekForPrev(Slice("d"));
+ ASSERT_EQ(IterStatus(iter), "c->d");
+ ASSERT_EQ(IterStatus(iter2), "c->d");
+
+ // Create a key that needs to be skipped for Seq too new
+ for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1;
+ i++) {
+ ASSERT_OK(Put("b", "f"));
+ }
+
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->b");
+ iter->Prev();
+ iter2->Prev();
+ ASSERT_EQ(IterStatus(iter2), "a->b");
+ iter2->Prev();
+ delete iter;
+ delete iter2;
+}
+
+TEST_P(DBIteratorTest, IterEmpty) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("foo");
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekForPrev("foo");
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ delete iter;
+ } while (ChangeCompactOptions());
+}
+
+TEST_P(DBIteratorTest, IterSingle) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "a", "va"));
+ Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekForPrev("");
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("a");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekForPrev("a");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("b");
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekForPrev("b");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ delete iter;
+ } while (ChangeCompactOptions());
+}
+
+TEST_P(DBIteratorTest, IterMulti) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "a", "va"));
+ ASSERT_OK(Put(1, "b", "vb"));
+ ASSERT_OK(Put(1, "c", "vc"));
+ Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Seek("a");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Seek("ax");
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->SeekForPrev("d");
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->SeekForPrev("c");
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->SeekForPrev("bx");
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+
+ iter->Seek("b");
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Seek("z");
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekForPrev("b");
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->SeekForPrev("");
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ // Switch from reverse to forward
+ iter->SeekToLast();
+ iter->Prev();
+ iter->Prev();
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+
+ // Switch from forward to reverse
+ iter->SeekToFirst();
+ iter->Next();
+ iter->Next();
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+
+ // Make sure iter stays at snapshot
+ ASSERT_OK(Put(1, "a", "va2"));
+ ASSERT_OK(Put(1, "a2", "va3"));
+ ASSERT_OK(Put(1, "b", "vb2"));
+ ASSERT_OK(Put(1, "c", "vc2"));
+ ASSERT_OK(Delete(1, "b"));
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ delete iter;
+ } while (ChangeCompactOptions());
+}
+
+// Check that we can skip over a run of user keys
+// by using reseek rather than sequential scan
+TEST_P(DBIteratorTest, IterReseek) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ Options options = CurrentOptions(options_override);
+ options.max_sequential_skip_in_iterations = 3;
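+  // The iterator falls back to a reseek roughly once it has had to skip more
+  // than max_sequential_skip_in_iterations (3 here) obsolete internal entries
+  // in a row.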
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Insert three versions of the same user key and verify that
+  // reseek is not invoked. For each of these test cases,
+  // verify that we can find the next key "b".
+ ASSERT_OK(Put(1, "a", "zero"));
+ ASSERT_OK(Put(1, "a", "one"));
+ ASSERT_OK(Put(1, "a", "two"));
+ ASSERT_OK(Put(1, "b", "bone"));
+ Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+ iter->SeekToFirst();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+ ASSERT_EQ(IterStatus(iter), "a->two");
+ iter->Next();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+ ASSERT_EQ(IterStatus(iter), "b->bone");
+ delete iter;
+
+  // Insert a fourth version of the same user key and verify
+  // that reseek is still not invoked.
+ ASSERT_OK(Put(1, "a", "three"));
+ iter = NewIterator(ReadOptions(), handles_[1]);
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->three");
+ iter->Next();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+ ASSERT_EQ(IterStatus(iter), "b->bone");
+ delete iter;
+
+  // Insert a fifth version of the same user key and verify
+  // that reseek is now invoked.
+ ASSERT_OK(Put(1, "a", "four"));
+ iter = NewIterator(ReadOptions(), handles_[1]);
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->four");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+ iter->Next();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1);
+ ASSERT_EQ(IterStatus(iter), "b->bone");
+ delete iter;
+
+  // Testing the reverse iterator.
+  // At this point, we have five versions of "a" and one version of "b".
+  // The reseek statistic is already at 1.
+ int num_reseeks = static_cast<int>(
+ TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION));
+
+  // Insert another version of "b". SeekToLast() should not need a reseek,
+  // but the subsequent Prev() (which skips the versions of "a") should.
+ ASSERT_OK(Put(1, "b", "btwo"));
+ iter = NewIterator(ReadOptions(), handles_[1]);
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "b->btwo");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+ num_reseeks);
+ iter->Prev();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+ num_reseeks + 1);
+ ASSERT_EQ(IterStatus(iter), "a->four");
+ delete iter;
+
+  // Insert two more versions of "b". This makes a total of 4 versions
+  // of "b" and 5 versions of "a".
+ ASSERT_OK(Put(1, "b", "bthree"));
+ ASSERT_OK(Put(1, "b", "bfour"));
+ iter = NewIterator(ReadOptions(), handles_[1]);
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "b->bfour");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+ num_reseeks + 2);
+ iter->Prev();
+
+ // the previous Prev call should have invoked reseek
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+ num_reseeks + 3);
+ ASSERT_EQ(IterStatus(iter), "a->four");
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, IterSmallAndLargeMix) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "a", "va"));
+ ASSERT_OK(Put(1, "b", std::string(100000, 'b')));
+ ASSERT_OK(Put(1, "c", "vc"));
+ ASSERT_OK(Put(1, "d", std::string(100000, 'd')));
+ ASSERT_OK(Put(1, "e", std::string(100000, 'e')));
+
+ Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ delete iter;
+ } while (ChangeCompactOptions());
+}
+
+TEST_P(DBIteratorTest, IterMultiWithDelete) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "ka", "va"));
+ ASSERT_OK(Put(1, "kb", "vb"));
+ ASSERT_OK(Put(1, "kc", "vc"));
+ ASSERT_OK(Delete(1, "kb"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "kb"));
+
+ Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+ iter->Seek("kc");
+ ASSERT_EQ(IterStatus(iter), "kc->vc");
+ if (!CurrentOptions().merge_operator) {
+ // TODO: merge operator does not support backward iteration yet
+ if (kPlainTableAllBytesPrefix != option_config_ &&
+ kBlockBasedTableWithWholeKeyHashIndex != option_config_ &&
+ kHashLinkList != option_config_ &&
+ kHashSkipList != option_config_) { // doesn't support SeekToLast
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "ka->va");
+ }
+ }
+ delete iter;
+ } while (ChangeOptions());
+}
+
+TEST_P(DBIteratorTest, IterPrevMaxSkip) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ for (int i = 0; i < 2; i++) {
+ ASSERT_OK(Put(1, "key1", "v1"));
+ ASSERT_OK(Put(1, "key2", "v2"));
+ ASSERT_OK(Put(1, "key3", "v3"));
+ ASSERT_OK(Put(1, "key4", "v4"));
+ ASSERT_OK(Put(1, "key5", "v5"));
+ }
+
+ VerifyIterLast("key5->v5", 1);
+
+ ASSERT_OK(Delete(1, "key5"));
+ VerifyIterLast("key4->v4", 1);
+
+ ASSERT_OK(Delete(1, "key4"));
+ VerifyIterLast("key3->v3", 1);
+
+ ASSERT_OK(Delete(1, "key3"));
+ VerifyIterLast("key2->v2", 1);
+
+ ASSERT_OK(Delete(1, "key2"));
+ VerifyIterLast("key1->v1", 1);
+
+ ASSERT_OK(Delete(1, "key1"));
+ VerifyIterLast("(invalid)", 1);
+ } while (ChangeOptions(kSkipMergePut | kSkipNoSeekToLast));
+}
+
+TEST_P(DBIteratorTest, IterWithSnapshot) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
+ ASSERT_OK(Put(1, "key1", "val1"));
+ ASSERT_OK(Put(1, "key2", "val2"));
+ ASSERT_OK(Put(1, "key3", "val3"));
+ ASSERT_OK(Put(1, "key4", "val4"));
+ ASSERT_OK(Put(1, "key5", "val5"));
+
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ReadOptions options;
+ options.snapshot = snapshot;
+ Iterator* iter = NewIterator(options, handles_[1]);
+
+ ASSERT_OK(Put(1, "key0", "val0"));
+ // Put more values after the snapshot
+ ASSERT_OK(Put(1, "key100", "val100"));
+ ASSERT_OK(Put(1, "key101", "val101"));
+
+ iter->Seek("key5");
+ ASSERT_EQ(IterStatus(iter), "key5->val5");
+ if (!CurrentOptions().merge_operator) {
+ // TODO: merge operator does not support backward iteration yet
+ if (kPlainTableAllBytesPrefix != option_config_ &&
+ kBlockBasedTableWithWholeKeyHashIndex != option_config_ &&
+ kHashLinkList != option_config_ && kHashSkipList != option_config_) {
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "key4->val4");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "key3->val3");
+
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "key4->val4");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "key5->val5");
+ }
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ }
+
+ if (!CurrentOptions().merge_operator) {
+ // TODO(gzh): merge operator does not support backward iteration yet
+ if (kPlainTableAllBytesPrefix != option_config_ &&
+ kBlockBasedTableWithWholeKeyHashIndex != option_config_ &&
+ kHashLinkList != option_config_ && kHashSkipList != option_config_) {
+ iter->SeekForPrev("key1");
+ ASSERT_EQ(IterStatus(iter), "key1->val1");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "key2->val2");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "key3->val3");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "key2->val2");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "key1->val1");
+ iter->Prev();
+ ASSERT_TRUE(!iter->Valid());
+ }
+ }
+ db_->ReleaseSnapshot(snapshot);
+ delete iter;
+ } while (ChangeOptions());
+}
+
+TEST_P(DBIteratorTest, IteratorPinsRef) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ Put(1, "foo", "hello");
+
+ // Get iterator that will yield the current contents of the DB.
+ Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+
+ // Write to force compactions
+ Put(1, "foo", "newvalue1");
+ for (int i = 0; i < 100; i++) {
+      // ~100KB values
+ ASSERT_OK(Put(1, Key(i), Key(i) + std::string(100000, 'v')));
+ }
+ Put(1, "foo", "newvalue2");
+
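+    // The iterator pins the version it was created on, so it should still
+    // return the original "hello" value even though "foo" has been
+    // overwritten since.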
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ("hello", iter->value().ToString());
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ delete iter;
+ } while (ChangeCompactOptions());
+}
+
+TEST_P(DBIteratorTest, IteratorDeleteAfterCfDelete) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+ Put(1, "foo", "delete-cf-then-delete-iter");
+ Put(1, "hello", "value2");
+
+ ColumnFamilyHandle* cf = handles_[1];
+ ReadOptions ro;
+
+ auto* iter = db_->NewIterator(ro, cf);
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "foo->delete-cf-then-delete-iter");
+
+ // delete CF handle
+ db_->DestroyColumnFamilyHandle(cf);
+ handles_.erase(std::begin(handles_) + 1);
+
+ // delete Iterator after CF handle is deleted
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "hello->value2");
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, IteratorDeleteAfterCfDrop) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+ Put(1, "foo", "drop-cf-then-delete-iter");
+
+ ReadOptions ro;
+ ColumnFamilyHandle* cf = handles_[1];
+
+ auto* iter = db_->NewIterator(ro, cf);
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "foo->drop-cf-then-delete-iter");
+
+ // drop and delete CF
+ db_->DropColumnFamily(cf);
+ db_->DestroyColumnFamilyHandle(cf);
+ handles_.erase(std::begin(handles_) + 1);
+
+ // delete Iterator after CF handle is dropped
+ delete iter;
+}
+
+// SetOptions not defined in ROCKSDB LITE
+#ifndef ROCKSDB_LITE
+TEST_P(DBIteratorTest, DBIteratorBoundTest) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+
+ options.prefix_extractor = nullptr;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("a", "0"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("foo1", "bar1"));
+ ASSERT_OK(Put("g1", "0"));
+
+ // testing basic case with no iterate_upper_bound and no prefix_extractor
+ {
+ ReadOptions ro;
+ ro.iterate_upper_bound = nullptr;
+
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ iter->Seek("foo");
+
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("g1")), 0);
+
+ iter->SeekForPrev("g1");
+
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("g1")), 0);
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
+ }
+
+ // testing iterate_upper_bound and forward iterator
+ // to make sure it stops at bound
+ {
+ ReadOptions ro;
+ // iterate_upper_bound points beyond the last expected entry
+ Slice prefix("foo2");
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ iter->Seek("foo");
+
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(("foo1")), 0);
+
+ iter->Next();
+ // should stop here...
+ ASSERT_TRUE(!iter->Valid());
+ }
+ // Testing SeekToLast with iterate_upper_bound set
+ {
+ ReadOptions ro;
+
+ Slice prefix("foo");
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ iter->SeekToLast();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("a")), 0);
+ }
+
+ // prefix is the first letter of the key
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:1"}}));
+ ASSERT_OK(Put("a", "0"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("foo1", "bar1"));
+ ASSERT_OK(Put("g1", "0"));
+
+ // testing with iterate_upper_bound and prefix_extractor
+  // Seek target and iterate_upper_bound are not in the same prefix.
+  // Strictly speaking this is a misuse, but iteration still stops at the bound.
+ {
+ ReadOptions ro;
+ Slice upper_bound("g");
+ ro.iterate_upper_bound = &upper_bound;
+
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ iter->Seek("foo");
+
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo1", iter->key().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ }
+
+  // Testing that iterate_upper_bound prevents iterating over deleted items
+  // once the bound has been reached.
+ {
+ options.prefix_extractor = nullptr;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("a", "0"));
+ ASSERT_OK(Put("b", "0"));
+ ASSERT_OK(Put("b1", "0"));
+ ASSERT_OK(Put("c", "0"));
+ ASSERT_OK(Put("d", "0"));
+ ASSERT_OK(Put("e", "0"));
+ ASSERT_OK(Delete("c"));
+ ASSERT_OK(Delete("d"));
+
+ // base case with no bound
+ ReadOptions ro;
+ ro.iterate_upper_bound = nullptr;
+
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ iter->Seek("b");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("b")), 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(("b1")), 0);
+
+ get_perf_context()->Reset();
+ iter->Next();
+
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(static_cast<int>(get_perf_context()->internal_delete_skipped_count), 2);
+
+    // now testing with iterate_upper_bound
+ Slice prefix("c");
+ ro.iterate_upper_bound = &prefix;
+
+ iter.reset(NewIterator(ro));
+
+ get_perf_context()->Reset();
+
+ iter->Seek("b");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("b")), 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(("b1")), 0);
+
+ iter->Next();
+ // the iteration should stop as soon as the bound key is reached
+ // even though the key is deleted
+ // hence internal_delete_skipped_count should be 0
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_EQ(static_cast<int>(get_perf_context()->internal_delete_skipped_count), 0);
+ }
+}
+
+TEST_P(DBIteratorTest, DBIteratorBoundMultiSeek) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.prefix_extractor = nullptr;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("a", "0"));
+ ASSERT_OK(Put("z", "0"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo1", "bar1"));
+ ASSERT_OK(Put("foo2", "bar2"));
+ ASSERT_OK(Put("foo3", "bar3"));
+ ASSERT_OK(Put("foo4", "bar4"));
+
+ {
+ std::string up_str = "foo5";
+ Slice up(up_str);
+ ReadOptions ro;
+ ro.iterate_upper_bound = &up;
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ iter->Seek("foo1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+
+ uint64_t prev_block_cache_hit =
+ TestGetTickerCount(options, BLOCK_CACHE_HIT);
+ uint64_t prev_block_cache_miss =
+ TestGetTickerCount(options, BLOCK_CACHE_MISS);
+
+ ASSERT_GT(prev_block_cache_hit + prev_block_cache_miss, 0);
+
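+    // The seeks below stay inside the upper bound and should be answered from
+    // blocks that are already loaded, so the block cache hit/miss counters are
+    // expected to stay unchanged.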
+ iter->Seek("foo4");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo4")), 0);
+ ASSERT_EQ(prev_block_cache_hit,
+ TestGetTickerCount(options, BLOCK_CACHE_HIT));
+ ASSERT_EQ(prev_block_cache_miss,
+ TestGetTickerCount(options, BLOCK_CACHE_MISS));
+
+ iter->Seek("foo2");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo2")), 0);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo3")), 0);
+ ASSERT_EQ(prev_block_cache_hit,
+ TestGetTickerCount(options, BLOCK_CACHE_HIT));
+ ASSERT_EQ(prev_block_cache_miss,
+ TestGetTickerCount(options, BLOCK_CACHE_MISS));
+ }
+}
+#endif
+
+TEST_P(DBIteratorTest, DBIteratorBoundOptimizationTest) {
+ for (auto format_version : {2, 3, 4}) {
+ int upper_bound_hits = 0;
+ Options options = CurrentOptions();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableIterator:out_of_bound",
+ [&upper_bound_hits](void*) { upper_bound_hits++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.prefix_extractor = nullptr;
+ BlockBasedTableOptions table_options;
+ table_options.format_version = format_version;
+ table_options.flush_block_policy_factory =
+ std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo1", "bar1"));
+ ASSERT_OK(Put("foo2", "bar2"));
+ ASSERT_OK(Put("foo4", "bar4"));
+ ASSERT_OK(Flush());
+
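+    // With one key per block, the upper bound "foo3" lies between "foo2" and
+    // "foo4", so the out_of_bound sync point should fire exactly once, when
+    // the iterator advances past the bound.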
+ Slice ub("foo3");
+ ReadOptions ro;
+ ro.iterate_upper_bound = &ub;
+
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+ ASSERT_EQ(upper_bound_hits, 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo2")), 0);
+ ASSERT_EQ(upper_bound_hits, 0);
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_EQ(upper_bound_hits, 1);
+ }
+}
+
+// Enable kBinarySearchWithFirstKey, do some iterator operations and check that
+// they don't do unnecessary block reads.
+TEST_P(DBIteratorTest, IndexWithFirstKey) {
+ for (int tailing = 0; tailing < 2; ++tailing) {
+ SCOPED_TRACE("tailing = " + std::to_string(tailing));
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.prefix_extractor = nullptr;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ Statistics* stats = options.statistics.get();
+ BlockBasedTableOptions table_options;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey;
+ table_options.index_shortening =
+ BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+ table_options.flush_block_policy_factory =
+ std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+ table_options.block_cache =
+ NewLRUCache(8000); // fits all blocks and their cache metadata overhead
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Merge("a1", "x1"));
+ ASSERT_OK(Merge("b1", "y1"));
+ ASSERT_OK(Merge("c0", "z1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("a2", "x2"));
+ ASSERT_OK(Merge("b2", "y2"));
+ ASSERT_OK(Merge("c0", "z2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("a3", "x3"));
+ ASSERT_OK(Merge("b3", "y3"));
+ ASSERT_OK(Merge("c3", "z3"));
+ ASSERT_OK(Flush());
+
+ // Block cache is not important for this test.
+ // We use BLOCK_CACHE_DATA_* counters just because they're the most readily
+ // available way of counting block accesses.
+
+ ReadOptions ropt;
+ ropt.tailing = tailing;
+ std::unique_ptr<Iterator> iter(NewIterator(ropt));
+
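+    // With kBinarySearchWithFirstKey the index already knows each block's
+    // first key, so this Seek should load only the single data block that
+    // contains the result ("b2"), as the miss counter below verifies.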
+ iter->Seek("b10");
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ("b2", iter->key().ToString());
+ EXPECT_EQ("y2", iter->value().ToString());
+ EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ("b3", iter->key().ToString());
+ EXPECT_EQ("y3", iter->value().ToString());
+ EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ iter->Seek("c0");
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ("c0", iter->key().ToString());
+ EXPECT_EQ("z1,z2", iter->value().ToString());
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+ EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ("c3", iter->key().ToString());
+ EXPECT_EQ("z3", iter->value().ToString());
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+ EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+ iter.reset();
+
+ // Enable iterate_upper_bound and check that iterator is not trying to read
+ // blocks that are fully above upper bound.
+ std::string ub = "b3";
+ Slice ub_slice(ub);
+ ropt.iterate_upper_bound = &ub_slice;
+ iter.reset(NewIterator(ropt));
+
+ iter->Seek("b2");
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ("b2", iter->key().ToString());
+ EXPECT_EQ("y2", iter->value().ToString());
+ EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+ EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+ EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ }
+}
+
+TEST_P(DBIteratorTest, IndexWithFirstKeyGet) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.prefix_extractor = nullptr;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ Statistics* stats = options.statistics.get();
+ BlockBasedTableOptions table_options;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey;
+ table_options.index_shortening =
+ BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+ table_options.flush_block_policy_factory =
+ std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+ table_options.block_cache = NewLRUCache(1000); // fits all blocks
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Merge("a", "x1"));
+ ASSERT_OK(Merge("c", "y1"));
+ ASSERT_OK(Merge("e", "z1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("c", "y2"));
+ ASSERT_OK(Merge("e", "z2"));
+ ASSERT_OK(Flush());
+
+ // Get() between blocks shouldn't read any blocks.
+ ASSERT_EQ("NOT_FOUND", Get("b"));
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ // Get() of an existing key shouldn't read any unnecessary blocks when there's
+ // only one key per block.
+
+ ASSERT_EQ("y1,y2", Get("c"));
+ EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ ASSERT_EQ("x1", Get("a"));
+ EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ EXPECT_EQ(std::vector<std::string>({"NOT_FOUND", "z1,z2"}),
+ MultiGet({"b", "e"}));
+}
+
+// TODO(3.13): fix the issue of Seek() + Prev() which might not necessarily
+// return the largest key which is smaller than the seek key.
+TEST_P(DBIteratorTest, PrevAfterAndNextAfterMerge) {
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ options.env = env_;
+ DestroyAndReopen(options);
+
+ // write three entries with different keys using Merge()
+ WriteOptions wopts;
+ db_->Merge(wopts, "1", "data1");
+ db_->Merge(wopts, "2", "data2");
+ db_->Merge(wopts, "3", "data3");
+
+ std::unique_ptr<Iterator> it(NewIterator(ReadOptions()));
+
+ it->Seek("2");
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("2", it->key().ToString());
+
+ it->Prev();
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("1", it->key().ToString());
+
+ it->SeekForPrev("1");
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("1", it->key().ToString());
+
+ it->Next();
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("2", it->key().ToString());
+}
+
+class DBIteratorTestForPinnedData : public DBIteratorTest {
+ public:
+ enum TestConfig {
+ NORMAL,
+ CLOSE_AND_OPEN,
+ COMPACT_BEFORE_READ,
+ FLUSH_EVERY_1000,
+ MAX
+ };
+ DBIteratorTestForPinnedData() : DBIteratorTest() {}
+ void PinnedDataIteratorRandomized(TestConfig run_config) {
+ // Generate Random data
+ Random rnd(301);
+
+ int puts = 100000;
+ int key_pool = static_cast<int>(puts * 0.7);
+ int key_size = 100;
+ int val_size = 1000;
+ int seeks_percentage = 20; // 20% of keys will be used to test seek()
+ int delete_percentage = 20; // 20% of keys will be deleted
+ int merge_percentage = 20; // 20% of keys will be added using Merge()
+
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ table_options.use_delta_encoding = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ DestroyAndReopen(options);
+
+ std::vector<std::string> generated_keys(key_pool);
+ for (int i = 0; i < key_pool; i++) {
+ generated_keys[i] = RandomString(&rnd, key_size);
+ }
+
+ std::map<std::string, std::string> true_data;
+ std::vector<std::string> random_keys;
+ std::vector<std::string> deleted_keys;
+ for (int i = 0; i < puts; i++) {
+ auto& k = generated_keys[rnd.Next() % key_pool];
+ auto v = RandomString(&rnd, val_size);
+
+ // Insert data to true_data map and to DB
+ true_data[k] = v;
+ if (rnd.PercentTrue(merge_percentage)) {
+ ASSERT_OK(db_->Merge(WriteOptions(), k, v));
+ } else {
+ ASSERT_OK(Put(k, v));
+ }
+
+ // Pick random keys to be used to test Seek()
+ if (rnd.PercentTrue(seeks_percentage)) {
+ random_keys.push_back(k);
+ }
+
+ // Delete some random keys
+ if (rnd.PercentTrue(delete_percentage)) {
+ deleted_keys.push_back(k);
+ true_data.erase(k);
+ ASSERT_OK(Delete(k));
+ }
+
+ if (run_config == TestConfig::FLUSH_EVERY_1000) {
+ if (i && i % 1000 == 0) {
+ Flush();
+ }
+ }
+ }
+
+ if (run_config == TestConfig::CLOSE_AND_OPEN) {
+ Close();
+ Reopen(options);
+ } else if (run_config == TestConfig::COMPACT_BEFORE_READ) {
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ }
+
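+    // pin_data=true keeps the key() Slices valid for the lifetime of the
+    // iterator, so keys can be collected as Slices during iteration and
+    // verified against true_data afterwards.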
+ ReadOptions ro;
+ ro.pin_data = true;
+ auto iter = NewIterator(ro);
+
+ {
+ // Test Seek to random keys
+ std::vector<Slice> keys_slices;
+ std::vector<std::string> true_keys;
+ for (auto& k : random_keys) {
+ iter->Seek(k);
+ if (!iter->Valid()) {
+ ASSERT_EQ(true_data.lower_bound(k), true_data.end());
+ continue;
+ }
+ std::string prop_value;
+ ASSERT_OK(
+ iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ keys_slices.push_back(iter->key());
+ true_keys.push_back(true_data.lower_bound(k)->first);
+ }
+
+ for (size_t i = 0; i < keys_slices.size(); i++) {
+ ASSERT_EQ(keys_slices[i].ToString(), true_keys[i]);
+ }
+ }
+
+ {
+ // Test SeekForPrev to random keys
+ std::vector<Slice> keys_slices;
+ std::vector<std::string> true_keys;
+ for (auto& k : random_keys) {
+ iter->SeekForPrev(k);
+ if (!iter->Valid()) {
+ ASSERT_EQ(true_data.upper_bound(k), true_data.begin());
+ continue;
+ }
+ std::string prop_value;
+ ASSERT_OK(
+ iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ keys_slices.push_back(iter->key());
+ true_keys.push_back((--true_data.upper_bound(k))->first);
+ }
+
+ for (size_t i = 0; i < keys_slices.size(); i++) {
+ ASSERT_EQ(keys_slices[i].ToString(), true_keys[i]);
+ }
+ }
+
+ {
+ // Test iterating all data forward
+ std::vector<Slice> all_keys;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ std::string prop_value;
+ ASSERT_OK(
+ iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ all_keys.push_back(iter->key());
+ }
+ ASSERT_EQ(all_keys.size(), true_data.size());
+
+ // Verify that all keys slices are valid
+ auto data_iter = true_data.begin();
+ for (size_t i = 0; i < all_keys.size(); i++) {
+ ASSERT_EQ(all_keys[i].ToString(), data_iter->first);
+ data_iter++;
+ }
+ }
+
+ {
+ // Test iterating all data backward
+ std::vector<Slice> all_keys;
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ std::string prop_value;
+ ASSERT_OK(
+ iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ all_keys.push_back(iter->key());
+ }
+ ASSERT_EQ(all_keys.size(), true_data.size());
+
+ // Verify that all keys slices are valid (backward)
+ auto data_iter = true_data.rbegin();
+ for (size_t i = 0; i < all_keys.size(); i++) {
+ ASSERT_EQ(all_keys[i].ToString(), data_iter->first);
+ data_iter++;
+ }
+ }
+
+ delete iter;
+  }
+};
+
+TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedNormal) {
+ PinnedDataIteratorRandomized(TestConfig::NORMAL);
+}
+
+TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedCLoseAndOpen) {
+ PinnedDataIteratorRandomized(TestConfig::CLOSE_AND_OPEN);
+}
+
+TEST_P(DBIteratorTestForPinnedData,
+ PinnedDataIteratorRandomizedCompactBeforeRead) {
+ PinnedDataIteratorRandomized(TestConfig::COMPACT_BEFORE_READ);
+}
+
+TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedFlush) {
+ PinnedDataIteratorRandomized(TestConfig::FLUSH_EVERY_1000);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(DBIteratorTest, PinnedDataIteratorMultipleFiles) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ table_options.use_delta_encoding = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 1024 * 1024 * 10; // 10 Mb
+ DestroyAndReopen(options);
+
+ std::map<std::string, std::string> true_data;
+
+ // Generate 4 sst files in L2
+ Random rnd(301);
+ for (int i = 1; i <= 1000; i++) {
+ std::string k = Key(i * 3);
+ std::string v = RandomString(&rnd, 100);
+ ASSERT_OK(Put(k, v));
+ true_data[k] = v;
+ if (i % 250 == 0) {
+ ASSERT_OK(Flush());
+ }
+ }
+ ASSERT_EQ(FilesPerLevel(0), "4");
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(FilesPerLevel(0), "0,4");
+
+ // Generate 4 sst files in L0
+ for (int i = 1; i <= 1000; i++) {
+ std::string k = Key(i * 2);
+ std::string v = RandomString(&rnd, 100);
+ ASSERT_OK(Put(k, v));
+ true_data[k] = v;
+ if (i % 250 == 0) {
+ ASSERT_OK(Flush());
+ }
+ }
+ ASSERT_EQ(FilesPerLevel(0), "4,4");
+
+ // Add some keys/values in memtables
+ for (int i = 1; i <= 1000; i++) {
+ std::string k = Key(i);
+ std::string v = RandomString(&rnd, 100);
+ ASSERT_OK(Put(k, v));
+ true_data[k] = v;
+ }
+ ASSERT_EQ(FilesPerLevel(0), "4,4");
+
+ ReadOptions ro;
+ ro.pin_data = true;
+ auto iter = NewIterator(ro);
+
+ std::vector<std::pair<Slice, std::string>> results;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ std::string prop_value;
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ results.emplace_back(iter->key(), iter->value().ToString());
+ }
+
+ ASSERT_EQ(results.size(), true_data.size());
+ auto data_iter = true_data.begin();
+ for (size_t i = 0; i < results.size(); i++, data_iter++) {
+ auto& kv = results[i];
+ ASSERT_EQ(kv.first, data_iter->first);
+ ASSERT_EQ(kv.second, data_iter->second);
+ }
+
+ delete iter;
+}
+#endif
+
+TEST_P(DBIteratorTest, PinnedDataIteratorMergeOperator) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ table_options.use_delta_encoding = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ DestroyAndReopen(options);
+
+ std::string numbers[7];
+ for (int val = 0; val <= 6; val++) {
+ PutFixed64(numbers + val, val);
+ }
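+  // numbers[i] holds the fixed-width 64-bit encoding of i; these are both the
+  // merge operands written below and the expected merged results.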
+
+ // +1 all keys in range [ 0 => 999]
+ for (int i = 0; i < 1000; i++) {
+ WriteOptions wo;
+ ASSERT_OK(db_->Merge(wo, Key(i), numbers[1]));
+ }
+
+ // +2 all keys divisible by 2 in range [ 0 => 999]
+ for (int i = 0; i < 1000; i += 2) {
+ WriteOptions wo;
+ ASSERT_OK(db_->Merge(wo, Key(i), numbers[2]));
+ }
+
+ // +3 all keys divisible by 5 in range [ 0 => 999]
+ for (int i = 0; i < 1000; i += 5) {
+ WriteOptions wo;
+ ASSERT_OK(db_->Merge(wo, Key(i), numbers[3]));
+ }
+
+ ReadOptions ro;
+ ro.pin_data = true;
+ auto iter = NewIterator(ro);
+
+ std::vector<std::pair<Slice, std::string>> results;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ std::string prop_value;
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ results.emplace_back(iter->key(), iter->value().ToString());
+ }
+
+ ASSERT_EQ(results.size(), 1000);
+ for (size_t i = 0; i < results.size(); i++) {
+ auto& kv = results[i];
+ ASSERT_EQ(kv.first, Key(static_cast<int>(i)));
+ int expected_val = 1;
+ if (i % 2 == 0) {
+ expected_val += 2;
+ }
+ if (i % 5 == 0) {
+ expected_val += 3;
+ }
+ ASSERT_EQ(kv.second, numbers[expected_val]);
+ }
+
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, PinnedDataIteratorReadAfterUpdate) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ table_options.use_delta_encoding = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.write_buffer_size = 100000;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+
+ std::map<std::string, std::string> true_data;
+ for (int i = 0; i < 1000; i++) {
+ std::string k = RandomString(&rnd, 10);
+ std::string v = RandomString(&rnd, 1000);
+ ASSERT_OK(Put(k, v));
+ true_data[k] = v;
+ }
+
+ ReadOptions ro;
+ ro.pin_data = true;
+ auto iter = NewIterator(ro);
+
+ // Delete 50% of the keys and update the other 50%
+ for (auto& kv : true_data) {
+ if (rnd.OneIn(2)) {
+ ASSERT_OK(Delete(kv.first));
+ } else {
+ std::string new_val = RandomString(&rnd, 1000);
+ ASSERT_OK(Put(kv.first, new_val));
+ }
+ }
+
+ std::vector<std::pair<Slice, std::string>> results;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ std::string prop_value;
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ results.emplace_back(iter->key(), iter->value().ToString());
+ }
+
+ auto data_iter = true_data.begin();
+ for (size_t i = 0; i < results.size(); i++, data_iter++) {
+ auto& kv = results[i];
+ ASSERT_EQ(kv.first, data_iter->first);
+ ASSERT_EQ(kv.second, data_iter->second);
+ }
+
+ delete iter;
+}
+
+class SliceTransformLimitedDomainGeneric : public SliceTransform {
+ const char* Name() const override {
+ return "SliceTransformLimitedDomainGeneric";
+ }
+
+ Slice Transform(const Slice& src) const override {
+ return Slice(src.data(), 1);
+ }
+
+ bool InDomain(const Slice& src) const override {
+    // any non-empty key is in the domain; its first byte is the prefix
+ return src.size() >= 1;
+ }
+
+ bool InRange(const Slice& dst) const override {
+    // a valid prefix is exactly one byte long
+ return dst.size() == 1;
+ }
+};
+
+TEST_P(DBIteratorTest, IterSeekForPrevCrossingFiles) {
+ Options options = CurrentOptions();
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.disable_auto_compactions = true;
+ // Enable prefix bloom for SST files
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("a1", "va1"));
+ ASSERT_OK(Put("a2", "va2"));
+ ASSERT_OK(Put("a3", "va3"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("b1", "vb1"));
+ ASSERT_OK(Put("b2", "vb2"));
+ ASSERT_OK(Put("b3", "vb3"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("b4", "vb4"));
+ ASSERT_OK(Put("d1", "vd1"));
+ ASSERT_OK(Put("d2", "vd2"));
+ ASSERT_OK(Put("d4", "vd4"));
+ ASSERT_OK(Flush());
+
+ MoveFilesToLevel(1);
+ {
+ ReadOptions ro;
+ Iterator* iter = NewIterator(ro);
+
+ iter->SeekForPrev("a4");
+ ASSERT_EQ(iter->key().ToString(), "a3");
+ ASSERT_EQ(iter->value().ToString(), "va3");
+
+ iter->SeekForPrev("c2");
+ ASSERT_EQ(iter->key().ToString(), "b3");
+ iter->SeekForPrev("d3");
+ ASSERT_EQ(iter->key().ToString(), "d2");
+ iter->SeekForPrev("b5");
+ ASSERT_EQ(iter->key().ToString(), "b4");
+ delete iter;
+ }
+
+ {
+ ReadOptions ro;
+ ro.prefix_same_as_start = true;
+ Iterator* iter = NewIterator(ro);
+ iter->SeekForPrev("c2");
+ ASSERT_TRUE(!iter->Valid());
+ delete iter;
+ }
+}
+
+TEST_P(DBIteratorTest, IterSeekForPrevCrossingFilesCustomPrefixExtractor) {
+ Options options = CurrentOptions();
+ options.prefix_extractor =
+ std::make_shared<SliceTransformLimitedDomainGeneric>();
+ options.disable_auto_compactions = true;
+ // Enable prefix bloom for SST files
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("a1", "va1"));
+ ASSERT_OK(Put("a2", "va2"));
+ ASSERT_OK(Put("a3", "va3"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("b1", "vb1"));
+ ASSERT_OK(Put("b2", "vb2"));
+ ASSERT_OK(Put("b3", "vb3"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("b4", "vb4"));
+ ASSERT_OK(Put("d1", "vd1"));
+ ASSERT_OK(Put("d2", "vd2"));
+ ASSERT_OK(Put("d4", "vd4"));
+ ASSERT_OK(Flush());
+
+ MoveFilesToLevel(1);
+ {
+ ReadOptions ro;
+ Iterator* iter = NewIterator(ro);
+
+ iter->SeekForPrev("a4");
+ ASSERT_EQ(iter->key().ToString(), "a3");
+ ASSERT_EQ(iter->value().ToString(), "va3");
+
+ iter->SeekForPrev("c2");
+ ASSERT_EQ(iter->key().ToString(), "b3");
+ iter->SeekForPrev("d3");
+ ASSERT_EQ(iter->key().ToString(), "d2");
+ iter->SeekForPrev("b5");
+ ASSERT_EQ(iter->key().ToString(), "b4");
+ delete iter;
+ }
+
+ {
+ ReadOptions ro;
+ ro.prefix_same_as_start = true;
+ Iterator* iter = NewIterator(ro);
+ iter->SeekForPrev("c2");
+ ASSERT_TRUE(!iter->Valid());
+ delete iter;
+ }
+}
+
+TEST_P(DBIteratorTest, IterPrevKeyCrossingBlocks) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 1; // every block will contain one entry
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
+ options.disable_auto_compactions = true;
+ options.max_sequential_skip_in_iterations = 8;
+
+ DestroyAndReopen(options);
+
+  // Putting such deletes will force DBIter::Prev() to fall back to a Seek
+ for (int file_num = 0; file_num < 10; file_num++) {
+ ASSERT_OK(Delete("key4"));
+ ASSERT_OK(Flush());
+ }
+
+ // First File containing 5 blocks of puts
+ ASSERT_OK(Put("key1", "val1.0"));
+ ASSERT_OK(Put("key2", "val2.0"));
+ ASSERT_OK(Put("key3", "val3.0"));
+ ASSERT_OK(Put("key4", "val4.0"));
+ ASSERT_OK(Put("key5", "val5.0"));
+ ASSERT_OK(Flush());
+
+ // Second file containing 9 blocks of merge operands
+ ASSERT_OK(db_->Merge(WriteOptions(), "key1", "val1.1"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key1", "val1.2"));
+
+ ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.1"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.2"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.3"));
+
+ ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.1"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.2"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.3"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.4"));
+ ASSERT_OK(Flush());
+
+ {
+ ReadOptions ro;
+ ro.fill_cache = false;
+ Iterator* iter = NewIterator(ro);
+
+ iter->SeekToLast();
+ ASSERT_EQ(iter->key().ToString(), "key5");
+ ASSERT_EQ(iter->value().ToString(), "val5.0");
+
+ iter->Prev();
+ ASSERT_EQ(iter->key().ToString(), "key4");
+ ASSERT_EQ(iter->value().ToString(), "val4.0");
+
+ iter->Prev();
+ ASSERT_EQ(iter->key().ToString(), "key3");
+ ASSERT_EQ(iter->value().ToString(), "val3.0,val3.1,val3.2,val3.3,val3.4");
+
+ iter->Prev();
+ ASSERT_EQ(iter->key().ToString(), "key2");
+ ASSERT_EQ(iter->value().ToString(), "val2.0,val2.1,val2.2,val2.3");
+
+ iter->Prev();
+ ASSERT_EQ(iter->key().ToString(), "key1");
+ ASSERT_EQ(iter->value().ToString(), "val1.0,val1.1,val1.2");
+
+ delete iter;
+ }
+}
+
+TEST_P(DBIteratorTest, IterPrevKeyCrossingBlocksRandomized) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
+ options.disable_auto_compactions = true;
+ options.level0_slowdown_writes_trigger = (1 << 30);
+ options.level0_stop_writes_trigger = (1 << 30);
+ options.max_sequential_skip_in_iterations = 8;
+ DestroyAndReopen(options);
+
+ const int kNumKeys = 500;
+  // Small number of merge operands to make sure that DBIter::Prev() doesn't
+  // fall back to Seek()
+ const int kNumMergeOperands = 3;
+  // Use a value size that will make sure that every block contains 1 key
+ const int kValSize =
+ static_cast<int>(BlockBasedTableOptions().block_size) * 4;
+  // Percentage of keys that won't get merge operations
+ const int kNoMergeOpPercentage = 20;
+ // Percentage of keys that will be deleted
+ const int kDeletePercentage = 10;
+
+ // For half of the key range we will write multiple deletes first to
+ // force DBIter::Prev() to fall back to Seek()
+ for (int file_num = 0; file_num < 10; file_num++) {
+ for (int i = 0; i < kNumKeys; i += 2) {
+ ASSERT_OK(Delete(Key(i)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ Random rnd(301);
+ std::map<std::string, std::string> true_data;
+ std::string gen_key;
+ std::string gen_val;
+
+ for (int i = 0; i < kNumKeys; i++) {
+ gen_key = Key(i);
+ gen_val = RandomString(&rnd, kValSize);
+
+ ASSERT_OK(Put(gen_key, gen_val));
+ true_data[gen_key] = gen_val;
+ }
+ ASSERT_OK(Flush());
+
+  // Keep values and merge operands in different files so that we
+  // make sure they are not merged during flush but are actually
+  // merged in the read path
+ for (int i = 0; i < kNumKeys; i++) {
+ if (rnd.PercentTrue(kNoMergeOpPercentage)) {
+      // Don't give merge operations for some keys
+ continue;
+ }
+
+ for (int j = 0; j < kNumMergeOperands; j++) {
+ gen_key = Key(i);
+ gen_val = RandomString(&rnd, kValSize);
+
+ ASSERT_OK(db_->Merge(WriteOptions(), gen_key, gen_val));
+ true_data[gen_key] += "," + gen_val;
+ }
+ }
+ ASSERT_OK(Flush());
+
+ for (int i = 0; i < kNumKeys; i++) {
+ if (rnd.PercentTrue(kDeletePercentage)) {
+ gen_key = Key(i);
+
+ ASSERT_OK(Delete(gen_key));
+ true_data.erase(gen_key);
+ }
+ }
+ ASSERT_OK(Flush());
+
+ {
+ ReadOptions ro;
+ ro.fill_cache = false;
+ Iterator* iter = NewIterator(ro);
+ auto data_iter = true_data.rbegin();
+
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ ASSERT_EQ(iter->key().ToString(), data_iter->first);
+ ASSERT_EQ(iter->value().ToString(), data_iter->second);
+ data_iter++;
+ }
+ ASSERT_EQ(data_iter, true_data.rend());
+
+ delete iter;
+ }
+
+ {
+ ReadOptions ro;
+ ro.fill_cache = false;
+ Iterator* iter = NewIterator(ro);
+ auto data_iter = true_data.rbegin();
+
+ int entries_right = 0;
+ std::string seek_key;
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ // Verify key/value of current position
+ ASSERT_EQ(iter->key().ToString(), data_iter->first);
+ ASSERT_EQ(iter->value().ToString(), data_iter->second);
+
+ bool restore_position_with_seek = rnd.Uniform(2);
+ if (restore_position_with_seek) {
+ seek_key = iter->key().ToString();
+ }
+
+      // Do some Next() operations, then restore the iterator to its original
+      // position
+ int next_count =
+ entries_right > 0 ? rnd.Uniform(std::min(entries_right, 10)) : 0;
+ for (int i = 0; i < next_count; i++) {
+ iter->Next();
+ data_iter--;
+
+ ASSERT_EQ(iter->key().ToString(), data_iter->first);
+ ASSERT_EQ(iter->value().ToString(), data_iter->second);
+ }
+
+ if (restore_position_with_seek) {
+        // Restore original position using Seek()
+ iter->Seek(seek_key);
+ for (int i = 0; i < next_count; i++) {
+ data_iter++;
+ }
+
+ ASSERT_EQ(iter->key().ToString(), data_iter->first);
+ ASSERT_EQ(iter->value().ToString(), data_iter->second);
+ } else {
+ // Restore original position using Prev()
+ for (int i = 0; i < next_count; i++) {
+ iter->Prev();
+ data_iter++;
+
+ ASSERT_EQ(iter->key().ToString(), data_iter->first);
+ ASSERT_EQ(iter->value().ToString(), data_iter->second);
+ }
+ }
+
+ entries_right++;
+ data_iter++;
+ }
+ ASSERT_EQ(data_iter, true_data.rend());
+
+ delete iter;
+ }
+}
+
+TEST_P(DBIteratorTest, IteratorWithLocalStatistics) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 1000; i++) {
+ // Key 10 bytes / Value 10 bytes
+ ASSERT_OK(Put(RandomString(&rnd, 10), RandomString(&rnd, 10)));
+ }
+
+ std::atomic<uint64_t> total_next(0);
+ std::atomic<uint64_t> total_next_found(0);
+ std::atomic<uint64_t> total_prev(0);
+ std::atomic<uint64_t> total_prev_found(0);
+ std::atomic<uint64_t> total_bytes(0);
+
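+  // Each reader thread tallies its own per-iterator counters; after all
+  // threads join, the totals must match the global tickers, i.e. the local
+  // iterator statistics must have been folded back into the DB statistics.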
+ std::vector<port::Thread> threads;
+ std::function<void()> reader_func_next = [&]() {
+ SetPerfLevel(kEnableCount);
+ get_perf_context()->Reset();
+ Iterator* iter = NewIterator(ReadOptions());
+
+ iter->SeekToFirst();
+ // Seek will bump ITER_BYTES_READ
+ uint64_t bytes = 0;
+ bytes += iter->key().size();
+ bytes += iter->value().size();
+ while (true) {
+ iter->Next();
+ total_next++;
+
+ if (!iter->Valid()) {
+ break;
+ }
+ total_next_found++;
+ bytes += iter->key().size();
+ bytes += iter->value().size();
+ }
+
+ delete iter;
+ ASSERT_EQ(bytes, get_perf_context()->iter_read_bytes);
+ SetPerfLevel(kDisable);
+ total_bytes += bytes;
+ };
+
+ std::function<void()> reader_func_prev = [&]() {
+ SetPerfLevel(kEnableCount);
+ Iterator* iter = NewIterator(ReadOptions());
+
+ iter->SeekToLast();
+ // Seek will bump ITER_BYTES_READ
+ uint64_t bytes = 0;
+ bytes += iter->key().size();
+ bytes += iter->value().size();
+ while (true) {
+ iter->Prev();
+ total_prev++;
+
+ if (!iter->Valid()) {
+ break;
+ }
+ total_prev_found++;
+ bytes += iter->key().size();
+ bytes += iter->value().size();
+ }
+
+ delete iter;
+ ASSERT_EQ(bytes, get_perf_context()->iter_read_bytes);
+ SetPerfLevel(kDisable);
+ total_bytes += bytes;
+ };
+
+ for (int i = 0; i < 10; i++) {
+ threads.emplace_back(reader_func_next);
+ }
+ for (int i = 0; i < 15; i++) {
+ threads.emplace_back(reader_func_prev);
+ }
+
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_NEXT), (uint64_t)total_next);
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_NEXT_FOUND),
+ (uint64_t)total_next_found);
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_PREV), (uint64_t)total_prev);
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_PREV_FOUND),
+ (uint64_t)total_prev_found);
+  ASSERT_EQ(TestGetTickerCount(options, ITER_BYTES_READ),
+            (uint64_t)total_bytes);
+}
+
+TEST_P(DBIteratorTest, ReadAhead) {
+ Options options;
+ env_->count_random_reads_ = true;
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 4 << 20;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 1024;
+ table_options.no_block_cache = true;
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ Reopen(options);
+
+ std::string value(1024, 'a');
+ for (int i = 0; i < 100; i++) {
+ Put(Key(i), value);
+ }
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(2);
+
+ for (int i = 0; i < 100; i++) {
+ Put(Key(i), value);
+ }
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(1);
+
+ for (int i = 0; i < 100; i++) {
+ Put(Key(i), value);
+ }
+ ASSERT_OK(Flush());
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("1,1,1", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
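+  // Baseline: measure file opens and bytes read for a plain SeekToFirst(),
+  // then repeat with readahead_size set and compare.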
+ env_->random_read_bytes_counter_ = 0;
+ options.statistics->setTickerCount(NO_FILE_OPENS, 0);
+ ReadOptions read_options;
+ auto* iter = NewIterator(read_options);
+ iter->SeekToFirst();
+ int64_t num_file_opens = TestGetTickerCount(options, NO_FILE_OPENS);
+ size_t bytes_read = env_->random_read_bytes_counter_;
+ delete iter;
+
+ int64_t num_file_closes = TestGetTickerCount(options, NO_FILE_CLOSES);
+ env_->random_read_bytes_counter_ = 0;
+ options.statistics->setTickerCount(NO_FILE_OPENS, 0);
+ read_options.readahead_size = 1024 * 10;
+ iter = NewIterator(read_options);
+ iter->SeekToFirst();
+ int64_t num_file_opens_readahead = TestGetTickerCount(options, NO_FILE_OPENS);
+ size_t bytes_read_readahead = env_->random_read_bytes_counter_;
+ delete iter;
+ int64_t num_file_closes_readahead =
+ TestGetTickerCount(options, NO_FILE_CLOSES);
+ ASSERT_EQ(num_file_opens, num_file_opens_readahead);
+ ASSERT_EQ(num_file_closes, num_file_closes_readahead);
+ ASSERT_GT(bytes_read_readahead, bytes_read);
+ ASSERT_GT(bytes_read_readahead, read_options.readahead_size * 3);
+
+ // Verify correctness.
+ iter = NewIterator(read_options);
+ int count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_EQ(value, iter->value());
+ count++;
+ }
+ ASSERT_EQ(100, count);
+ for (int i = 0; i < 100; i++) {
+ iter->Seek(Key(i));
+ ASSERT_EQ(value, iter->value());
+ }
+ delete iter;
+}
+
+// Insert a key, create a snapshot iterator, overwrite the key many times,
+// seek to a smaller key. Expect DBIter to fall back to a seek instead of
+// going through all the overwrites linearly.
+TEST_P(DBIteratorTest, DBIteratorSkipRecentDuplicatesTest) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.max_sequential_skip_in_iterations = 3;
+ options.prefix_extractor = nullptr;
+ options.write_buffer_size = 1 << 27; // big enough to avoid flush
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ // Insert.
+ ASSERT_OK(Put("b", "0"));
+
+ // Create iterator.
+ ReadOptions ro;
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ // Insert a lot.
+ for (int i = 0; i < 100; ++i) {
+ ASSERT_OK(Put("b", std::to_string(i + 1).c_str()));
+ }
+
+#ifndef ROCKSDB_LITE
+ // Check that memtable wasn't flushed.
+ std::string val;
+ ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level0", &val));
+ EXPECT_EQ("0", val);
+#endif
+
+ // Seek iterator to a smaller key.
+ get_perf_context()->Reset();
+ iter->Seek("a");
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ("b", iter->key().ToString());
+ EXPECT_EQ("0", iter->value().ToString());
+
+ // Check that the seek didn't do too much work.
+ // Checks are not tight, just make sure that everything is well below 100.
+ EXPECT_LT(get_perf_context()->internal_key_skipped_count, 4);
+ EXPECT_LT(get_perf_context()->internal_recent_skipped_count, 8);
+ EXPECT_LT(get_perf_context()->seek_on_memtable_count, 10);
+ EXPECT_LT(get_perf_context()->next_on_memtable_count, 10);
+ EXPECT_LT(get_perf_context()->prev_on_memtable_count, 10);
+
+ // Check that iterator did something like what we expect.
+ EXPECT_EQ(get_perf_context()->internal_delete_skipped_count, 0);
+ EXPECT_EQ(get_perf_context()->internal_merge_count, 0);
+ EXPECT_GE(get_perf_context()->internal_recent_skipped_count, 2);
+ EXPECT_GE(get_perf_context()->seek_on_memtable_count, 2);
+ EXPECT_EQ(1, options.statistics->getTickerCount(
+ NUMBER_OF_RESEEKS_IN_ITERATION));
+}
+
+TEST_P(DBIteratorTest, Refresh) {
+ ASSERT_OK(Put("x", "y"));
+
+ std::unique_ptr<Iterator> iter(NewIterator(ReadOptions()));
+ iter->Seek(Slice("a"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ ASSERT_OK(Put("c", "d"));
+
+ iter->Seek(Slice("a"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
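+  // Refresh() renews the iterator to the current state of the DB, so the "c"
+  // written after the iterator was created becomes visible below.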
+ iter->Refresh();
+
+ iter->Seek(Slice("a"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("c")), 0);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ dbfull()->Flush(FlushOptions());
+
+ ASSERT_OK(Put("m", "n"));
+
+ iter->Seek(Slice("a"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("c")), 0);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ iter->Refresh();
+
+ iter->Seek(Slice("a"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("c")), 0);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("m")), 0);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ iter.reset();
+}
+
+TEST_P(DBIteratorTest, RefreshWithSnapshot) {
+ ASSERT_OK(Put("x", "y"));
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ReadOptions options;
+ options.snapshot = snapshot;
+ Iterator* iter = NewIterator(options);
+
+ iter->Seek(Slice("a"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ ASSERT_OK(Put("c", "d"));
+
+ iter->Seek(Slice("a"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ Status s;
+ s = iter->Refresh();
+ ASSERT_TRUE(s.IsNotSupported());
+ db_->ReleaseSnapshot(snapshot);
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, CreationFailure) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::NewInternalIterator:StatusCallback", [](void* arg) {
+ *(reinterpret_cast<Status*>(arg)) = Status::Corruption("test status");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Iterator* iter = NewIterator(ReadOptions());
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsCorruption());
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, UpperBoundWithChangeDirection) {
+ Options options = CurrentOptions();
+ options.max_sequential_skip_in_iterations = 3;
+ DestroyAndReopen(options);
+
+ // write a bunch of kvs to the database.
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Put("y", "1"));
+ ASSERT_OK(Put("y1", "1"));
+ ASSERT_OK(Put("y2", "1"));
+ ASSERT_OK(Put("y3", "1"));
+ ASSERT_OK(Put("z", "1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Put("z", "1"));
+ ASSERT_OK(Put("bar", "1"));
+ ASSERT_OK(Put("foo", "1"));
+
+ std::string upper_bound = "x";
+ Slice ub_slice(upper_bound);
+ ReadOptions ro;
+ ro.iterate_upper_bound = &ub_slice;
+ ro.max_skippable_internal_keys = 1000;
+
+ Iterator* iter = NewIterator(ro);
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("bar", iter->key().ToString());
+
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, TableFilter) {
+ ASSERT_OK(Put("a", "1"));
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Put("b", "2"));
+ ASSERT_OK(Put("c", "3"));
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Put("d", "4"));
+ ASSERT_OK(Put("e", "5"));
+ ASSERT_OK(Put("f", "6"));
+ dbfull()->Flush(FlushOptions());
+
+ // Ensure the table_filter callback is called once for each table.
+ {
+ std::set<uint64_t> unseen{1, 2, 3};
+ ReadOptions opts;
+ opts.table_filter = [&](const TableProperties& props) {
+ auto it = unseen.find(props.num_entries);
+ if (it == unseen.end()) {
+ ADD_FAILURE() << "saw table properties with an unexpected "
+ << props.num_entries << " entries";
+ } else {
+ unseen.erase(it);
+ }
+ return true;
+ };
+ auto iter = NewIterator(opts);
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->1");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "b->2");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->3");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "d->4");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "e->5");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "f->6");
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(unseen.empty());
+ delete iter;
+ }
+
+ // Ensure returning false in the table_filter hides the keys from that table
+ // during iteration.
+ {
+ ReadOptions opts;
+ opts.table_filter = [](const TableProperties& props) {
+ return props.num_entries != 2;
+ };
+ auto iter = NewIterator(opts);
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->1");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "d->4");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "e->5");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "f->6");
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ delete iter;
+ }
+}
+
+TEST_P(DBIteratorTest, UpperBoundWithPrevReseek) {
+ Options options = CurrentOptions();
+ options.max_sequential_skip_in_iterations = 3;
+ DestroyAndReopen(options);
+
+ // write a bunch of kvs to the database.
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Put("y", "1"));
+ ASSERT_OK(Put("z", "1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Put("z", "1"));
+ ASSERT_OK(Put("bar", "1"));
+ ASSERT_OK(Put("foo", "1"));
+ ASSERT_OK(Put("foo", "2"));
+
+ ASSERT_OK(Put("foo", "3"));
+ ASSERT_OK(Put("foo", "4"));
+ ASSERT_OK(Put("foo", "5"));
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(Put("foo", "6"));
+
+ std::string upper_bound = "x";
+ Slice ub_slice(upper_bound);
+ ReadOptions ro;
+ ro.snapshot = snapshot;
+ ro.iterate_upper_bound = &ub_slice;
+
+ Iterator* iter = NewIterator(ro);
+ iter->SeekForPrev("goo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ iter->Prev();
+
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar", iter->key().ToString());
+
+ delete iter;
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_P(DBIteratorTest, SkipStatistics) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ int skip_count = 0;
+
+ // write a bunch of kvs to the database.
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Put("b", "1"));
+ ASSERT_OK(Put("c", "1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("d", "1"));
+ ASSERT_OK(Put("e", "1"));
+ ASSERT_OK(Put("f", "1"));
+ ASSERT_OK(Put("a", "2"));
+ ASSERT_OK(Put("b", "2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Delete("d"));
+ ASSERT_OK(Delete("e"));
+ ASSERT_OK(Delete("f"));
+
+ Iterator* iter = NewIterator(ReadOptions());
+ int count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ count++;
+ }
+ ASSERT_EQ(count, 3);
+ delete iter;
+ skip_count += 8; // 3 deletes + 3 original keys + 2 lower in sequence
+ ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP));
+
+ iter = NewIterator(ReadOptions());
+ count = 0;
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ ASSERT_OK(iter->status());
+ count++;
+ }
+ ASSERT_EQ(count, 3);
+ delete iter;
+ skip_count += 8; // Same as above, but in reverse order
+ ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP));
+
+ ASSERT_OK(Put("aa", "1"));
+ ASSERT_OK(Put("ab", "1"));
+ ASSERT_OK(Put("ac", "1"));
+ ASSERT_OK(Put("ad", "1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Delete("ab"));
+ ASSERT_OK(Delete("ac"));
+ ASSERT_OK(Delete("ad"));
+
+ ReadOptions ro;
+ Slice prefix("b");
+ ro.iterate_upper_bound = &prefix;
+
+ iter = NewIterator(ro);
+ count = 0;
+  for (iter->Seek("aa"); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ count++;
+ }
+ ASSERT_EQ(count, 1);
+ delete iter;
+ skip_count += 6; // 3 deletes + 3 original keys
+ ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP));
+
+ iter = NewIterator(ro);
+ count = 0;
+  for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ ASSERT_OK(iter->status());
+ count++;
+ }
+ ASSERT_EQ(count, 2);
+ delete iter;
+ // 3 deletes + 3 original keys + lower sequence of "a"
+ skip_count += 7;
+ ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP));
+}
+
+TEST_P(DBIteratorTest, SeekAfterHittingManyInternalKeys) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ ReadOptions ropts;
+ ropts.max_skippable_internal_keys = 2;
+
+ Put("1", "val_1");
+ // Add more tombstones than max_skippable_internal_keys so that Next() fails.
+ Delete("2");
+ Delete("3");
+ Delete("4");
+ Delete("5");
+ Put("6", "val_6");
+
+ std::unique_ptr<Iterator> iter(NewIterator(ropts));
+ iter->SeekToFirst();
+
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "1");
+ ASSERT_EQ(iter->value().ToString(), "val_1");
+
+ // This should fail as incomplete due to too many non-visible internal keys on
+ // the way to the next valid user key.
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_TRUE(iter->status().IsIncomplete());
+
+ // Get the internal key at which Next() failed.
+ std::string prop_value;
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value));
+ ASSERT_EQ("4", prop_value);
+
+ // Create a new iterator to seek to the internal key.
+ std::unique_ptr<Iterator> iter2(NewIterator(ropts));
+ iter2->Seek(prop_value);
+ ASSERT_TRUE(iter2->Valid());
+ ASSERT_OK(iter2->status());
+
+ ASSERT_EQ(iter2->key().ToString(), "6");
+ ASSERT_EQ(iter2->value().ToString(), "val_6");
+}
+
+// Reproduces a former bug where the iterator would skip some records when
+// DBIter re-seeks a subiterator with Incomplete status.
+TEST_P(DBIteratorTest, NonBlockingIterationBugRepro) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ // Make sure the sst file has more than one block.
+ table_options.flush_block_policy_factory =
+ std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ // Two records in sst file, each in its own block.
+ Put("b", "");
+ Put("d", "");
+ Flush();
+
+ // Create a nonblocking iterator before writing to memtable.
+ ReadOptions ropt;
+ ropt.read_tier = kBlockCacheTier;
+ std::unique_ptr<Iterator> iter(NewIterator(ropt));
+
+ // Overwrite a key in memtable many times to hit
+ // max_sequential_skip_in_iterations (which is 8 by default).
+ for (int i = 0; i < 20; ++i) {
+ Put("c", "");
+ }
+
+ // Load the second block in sst file into the block cache.
+ {
+ std::unique_ptr<Iterator> iter2(NewIterator(ReadOptions()));
+ iter2->Seek("d");
+ }
+
+ // Finally seek the nonblocking iterator.
+ iter->Seek("a");
+ // With the bug, the status used to be OK, and the iterator used to point to
+ // "d".
+ EXPECT_TRUE(iter->status().IsIncomplete());
+}
+
+TEST_P(DBIteratorTest, SeekBackwardAfterOutOfUpperBound) {
+ Put("a", "");
+ Put("b", "");
+ Flush();
+
+ ReadOptions ropt;
+ Slice ub = "b";
+ ropt.iterate_upper_bound = &ub;
+
+ std::unique_ptr<Iterator> it(dbfull()->NewIterator(ropt));
+ it->SeekForPrev("a");
+ ASSERT_TRUE(it->Valid());
+ ASSERT_OK(it->status());
+ ASSERT_EQ("a", it->key().ToString());
+ it->Next();
+ ASSERT_FALSE(it->Valid());
+ ASSERT_OK(it->status());
+ it->SeekForPrev("a");
+ ASSERT_OK(it->status());
+
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("a", it->key().ToString());
+}
+
+TEST_P(DBIteratorTest, AvoidReseekLevelIterator) {
+ Options options = CurrentOptions();
+ options.compression = CompressionType::kNoCompression;
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 800;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+
+ Random rnd(301);
+ std::string random_str = RandomString(&rnd, 180);
+
+ ASSERT_OK(Put("1", random_str));
+ ASSERT_OK(Put("2", random_str));
+ ASSERT_OK(Put("3", random_str));
+ ASSERT_OK(Put("4", random_str));
+ // A new block
+ ASSERT_OK(Put("5", random_str));
+ ASSERT_OK(Put("6", random_str));
+ ASSERT_OK(Put("7", random_str));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("8", random_str));
+ ASSERT_OK(Put("9", random_str));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ int num_find_file_in_level = 0;
+ int num_idx_blk_seek = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "LevelIterator::Seek:BeforeFindFile",
+ [&](void* /*arg*/) { num_find_file_in_level++; });
+ SyncPoint::GetInstance()->SetCallBack(
+ "IndexBlockIter::Seek:0", [&](void* /*arg*/) { num_idx_blk_seek++; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
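+  // Forward seeks that stay within the same file (and the same data block)
+  // should not repeat the LevelIterator file lookup or the index block seek;
+  // the two sync-point counters below track exactly that.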
+ {
+ std::unique_ptr<Iterator> iter(NewIterator(ReadOptions()));
+ iter->Seek("1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(1, num_find_file_in_level);
+ ASSERT_EQ(1, num_idx_blk_seek);
+
+ iter->Seek("2");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(1, num_find_file_in_level);
+ ASSERT_EQ(1, num_idx_blk_seek);
+
+ iter->Seek("3");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(1, num_find_file_in_level);
+ ASSERT_EQ(1, num_idx_blk_seek);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(1, num_find_file_in_level);
+ ASSERT_EQ(1, num_idx_blk_seek);
+
+ iter->Seek("5");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(1, num_find_file_in_level);
+ ASSERT_EQ(2, num_idx_blk_seek);
+
+ iter->Seek("6");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(1, num_find_file_in_level);
+ ASSERT_EQ(2, num_idx_blk_seek);
+
+ iter->Seek("7");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(1, num_find_file_in_level);
+ ASSERT_EQ(3, num_idx_blk_seek);
+
+ iter->Seek("8");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(2, num_find_file_in_level);
+ // Still re-seek because "8" is the boundary key, which has
+ // the same user key as the seek key.
+ ASSERT_EQ(4, num_idx_blk_seek);
+
+ iter->Seek("5");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(3, num_find_file_in_level);
+ ASSERT_EQ(5, num_idx_blk_seek);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(3, num_find_file_in_level);
+ ASSERT_EQ(5, num_idx_blk_seek);
+
+    // Seeking backward never allows the index block seek to be skipped
+ iter->Seek("5");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(3, num_find_file_in_level);
+ ASSERT_EQ(6, num_idx_blk_seek);
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// MyRocks may change iterate bounds before seek. Simply test to make sure such
+// usage doesn't break the iterator.
+TEST_P(DBIteratorTest, IterateBoundChangedBeforeSeek) {
+ Options options = CurrentOptions();
+ options.compression = CompressionType::kNoCompression;
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 100;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ std::string value(50, 'v');
+ Reopen(options);
+ ASSERT_OK(Put("aaa", value));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("bbb", "v"));
+ ASSERT_OK(Put("ccc", "v"));
+ ASSERT_OK(Put("ddd", "v"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("eee", "v"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ std::string ub1 = "e";
+ std::string ub2 = "c";
+ Slice ub(ub1);
+ ReadOptions read_opts1;
+ read_opts1.iterate_upper_bound = &ub;
+ Iterator* iter = NewIterator(read_opts1);
+  // Seek and iterate across the block boundary.
+ iter->Seek("b");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("bbb", iter->key());
+ ub = Slice(ub2);
+ iter->Seek("b");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("bbb", iter->key());
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ delete iter;
+
+ std::string lb1 = "a";
+ std::string lb2 = "c";
+ Slice lb(lb1);
+ ReadOptions read_opts2;
+ read_opts2.iterate_lower_bound = &lb;
+ iter = NewIterator(read_opts2);
+ iter->SeekForPrev("d");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("ccc", iter->key());
+ lb = Slice(lb2);
+ iter->SeekForPrev("d");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("ccc", iter->key());
+ iter->Prev();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, IterateWithLowerBoundAcrossFileBoundary) {
+ ASSERT_OK(Put("aaa", "v"));
+ ASSERT_OK(Put("bbb", "v"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("ccc", "v"));
+ ASSERT_OK(Put("ddd", "v"));
+ ASSERT_OK(Flush());
+ // Move both files to bottom level.
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ Slice lower_bound("b");
+ ReadOptions read_opts;
+ read_opts.iterate_lower_bound = &lower_bound;
+ std::unique_ptr<Iterator> iter(NewIterator(read_opts));
+ iter->SeekForPrev("d");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("ccc", iter->key());
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("bbb", iter->key());
+ iter->Prev();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+}
+
+INSTANTIATE_TEST_CASE_P(DBIteratorTestInstance, DBIteratorTest,
+ testing::Values(true, false));
+
+// Tests how DBIter work with ReadCallback
+class DBIteratorWithReadCallbackTest : public DBIteratorTest {};
+
+TEST_F(DBIteratorWithReadCallbackTest, ReadCallback) {
+ class TestReadCallback : public ReadCallback {
+ public:
+ explicit TestReadCallback(SequenceNumber _max_visible_seq)
+ : ReadCallback(_max_visible_seq) {}
+
+ bool IsVisibleFullCheck(SequenceNumber seq) override {
+ return seq <= max_visible_seq_;
+ }
+ };
+
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Put("foo", "v2"));
+ ASSERT_OK(Put("foo", "v3"));
+ ASSERT_OK(Put("a", "va"));
+ ASSERT_OK(Put("z", "vz"));
+ SequenceNumber seq1 = db_->GetLatestSequenceNumber();
+ TestReadCallback callback1(seq1);
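+  // callback1 only admits sequence numbers <= seq1, so the writes made after
+  // this point (v4, v5 of "foo" and "bar") are invisible to the iterator below.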
+ ASSERT_OK(Put("foo", "v4"));
+ ASSERT_OK(Put("foo", "v5"));
+ ASSERT_OK(Put("bar", "v7"));
+
+ SequenceNumber seq2 = db_->GetLatestSequenceNumber();
+ auto* cfd =
+ reinterpret_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())
+ ->cfd();
+  // The iterator is supposed to see data up to and including seq1.
+ Iterator* iter =
+ dbfull()->NewIteratorImpl(ReadOptions(), cfd, seq2, &callback1);
+
+ // Seek
+ // The latest value of "foo" before seq1 is "v3"
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("foo", iter->key());
+ ASSERT_EQ("v3", iter->value());
+ // "bar" is not visible to the iterator. It will move on to the next key
+ // "foo".
+ iter->Seek("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("foo", iter->key());
+ ASSERT_EQ("v3", iter->value());
+
+ // Next
+ // Seek to "a"
+ iter->Seek("a");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("va", iter->value());
+ // "bar" is not visible to the iterator. It will move on to the next key
+ // "foo".
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("foo", iter->key());
+ ASSERT_EQ("v3", iter->value());
+
+ // Prev
+ // Seek to "z"
+ iter->Seek("z");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("vz", iter->value());
+ // The previous key is "foo", which is visible to the iterator.
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("foo", iter->key());
+ ASSERT_EQ("v3", iter->value());
+ // "bar" is not visible to the iterator. It will move on to the next key "a".
+ iter->Prev(); // skipping "bar"
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("a", iter->key());
+ ASSERT_EQ("va", iter->value());
+
+ // SeekForPrev
+ // The previous key is "foo", which is visible to the iterator.
+ iter->SeekForPrev("y");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("foo", iter->key());
+ ASSERT_EQ("v3", iter->value());
+ // "bar" is not visible to the iterator. It will move on to the next key "a".
+ iter->SeekForPrev("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("a", iter->key());
+ ASSERT_EQ("va", iter->value());
+
+ delete iter;
+
+ // Prev beyond max_sequential_skip_in_iterations
+ uint64_t num_versions =
+ CurrentOptions().max_sequential_skip_in_iterations + 10;
+ for (uint64_t i = 0; i < num_versions; i++) {
+ ASSERT_OK(Put("bar", ToString(i)));
+ }
+ SequenceNumber seq3 = db_->GetLatestSequenceNumber();
+ TestReadCallback callback2(seq3);
+ ASSERT_OK(Put("bar", "v8"));
+ SequenceNumber seq4 = db_->GetLatestSequenceNumber();
+
+  // The iterator is supposed to see data up to and including seq3.
+ iter = dbfull()->NewIteratorImpl(ReadOptions(), cfd, seq4, &callback2);
+ // Seek to "z", which is visible.
+ iter->Seek("z");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("vz", iter->value());
+ // Previous key is "foo" and the last value "v5" is visible.
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("foo", iter->key());
+ ASSERT_EQ("v5", iter->value());
+  // Since the number of values of "bar" is more than
+  // max_sequential_skip_in_iterations, Prev() will ultimately fall back to a
+  // seek in the forward direction. Here we test that the fallback seek is
+  // correct. The last visible value should be (num_versions - 1), as "v8" is
+  // not visible.
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("bar", iter->key());
+ ASSERT_EQ(ToString(num_versions - 1), iter->value());
+
+ delete iter;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_log_iter_test.cc b/src/rocksdb/db/db_log_iter_test.cc
new file mode 100644
index 000000000..1f9ff0d45
--- /dev/null
+++ b/src/rocksdb/db/db_log_iter_test.cc
@@ -0,0 +1,294 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// Introduction of SyncPoint effectively disabled building and running this
+// test in Release builds, which is a pity because it is a good test.
+#if !defined(ROCKSDB_LITE)
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBTestXactLogIterator : public DBTestBase {
+ public:
+ DBTestXactLogIterator() : DBTestBase("/db_log_iter_test") {}
+
+ std::unique_ptr<TransactionLogIterator> OpenTransactionLogIter(
+ const SequenceNumber seq) {
+ std::unique_ptr<TransactionLogIterator> iter;
+ Status status = dbfull()->GetUpdatesSince(seq, &iter);
+ EXPECT_OK(status);
+ EXPECT_TRUE(iter->Valid());
+ return iter;
+ }
+};
+
+namespace {
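+// Drains the iterator, checking that batch sequence numbers strictly increase
+// and that the status stays OK. Sets `count` to the number of batches read and
+// returns the sequence number of the last batch.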
+SequenceNumber ReadRecords(
+ std::unique_ptr<TransactionLogIterator>& iter,
+ int& count) {
+ count = 0;
+ SequenceNumber lastSequence = 0;
+ BatchResult res;
+ while (iter->Valid()) {
+ res = iter->GetBatch();
+ EXPECT_TRUE(res.sequence > lastSequence);
+ ++count;
+ lastSequence = res.sequence;
+ EXPECT_OK(iter->status());
+ iter->Next();
+ }
+ return res.sequence;
+}
+
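+// Reads all remaining records and asserts that exactly `expected_no_records`
+// batches were seen.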
+void ExpectRecords(
+ const int expected_no_records,
+ std::unique_ptr<TransactionLogIterator>& iter) {
+ int num_records;
+ ReadRecords(iter, num_records);
+ ASSERT_EQ(num_records, expected_no_records);
+}
+} // namespace
+
+TEST_F(DBTestXactLogIterator, TransactionLogIterator) {
+ do {
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Put(0, "key1", DummyString(1024));
+ Put(1, "key2", DummyString(1024));
+ Put(1, "key2", DummyString(1024));
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3U);
+ {
+ auto iter = OpenTransactionLogIter(0);
+ ExpectRecords(3, iter);
+ }
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ env_->SleepForMicroseconds(2 * 1000 * 1000);
+ {
+ Put(0, "key4", DummyString(1024));
+ Put(1, "key5", DummyString(1024));
+ Put(0, "key6", DummyString(1024));
+ }
+ {
+ auto iter = OpenTransactionLogIter(0);
+ ExpectRecords(6, iter);
+ }
+ } while (ChangeCompactOptions());
+}
+
+#ifndef NDEBUG // sync points are not compiled in with -DNDEBUG (release) builds
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorRace) {
+ static const int LOG_ITERATOR_RACE_TEST_COUNT = 2;
+ static const char* sync_points[LOG_ITERATOR_RACE_TEST_COUNT][4] = {
+ {"WalManager::GetSortedWalFiles:1", "WalManager::PurgeObsoleteFiles:1",
+ "WalManager::PurgeObsoleteFiles:2", "WalManager::GetSortedWalFiles:2"},
+ {"WalManager::GetSortedWalsOfType:1",
+ "WalManager::PurgeObsoleteFiles:1",
+ "WalManager::PurgeObsoleteFiles:2",
+ "WalManager::GetSortedWalsOfType:2"}};
+ for (int test = 0; test < LOG_ITERATOR_RACE_TEST_COUNT; ++test) {
+    // Set up a sync point dependency to reproduce the race condition of a log
+    // file being moved to the archive dir in the middle of GetSortedWalFiles.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {sync_points[test][0], sync_points[test][1]},
+ {sync_points[test][2], sync_points[test][3]},
+ });
+
+ do {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ Put("key1", DummyString(1024));
+ dbfull()->Flush(FlushOptions());
+ Put("key2", DummyString(1024));
+ dbfull()->Flush(FlushOptions());
+ Put("key3", DummyString(1024));
+ dbfull()->Flush(FlushOptions());
+ Put("key4", DummyString(1024));
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4U);
+ dbfull()->FlushWAL(false);
+
+ {
+ auto iter = OpenTransactionLogIter(0);
+ ExpectRecords(4, iter);
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+      // Trigger an async flush and a log move. The log move will wait at
+      // GetSortedWalFiles:1 to reproduce the race condition.
+ FlushOptions flush_options;
+ flush_options.wait = false;
+ dbfull()->Flush(flush_options);
+
+ // "key5" would be written in a new memtable and log
+ Put("key5", DummyString(1024));
+ dbfull()->FlushWAL(false);
+ {
+        // Without the fix, this iterator would miss "key4".
+ auto iter = OpenTransactionLogIter(0);
+ ExpectRecords(5, iter);
+ }
+ } while (ChangeCompactOptions());
+ }
+}
+#endif
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorStallAtLastRecord) {
+ do {
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ Put("key1", DummyString(1024));
+ auto iter = OpenTransactionLogIter(0);
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_OK(iter->status());
+ Put("key2", DummyString(1024));
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorCheckAfterRestart) {
+ do {
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ Put("key1", DummyString(1024));
+ Put("key2", DummyString(1023));
+ dbfull()->Flush(FlushOptions());
+ Reopen(options);
+ auto iter = OpenTransactionLogIter(0);
+ ExpectRecords(2, iter);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorCorruptedLog) {
+ do {
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ for (int i = 0; i < 1024; i++) {
+ Put("key"+ToString(i), DummyString(10));
+ }
+ dbfull()->Flush(FlushOptions());
+ dbfull()->FlushWAL(false);
+ // Corrupt this log to create a gap
+ ROCKSDB_NAMESPACE::VectorLogPtr wal_files;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+ const auto logfile_path = dbname_ + "/" + wal_files.front()->PathName();
+ if (mem_env_) {
+ mem_env_->Truncate(logfile_path, wal_files.front()->SizeFileBytes() / 2);
+ } else {
+ ASSERT_EQ(0, truncate(logfile_path.c_str(),
+ wal_files.front()->SizeFileBytes() / 2));
+ }
+
+ // Insert a new entry to a new log file
+ Put("key1025", DummyString(10));
+ dbfull()->FlushWAL(false);
+ // Try to read from the beginning. Should stop before the gap and read less
+ // than 1025 entries
+ auto iter = OpenTransactionLogIter(0);
+ int count;
+ SequenceNumber last_sequence_read = ReadRecords(iter, count);
+ ASSERT_LT(last_sequence_read, 1025U);
+ // Try to read past the gap, should be able to seek to key1025
+ auto iter2 = OpenTransactionLogIter(last_sequence_read + 1);
+ ExpectRecords(1, iter2);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorBatchOperations) {
+ do {
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ WriteBatch batch;
+ batch.Put(handles_[1], "key1", DummyString(1024));
+ batch.Put(handles_[0], "key2", DummyString(1024));
+ batch.Put(handles_[1], "key3", DummyString(1024));
+ batch.Delete(handles_[0], "key2");
+ dbfull()->Write(WriteOptions(), &batch);
+ Flush(1);
+ Flush(0);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ Put(1, "key4", DummyString(1024));
+ auto iter = OpenTransactionLogIter(3);
+ ExpectRecords(2, iter);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorBlobs) {
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ {
+ WriteBatch batch;
+ batch.Put(handles_[1], "key1", DummyString(1024));
+ batch.Put(handles_[0], "key2", DummyString(1024));
+ batch.PutLogData(Slice("blob1"));
+ batch.Put(handles_[1], "key3", DummyString(1024));
+ batch.PutLogData(Slice("blob2"));
+ batch.Delete(handles_[0], "key2");
+ dbfull()->Write(WriteOptions(), &batch);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ }
+
+ auto res = OpenTransactionLogIter(0)->GetBatch();
+ struct Handler : public WriteBatch::Handler {
+ std::string seen;
+ Status PutCF(uint32_t cf, const Slice& key, const Slice& value) override {
+ seen += "Put(" + ToString(cf) + ", " + key.ToString() + ", " +
+ ToString(value.size()) + ")";
+ return Status::OK();
+ }
+ Status MergeCF(uint32_t cf, const Slice& key, const Slice& value) override {
+ seen += "Merge(" + ToString(cf) + ", " + key.ToString() + ", " +
+ ToString(value.size()) + ")";
+ return Status::OK();
+ }
+ void LogData(const Slice& blob) override {
+ seen += "LogData(" + blob.ToString() + ")";
+ }
+ Status DeleteCF(uint32_t cf, const Slice& key) override {
+ seen += "Delete(" + ToString(cf) + ", " + key.ToString() + ")";
+ return Status::OK();
+ }
+ } handler;
+ res.writeBatchPtr->Iterate(&handler);
+ ASSERT_EQ(
+ "Put(1, key1, 1024)"
+ "Put(0, key2, 1024)"
+ "LogData(blob1)"
+ "Put(1, key3, 1024)"
+ "LogData(blob2)"
+ "Delete(0, key2)",
+ handler.seen);
+}
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !defined(ROCKSDB_LITE)
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+#else
+ (void) argc;
+ (void) argv;
+ return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_memtable_test.cc b/src/rocksdb/db/db_memtable_test.cc
new file mode 100644
index 000000000..a2f4e327c
--- /dev/null
+++ b/src/rocksdb/db/db_memtable_test.cc
@@ -0,0 +1,340 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <memory>
+#include <string>
+
+#include "db/db_test_util.h"
+#include "db/memtable.h"
+#include "db/range_del_aggregator.h"
+#include "port/stack_trace.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/slice_transform.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBMemTableTest : public DBTestBase {
+ public:
+ DBMemTableTest() : DBTestBase("/db_memtable_test") {}
+};
+
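+// Wraps a real MemTableRep (a skip list in these tests), forwarding all calls
+// to it while recording how many InsertWithHint calls were made and the hint
+// values passed in and out.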
+class MockMemTableRep : public MemTableRep {
+ public:
+ explicit MockMemTableRep(Allocator* allocator, MemTableRep* rep)
+ : MemTableRep(allocator), rep_(rep), num_insert_with_hint_(0) {}
+
+ KeyHandle Allocate(const size_t len, char** buf) override {
+ return rep_->Allocate(len, buf);
+ }
+
+ void Insert(KeyHandle handle) override { rep_->Insert(handle); }
+
+ void InsertWithHint(KeyHandle handle, void** hint) override {
+ num_insert_with_hint_++;
+ EXPECT_NE(nullptr, hint);
+ last_hint_in_ = *hint;
+ rep_->InsertWithHint(handle, hint);
+ last_hint_out_ = *hint;
+ }
+
+ bool Contains(const char* key) const override { return rep_->Contains(key); }
+
+ void Get(const LookupKey& k, void* callback_args,
+ bool (*callback_func)(void* arg, const char* entry)) override {
+ rep_->Get(k, callback_args, callback_func);
+ }
+
+ size_t ApproximateMemoryUsage() override {
+ return rep_->ApproximateMemoryUsage();
+ }
+
+ Iterator* GetIterator(Arena* arena) override {
+ return rep_->GetIterator(arena);
+ }
+
+ void* last_hint_in() { return last_hint_in_; }
+ void* last_hint_out() { return last_hint_out_; }
+ int num_insert_with_hint() { return num_insert_with_hint_; }
+
+ private:
+ std::unique_ptr<MemTableRep> rep_;
+ void* last_hint_in_;
+ void* last_hint_out_;
+ int num_insert_with_hint_;
+};
+
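+// Creates skip-list reps wrapped in MockMemTableRep and remembers the column
+// family id of the most recent CreateMemTableRep call.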
+class MockMemTableRepFactory : public MemTableRepFactory {
+ public:
+ MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator& cmp,
+ Allocator* allocator,
+ const SliceTransform* transform,
+ Logger* logger) override {
+ SkipListFactory factory;
+ MemTableRep* skiplist_rep =
+ factory.CreateMemTableRep(cmp, allocator, transform, logger);
+ mock_rep_ = new MockMemTableRep(allocator, skiplist_rep);
+ return mock_rep_;
+ }
+
+ MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator& cmp,
+ Allocator* allocator,
+ const SliceTransform* transform,
+ Logger* logger,
+ uint32_t column_family_id) override {
+ last_column_family_id_ = column_family_id;
+ return CreateMemTableRep(cmp, allocator, transform, logger);
+ }
+
+ const char* Name() const override { return "MockMemTableRepFactory"; }
+
+ MockMemTableRep* rep() { return mock_rep_; }
+
+ bool IsInsertConcurrentlySupported() const override { return false; }
+
+ uint32_t GetLastColumnFamilyId() { return last_column_family_id_; }
+
+ private:
+ MockMemTableRep* mock_rep_;
+ // workaround since there's no port::kMaxUint32 yet.
+ uint32_t last_column_family_id_ = static_cast<uint32_t>(-1);
+};
+
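+// Treats everything up to and including the first '_' as the key's prefix;
+// keys without an underscore are outside the prefix domain.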
+class TestPrefixExtractor : public SliceTransform {
+ public:
+ const char* Name() const override { return "TestPrefixExtractor"; }
+
+ Slice Transform(const Slice& key) const override {
+ const char* p = separator(key);
+ if (p == nullptr) {
+ return Slice();
+ }
+ return Slice(key.data(), p - key.data() + 1);
+ }
+
+ bool InDomain(const Slice& key) const override {
+ return separator(key) != nullptr;
+ }
+
+ bool InRange(const Slice& /*key*/) const override { return false; }
+
+ private:
+ const char* separator(const Slice& key) const {
+ return reinterpret_cast<const char*>(memchr(key.data(), '_', key.size()));
+ }
+};
+
+// Test that ::Add properly returns false when inserting duplicate keys
+TEST_F(DBMemTableTest, DuplicateSeq) {
+ SequenceNumber seq = 123;
+ std::string value;
+ Status s;
+ MergeContext merge_context;
+ Options options;
+ InternalKeyComparator ikey_cmp(options.comparator);
+ ReadRangeDelAggregator range_del_agg(&ikey_cmp,
+ kMaxSequenceNumber /* upper_bound */);
+
+ // Create a MemTable
+ InternalKeyComparator cmp(BytewiseComparator());
+ auto factory = std::make_shared<SkipListFactory>();
+ options.memtable_factory = factory;
+ ImmutableCFOptions ioptions(options);
+ WriteBufferManager wb(options.db_write_buffer_size);
+ MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+
+ // Write some keys and make sure it returns false on duplicates
+ bool res;
+ res = mem->Add(seq, kTypeValue, "key", "value2");
+ ASSERT_TRUE(res);
+ res = mem->Add(seq, kTypeValue, "key", "value2");
+ ASSERT_FALSE(res);
+  // Changing only the type should still be treated as a duplicate key
+ res = mem->Add(seq, kTypeMerge, "key", "value2");
+ ASSERT_FALSE(res);
+ // Changing the seq number will make the key fresh
+ res = mem->Add(seq + 1, kTypeMerge, "key", "value2");
+ ASSERT_TRUE(res);
+ // Test with different types for duplicate keys
+ res = mem->Add(seq, kTypeDeletion, "key", "");
+ ASSERT_FALSE(res);
+ res = mem->Add(seq, kTypeSingleDeletion, "key", "");
+ ASSERT_FALSE(res);
+
+ // Test the duplicate keys under stress
+ for (int i = 0; i < 10000; i++) {
+ bool insert_dup = i % 10 == 1;
+ if (!insert_dup) {
+ seq++;
+ }
+ res = mem->Add(seq, kTypeValue, "foo", "value" + ToString(seq));
+ if (insert_dup) {
+ ASSERT_FALSE(res);
+ } else {
+ ASSERT_TRUE(res);
+ }
+ }
+ delete mem;
+
+ // Test with InsertWithHint
+ options.memtable_insert_with_hint_prefix_extractor.reset(
+ new TestPrefixExtractor()); // which uses _ to extract the prefix
+ ioptions = ImmutableCFOptions(options);
+ mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ // Insert a duplicate key with _ in it
+ res = mem->Add(seq, kTypeValue, "key_1", "value");
+ ASSERT_TRUE(res);
+ res = mem->Add(seq, kTypeValue, "key_1", "value");
+ ASSERT_FALSE(res);
+ delete mem;
+
+ // Test when InsertConcurrently will be invoked
+ options.allow_concurrent_memtable_write = true;
+ ioptions = ImmutableCFOptions(options);
+ mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ MemTablePostProcessInfo post_process_info;
+ res = mem->Add(seq, kTypeValue, "key", "value", true, &post_process_info);
+ ASSERT_TRUE(res);
+ res = mem->Add(seq, kTypeValue, "key", "value", true, &post_process_info);
+ ASSERT_FALSE(res);
+ delete mem;
+}
+
+// A simple test to verify that concurrent merge writes are functional
+TEST_F(DBMemTableTest, ConcurrentMergeWrite) {
+ int num_ops = 1000;
+ std::string value;
+ Status s;
+ MergeContext merge_context;
+ Options options;
+  // A merge operator that is not sensitive to concurrent writes, since this
+  // test does not order the writes.
+ options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+
+ // Create a MemTable
+ InternalKeyComparator cmp(BytewiseComparator());
+ auto factory = std::make_shared<SkipListFactory>();
+ options.memtable_factory = factory;
+ options.allow_concurrent_memtable_write = true;
+ ImmutableCFOptions ioptions(options);
+ WriteBufferManager wb(options.db_write_buffer_size);
+ MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+
+ // Put 0 as the base
+ PutFixed64(&value, static_cast<uint64_t>(0));
+ bool res = mem->Add(0, kTypeValue, "key", value);
+ ASSERT_TRUE(res);
+ value.clear();
+
+ // Write Merge concurrently
+ ROCKSDB_NAMESPACE::port::Thread write_thread1([&]() {
+ MemTablePostProcessInfo post_process_info1;
+ std::string v1;
+ for (int seq = 1; seq < num_ops / 2; seq++) {
+ PutFixed64(&v1, seq);
+ bool res1 =
+ mem->Add(seq, kTypeMerge, "key", v1, true, &post_process_info1);
+ ASSERT_TRUE(res1);
+ v1.clear();
+ }
+ });
+ ROCKSDB_NAMESPACE::port::Thread write_thread2([&]() {
+ MemTablePostProcessInfo post_process_info2;
+ std::string v2;
+ for (int seq = num_ops / 2; seq < num_ops; seq++) {
+ PutFixed64(&v2, seq);
+ bool res2 =
+ mem->Add(seq, kTypeMerge, "key", v2, true, &post_process_info2);
+ ASSERT_TRUE(res2);
+ v2.clear();
+ }
+ });
+ write_thread1.join();
+ write_thread2.join();
+
+ Status status;
+ ReadOptions roptions;
+ SequenceNumber max_covering_tombstone_seq = 0;
+ LookupKey lkey("key", kMaxSequenceNumber);
+ res = mem->Get(lkey, &value, &status, &merge_context,
+ &max_covering_tombstone_seq, roptions);
+ ASSERT_TRUE(res);
+ uint64_t ivalue = DecodeFixed64(Slice(value).data());
+ uint64_t sum = 0;
+ for (int seq = 0; seq < num_ops; seq++) {
+ sum += seq;
+ }
+ ASSERT_EQ(ivalue, sum);
+
+ delete mem;
+}
+
+TEST_F(DBMemTableTest, InsertWithHint) {
+ Options options;
+ options.allow_concurrent_memtable_write = false;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(new MockMemTableRepFactory());
+ options.memtable_insert_with_hint_prefix_extractor.reset(
+ new TestPrefixExtractor());
+ options.env = env_;
+ Reopen(options);
+ MockMemTableRep* rep =
+ reinterpret_cast<MockMemTableRepFactory*>(options.memtable_factory.get())
+ ->rep();
+ ASSERT_OK(Put("foo_k1", "foo_v1"));
+ ASSERT_EQ(nullptr, rep->last_hint_in());
+ void* hint_foo = rep->last_hint_out();
+ ASSERT_OK(Put("foo_k2", "foo_v2"));
+ ASSERT_EQ(hint_foo, rep->last_hint_in());
+ ASSERT_EQ(hint_foo, rep->last_hint_out());
+ ASSERT_OK(Put("foo_k3", "foo_v3"));
+ ASSERT_EQ(hint_foo, rep->last_hint_in());
+ ASSERT_EQ(hint_foo, rep->last_hint_out());
+ ASSERT_OK(Put("bar_k1", "bar_v1"));
+ ASSERT_EQ(nullptr, rep->last_hint_in());
+ void* hint_bar = rep->last_hint_out();
+ ASSERT_NE(hint_foo, hint_bar);
+ ASSERT_OK(Put("bar_k2", "bar_v2"));
+ ASSERT_EQ(hint_bar, rep->last_hint_in());
+ ASSERT_EQ(hint_bar, rep->last_hint_out());
+ ASSERT_EQ(5, rep->num_insert_with_hint());
+ ASSERT_OK(Put("whitelisted", "vvv"));
+ ASSERT_EQ(5, rep->num_insert_with_hint());
+ ASSERT_EQ("foo_v1", Get("foo_k1"));
+ ASSERT_EQ("foo_v2", Get("foo_k2"));
+ ASSERT_EQ("foo_v3", Get("foo_k3"));
+ ASSERT_EQ("bar_v1", Get("bar_k1"));
+ ASSERT_EQ("bar_v2", Get("bar_k2"));
+ ASSERT_EQ("vvv", Get("whitelisted"));
+}
+
+TEST_F(DBMemTableTest, ColumnFamilyId) {
+ // Verifies MemTableRepFactory is told the right column family id.
+ Options options;
+ options.allow_concurrent_memtable_write = false;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(new MockMemTableRepFactory());
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ for (uint32_t cf = 0; cf < 2; ++cf) {
+ ASSERT_OK(Put(cf, "key", "val"));
+ ASSERT_OK(Flush(cf));
+ ASSERT_EQ(
+ cf, static_cast<MockMemTableRepFactory*>(options.memtable_factory.get())
+ ->GetLastColumnFamilyId());
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_merge_operand_test.cc b/src/rocksdb/db/db_merge_operand_test.cc
new file mode 100644
index 000000000..a0ab34e01
--- /dev/null
+++ b/src/rocksdb/db/db_merge_operand_test.cc
@@ -0,0 +1,240 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/utilities/debug.h"
+#include "table/block_based/block_builder.h"
+#include "test_util/fault_injection_test_env.h"
+#if !defined(ROCKSDB_LITE)
+#include "test_util/sync_point.h"
+#endif
+#include "rocksdb/merge_operator.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/sortlist.h"
+#include "utilities/merge_operators/string_append/stringappend2.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBMergeOperandTest : public DBTestBase {
+ public:
+ DBMergeOperandTest() : DBTestBase("/db_merge_operand_test") {}
+};
+
+TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) {
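+  // A string-append operator whose ShouldMerge() fires once `limit_` operands
+  // have accumulated, so reads only consider the newest `limit_` operands.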
+ class LimitedStringAppendMergeOp : public StringAppendTESTOperator {
+ public:
+ LimitedStringAppendMergeOp(int limit, char delim)
+ : StringAppendTESTOperator(delim), limit_(limit) {}
+
+ const char* Name() const override {
+ return "DBMergeOperatorTest::LimitedStringAppendMergeOp";
+ }
+
+ bool ShouldMerge(const std::vector<Slice>& operands) const override {
+ if (operands.size() > 0 && limit_ > 0 && operands.size() >= limit_) {
+ return true;
+ }
+ return false;
+ }
+
+ private:
+ size_t limit_ = 0;
+ };
+
+ Options options;
+ options.create_if_missing = true;
+ // Use only the latest two merge operands.
+ options.merge_operator = std::make_shared<LimitedStringAppendMergeOp>(2, ',');
+ options.env = env_;
+ Reopen(options);
+ int num_records = 4;
+ int number_of_operands = 0;
+ std::vector<PinnableSlice> values(num_records);
+ GetMergeOperandsOptions merge_operands_info;
+ merge_operands_info.expected_max_number_of_operands = num_records;
+
+ // k0 value in memtable
+ Put("k0", "PutARock");
+ db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k0",
+ values.data(), &merge_operands_info,
+ &number_of_operands);
+ ASSERT_EQ(values[0], "PutARock");
+
+ // k0.1 value in SST
+ Put("k0.1", "RockInSST");
+ ASSERT_OK(Flush());
+ db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k0.1",
+ values.data(), &merge_operands_info,
+ &number_of_operands);
+ ASSERT_EQ(values[0], "RockInSST");
+
+ // All k1 values are in memtable.
+ ASSERT_OK(Merge("k1", "a"));
+ Put("k1", "x");
+ ASSERT_OK(Merge("k1", "b"));
+ ASSERT_OK(Merge("k1", "c"));
+ ASSERT_OK(Merge("k1", "d"));
+ db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1",
+ values.data(), &merge_operands_info,
+ &number_of_operands);
+ ASSERT_EQ(values[0], "x");
+ ASSERT_EQ(values[1], "b");
+ ASSERT_EQ(values[2], "c");
+ ASSERT_EQ(values[3], "d");
+
+  // expected_max_number_of_operands is less than the number of merge operands,
+  // so the status should be Incomplete.
+ merge_operands_info.expected_max_number_of_operands = num_records - 1;
+ Status status = db_->GetMergeOperands(
+ ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(),
+ &merge_operands_info, &number_of_operands);
+ ASSERT_EQ(status.IsIncomplete(), true);
+ merge_operands_info.expected_max_number_of_operands = num_records;
+
+ // All k1.1 values are in memtable.
+ ASSERT_OK(Merge("k1.1", "r"));
+ Delete("k1.1");
+ ASSERT_OK(Merge("k1.1", "c"));
+ ASSERT_OK(Merge("k1.1", "k"));
+ ASSERT_OK(Merge("k1.1", "s"));
+ db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1.1",
+ values.data(), &merge_operands_info,
+ &number_of_operands);
+ ASSERT_EQ(values[0], "c");
+ ASSERT_EQ(values[1], "k");
+ ASSERT_EQ(values[2], "s");
+
+ // All k2 values are flushed to L0 into a single file.
+ ASSERT_OK(Merge("k2", "q"));
+ ASSERT_OK(Merge("k2", "w"));
+ ASSERT_OK(Merge("k2", "e"));
+ ASSERT_OK(Merge("k2", "r"));
+ ASSERT_OK(Flush());
+ db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2",
+ values.data(), &merge_operands_info,
+ &number_of_operands);
+ ASSERT_EQ(values[0], "q");
+ ASSERT_EQ(values[1], "w");
+ ASSERT_EQ(values[2], "e");
+ ASSERT_EQ(values[3], "r");
+
+ // All k2.1 values are flushed to L0 into a single file.
+ ASSERT_OK(Merge("k2.1", "m"));
+ Put("k2.1", "l");
+ ASSERT_OK(Merge("k2.1", "n"));
+ ASSERT_OK(Merge("k2.1", "o"));
+ ASSERT_OK(Flush());
+ db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2.1",
+ values.data(), &merge_operands_info,
+ &number_of_operands);
+ ASSERT_EQ(values[0], "l,n,o");
+
+ // All k2.2 values are flushed to L0 into a single file.
+ ASSERT_OK(Merge("k2.2", "g"));
+ Delete("k2.2");
+ ASSERT_OK(Merge("k2.2", "o"));
+ ASSERT_OK(Merge("k2.2", "t"));
+ ASSERT_OK(Flush());
+ db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2.2",
+ values.data(), &merge_operands_info,
+ &number_of_operands);
+ ASSERT_EQ(values[0], "o,t");
+
+ // Do some compaction that will make the following tests more predictable
+ // Slice start("PutARock");
+ // Slice end("t");
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ // All k3 values are flushed and are in different files.
+ ASSERT_OK(Merge("k3", "ab"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "bc"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "cd"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "de"));
+ db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3",
+ values.data(), &merge_operands_info,
+ &number_of_operands);
+ ASSERT_EQ(values[0], "ab");
+ ASSERT_EQ(values[1], "bc");
+ ASSERT_EQ(values[2], "cd");
+ ASSERT_EQ(values[3], "de");
+
+ // All k3.1 values are flushed and are in different files.
+ ASSERT_OK(Merge("k3.1", "ab"));
+ ASSERT_OK(Flush());
+ Put("k3.1", "bc");
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3.1", "cd"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3.1", "de"));
+ db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3.1",
+ values.data(), &merge_operands_info,
+ &number_of_operands);
+ ASSERT_EQ(values[0], "bc");
+ ASSERT_EQ(values[1], "cd");
+ ASSERT_EQ(values[2], "de");
+
+ // All k3.2 values are flushed and are in different files.
+ ASSERT_OK(Merge("k3.2", "ab"));
+ ASSERT_OK(Flush());
+ Delete("k3.2");
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3.2", "cd"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3.2", "de"));
+ db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3.2",
+ values.data(), &merge_operands_info,
+ &number_of_operands);
+ ASSERT_EQ(values[0], "cd");
+ ASSERT_EQ(values[1], "de");
+
+ // All K4 values are in different levels
+ ASSERT_OK(Merge("k4", "ba"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(4);
+ ASSERT_OK(Merge("k4", "cb"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(3);
+ ASSERT_OK(Merge("k4", "dc"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(1);
+ ASSERT_OK(Merge("k4", "ed"));
+ db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k4",
+ values.data(), &merge_operands_info,
+ &number_of_operands);
+ ASSERT_EQ(values[0], "ba");
+ ASSERT_EQ(values[1], "cb");
+ ASSERT_EQ(values[2], "dc");
+ ASSERT_EQ(values[3], "ed");
+
+  // The first 3 k5 values are in an SST file and the next 4 k5 values are in
+  // the immutable memtable.
+ ASSERT_OK(Merge("k5", "who"));
+ ASSERT_OK(Merge("k5", "am"));
+ ASSERT_OK(Merge("k5", "i"));
+ ASSERT_OK(Flush());
+ Put("k5", "remember");
+ ASSERT_OK(Merge("k5", "i"));
+ ASSERT_OK(Merge("k5", "am"));
+ ASSERT_OK(Merge("k5", "rocks"));
+ dbfull()->TEST_SwitchMemtable();
+ db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k5",
+ values.data(), &merge_operands_info,
+ &number_of_operands);
+ ASSERT_EQ(values[0], "remember");
+ ASSERT_EQ(values[1], "i");
+ ASSERT_EQ(values[2], "am");
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_merge_operator_test.cc b/src/rocksdb/db/db_merge_operator_test.cc
new file mode 100644
index 000000000..4f762468d
--- /dev/null
+++ b/src/rocksdb/db/db_merge_operator_test.cc
@@ -0,0 +1,666 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#include <string>
+#include <vector>
+
+#include "db/db_test_util.h"
+#include "db/forward_iterator.h"
+#include "port/stack_trace.h"
+#include "rocksdb/merge_operator.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/string_append/stringappend2.h"
+
+namespace ROCKSDB_NAMESPACE {
+
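+// A ReadCallback that defers visibility decisions to a SnapshotChecker for the
+// given snapshot sequence number.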
+class TestReadCallback : public ReadCallback {
+ public:
+ TestReadCallback(SnapshotChecker* snapshot_checker,
+ SequenceNumber snapshot_seq)
+ : ReadCallback(snapshot_seq),
+ snapshot_checker_(snapshot_checker),
+ snapshot_seq_(snapshot_seq) {}
+
+ bool IsVisibleFullCheck(SequenceNumber seq) override {
+ return snapshot_checker_->CheckInSnapshot(seq, snapshot_seq_) ==
+ SnapshotCheckerResult::kInSnapshot;
+ }
+
+ private:
+ SnapshotChecker* snapshot_checker_;
+ SequenceNumber snapshot_seq_;
+};
+
+// Test merge operator functionality.
+class DBMergeOperatorTest : public DBTestBase {
+ public:
+ DBMergeOperatorTest() : DBTestBase("/db_merge_operator_test") {}
+
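+  // Gets `key` via DBImpl::GetImpl with a TestReadCallback installed, so
+  // visibility is decided by the snapshot checker; returns the value, or the
+  // status string if the read fails.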
+ std::string GetWithReadCallback(SnapshotChecker* snapshot_checker,
+ const Slice& key,
+ const Snapshot* snapshot = nullptr) {
+ SequenceNumber seq = snapshot == nullptr ? db_->GetLatestSequenceNumber()
+ : snapshot->GetSequenceNumber();
+ TestReadCallback read_callback(snapshot_checker, seq);
+ ReadOptions read_opt;
+ read_opt.snapshot = snapshot;
+ PinnableSlice value;
+ DBImpl::GetImplOptions get_impl_options;
+ get_impl_options.column_family = db_->DefaultColumnFamily();
+ get_impl_options.value = &value;
+ get_impl_options.callback = &read_callback;
+ Status s = dbfull()->GetImpl(read_opt, key, get_impl_options);
+ if (!s.ok()) {
+ return s.ToString();
+ }
+ return value.ToString();
+ }
+};
+
+TEST_F(DBMergeOperatorTest, LimitMergeOperands) {
+ class LimitedStringAppendMergeOp : public StringAppendTESTOperator {
+ public:
+ LimitedStringAppendMergeOp(int limit, char delim)
+ : StringAppendTESTOperator(delim), limit_(limit) {}
+
+ const char* Name() const override {
+ return "DBMergeOperatorTest::LimitedStringAppendMergeOp";
+ }
+
+ bool ShouldMerge(const std::vector<Slice>& operands) const override {
+ if (operands.size() > 0 && limit_ > 0 && operands.size() >= limit_) {
+ return true;
+ }
+ return false;
+ }
+
+ private:
+ size_t limit_ = 0;
+ };
+
+ Options options;
+ options.create_if_missing = true;
+ // Use only the latest two merge operands.
+ options.merge_operator =
+ std::make_shared<LimitedStringAppendMergeOp>(2, ',');
+ options.env = env_;
+ Reopen(options);
+ // All K1 values are in memtable.
+ ASSERT_OK(Merge("k1", "a"));
+ ASSERT_OK(Merge("k1", "b"));
+ ASSERT_OK(Merge("k1", "c"));
+ ASSERT_OK(Merge("k1", "d"));
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), "k1", &value).ok());
+ // Make sure that only the latest two merge operands are used. If this was
+ // not the case the value would be "a,b,c,d".
+ ASSERT_EQ(value, "c,d");
+
+ // All K2 values are flushed to L0 into a single file.
+ ASSERT_OK(Merge("k2", "a"));
+ ASSERT_OK(Merge("k2", "b"));
+ ASSERT_OK(Merge("k2", "c"));
+ ASSERT_OK(Merge("k2", "d"));
+ ASSERT_OK(Flush());
+ ASSERT_TRUE(db_->Get(ReadOptions(), "k2", &value).ok());
+ ASSERT_EQ(value, "c,d");
+
+ // All K3 values are flushed and are in different files.
+ ASSERT_OK(Merge("k3", "ab"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "bc"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "cd"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "de"));
+ ASSERT_TRUE(db_->Get(ReadOptions(), "k3", &value).ok());
+ ASSERT_EQ(value, "cd,de");
+
+ // All K4 values are in different levels
+ ASSERT_OK(Merge("k4", "ab"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(4);
+ ASSERT_OK(Merge("k4", "bc"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(3);
+ ASSERT_OK(Merge("k4", "cd"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(1);
+ ASSERT_OK(Merge("k4", "de"));
+ ASSERT_TRUE(db_->Get(ReadOptions(), "k4", &value).ok());
+ ASSERT_EQ(value, "cd,de");
+}
+
+TEST_F(DBMergeOperatorTest, MergeErrorOnRead) {
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator.reset(new TestPutOperator());
+ options.env = env_;
+ Reopen(options);
+ ASSERT_OK(Merge("k1", "v1"));
+ ASSERT_OK(Merge("k1", "corrupted"));
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), "k1", &value).IsCorruption());
+ VerifyDBInternal({{"k1", "corrupted"}, {"k1", "v1"}});
+}
+
+TEST_F(DBMergeOperatorTest, MergeErrorOnWrite) {
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator.reset(new TestPutOperator());
+ options.max_successive_merges = 3;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_OK(Merge("k1", "v1"));
+ ASSERT_OK(Merge("k1", "v2"));
+ // Will trigger a merge when hitting max_successive_merges and the merge
+ // will fail. The delta will be inserted nevertheless.
+ ASSERT_OK(Merge("k1", "corrupted"));
+ // Data should stay unmerged after the error.
+ VerifyDBInternal({{"k1", "corrupted"}, {"k1", "v2"}, {"k1", "v1"}});
+}
+
+TEST_F(DBMergeOperatorTest, MergeErrorOnIteration) {
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator.reset(new TestPutOperator());
+ options.env = env_;
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Merge("k1", "v1"));
+ ASSERT_OK(Merge("k1", "corrupted"));
+ ASSERT_OK(Put("k2", "v2"));
+ auto* iter = db_->NewIterator(ReadOptions());
+ iter->Seek("k1");
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsCorruption());
+ delete iter;
+ iter = db_->NewIterator(ReadOptions());
+ iter->Seek("k2");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ iter->Prev();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsCorruption());
+ delete iter;
+ VerifyDBInternal({{"k1", "corrupted"}, {"k1", "v1"}, {"k2", "v2"}});
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Merge("k1", "v1"));
+ ASSERT_OK(Put("k2", "v2"));
+ ASSERT_OK(Merge("k2", "corrupted"));
+ iter = db_->NewIterator(ReadOptions());
+ iter->Seek("k1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsCorruption());
+ delete iter;
+ VerifyDBInternal({{"k1", "v1"}, {"k2", "corrupted"}, {"k2", "v2"}});
+}
+
+
+class MergeOperatorPinningTest : public DBMergeOperatorTest,
+ public testing::WithParamInterface<bool> {
+ public:
+ MergeOperatorPinningTest() { disable_block_cache_ = GetParam(); }
+
+ bool disable_block_cache_;
+};
+
+INSTANTIATE_TEST_CASE_P(MergeOperatorPinningTest, MergeOperatorPinningTest,
+ ::testing::Bool());
+
+#ifndef ROCKSDB_LITE
+TEST_P(MergeOperatorPinningTest, OperandsMultiBlocks) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 1; // every block will contain one entry
+ table_options.no_block_cache = disable_block_cache_;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
+ options.level0_slowdown_writes_trigger = (1 << 30);
+ options.level0_stop_writes_trigger = (1 << 30);
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+ const int kKeysPerFile = 10;
+ const int kOperandsPerKeyPerFile = 7;
+ const int kOperandSize = 100;
+  // Files to write to L0 before compacting to a lower level
+ const int kFilesPerLevel = 3;
+
+ Random rnd(301);
+ std::map<std::string, std::string> true_data;
+ int batch_num = 1;
+ int lvl_to_fill = 4;
+ int key_id = 0;
+ while (true) {
+ for (int j = 0; j < kKeysPerFile; j++) {
+ std::string key = Key(key_id % 35);
+ key_id++;
+ for (int k = 0; k < kOperandsPerKeyPerFile; k++) {
+ std::string val = RandomString(&rnd, kOperandSize);
+ ASSERT_OK(db_->Merge(WriteOptions(), key, val));
+ if (true_data[key].size() == 0) {
+ true_data[key] = val;
+ } else {
+ true_data[key] += "," + val;
+ }
+ }
+ }
+
+ if (lvl_to_fill == -1) {
+ // Keep last batch in memtable and stop
+ break;
+ }
+
+ ASSERT_OK(Flush());
+ if (batch_num % kFilesPerLevel == 0) {
+ if (lvl_to_fill != 0) {
+ MoveFilesToLevel(lvl_to_fill);
+ }
+ lvl_to_fill--;
+ }
+ batch_num++;
+ }
+
+ // 3 L0 files
+ // 1 L1 file
+ // 3 L2 files
+ // 1 L3 file
+  // 3 L4 files
+ ASSERT_EQ(FilesPerLevel(), "3,1,3,1,3");
+
+ VerifyDBFromMap(true_data);
+}
+
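+// Wraps another MergeOperator and runs the before_merge_/after_merge_
+// callbacks around each FullMergeV2 call.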
+class MergeOperatorHook : public MergeOperator {
+ public:
+ explicit MergeOperatorHook(std::shared_ptr<MergeOperator> _merge_op)
+ : merge_op_(_merge_op) {}
+
+ bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override {
+ before_merge_();
+ bool res = merge_op_->FullMergeV2(merge_in, merge_out);
+ after_merge_();
+ return res;
+ }
+
+ const char* Name() const override { return merge_op_->Name(); }
+
+ std::shared_ptr<MergeOperator> merge_op_;
+ std::function<void()> before_merge_ = []() {};
+ std::function<void()> after_merge_ = []() {};
+};
+
+TEST_P(MergeOperatorPinningTest, EvictCacheBeforeMerge) {
+ Options options = CurrentOptions();
+
+ auto merge_hook =
+ std::make_shared<MergeOperatorHook>(MergeOperators::CreateMaxOperator());
+ options.merge_operator = merge_hook;
+ options.disable_auto_compactions = true;
+ options.level0_slowdown_writes_trigger = (1 << 30);
+ options.level0_stop_writes_trigger = (1 << 30);
+ options.max_open_files = 20;
+ BlockBasedTableOptions bbto;
+ bbto.no_block_cache = disable_block_cache_;
+ if (bbto.no_block_cache == false) {
+ bbto.block_cache = NewLRUCache(64 * 1024 * 1024);
+ } else {
+ bbto.block_cache = nullptr;
+ }
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ const int kNumOperands = 30;
+ const int kNumKeys = 1000;
+ const int kOperandSize = 100;
+ Random rnd(301);
+
+  // 1000 keys; each key has 30 operands, and each operand is in a different
+  // file
+ std::map<std::string, std::string> true_data;
+ for (int i = 0; i < kNumOperands; i++) {
+ for (int j = 0; j < kNumKeys; j++) {
+ std::string k = Key(j);
+ std::string v = RandomString(&rnd, kOperandSize);
+ ASSERT_OK(db_->Merge(WriteOptions(), k, v));
+
+ true_data[k] = std::max(true_data[k], v);
+ }
+ ASSERT_OK(Flush());
+ }
+
+ std::vector<uint64_t> file_numbers = ListTableFiles(env_, dbname_);
+ ASSERT_EQ(file_numbers.size(), kNumOperands);
+ int merge_cnt = 0;
+
+ // Code executed before merge operation
+ merge_hook->before_merge_ = [&]() {
+ // Evict all tables from cache before every merge operation
+ for (uint64_t num : file_numbers) {
+ TableCache::Evict(dbfull()->TEST_table_cache(), num);
+ }
+ // Decrease cache capacity to force all unrefed blocks to be evicted
+ if (bbto.block_cache) {
+ bbto.block_cache->SetCapacity(1);
+ }
+ merge_cnt++;
+ };
+
+ // Code executed after merge operation
+ merge_hook->after_merge_ = [&]() {
+ // Increase capacity again after doing the merge
+ if (bbto.block_cache) {
+ bbto.block_cache->SetCapacity(64 * 1024 * 1024);
+ }
+ };
+
+ size_t total_reads;
+ VerifyDBFromMap(true_data, &total_reads);
+ ASSERT_EQ(merge_cnt, total_reads);
+
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ VerifyDBFromMap(true_data, &total_reads);
+}
+
+TEST_P(MergeOperatorPinningTest, TailingIterator) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreateMaxOperator();
+ BlockBasedTableOptions bbto;
+ bbto.no_block_cache = disable_block_cache_;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ const int kNumOperands = 100;
+ const int kNumWrites = 100000;
+
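+  // Writer: write kNumOperands merge operands per key (each operand equals the
+  // key), with periodic flushes and full compactions so operands end up spread
+  // across memtables and multiple SST files.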
+ std::function<void()> writer_func = [&]() {
+ int k = 0;
+ for (int i = 0; i < kNumWrites; i++) {
+ db_->Merge(WriteOptions(), Key(k), Key(k));
+
+ if (i && i % kNumOperands == 0) {
+ k++;
+ }
+ if (i && i % 127 == 0) {
+ ASSERT_OK(Flush());
+ }
+ if (i && i % 317 == 0) {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ }
+ }
+ };
+
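+  // Reader: tail the DB with a tailing iterator; if a key has not shown up
+  // yet, sleep and re-seek until it does, then verify key and value match.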
+ std::function<void()> reader_func = [&]() {
+ ReadOptions ro;
+ ro.tailing = true;
+ Iterator* iter = db_->NewIterator(ro);
+
+ iter->SeekToFirst();
+ for (int i = 0; i < (kNumWrites / kNumOperands); i++) {
+ while (!iter->Valid()) {
+ // wait for the key to be written
+ env_->SleepForMicroseconds(100);
+ iter->Seek(Key(i));
+ }
+ ASSERT_EQ(iter->key(), Key(i));
+ ASSERT_EQ(iter->value(), Key(i));
+
+ iter->Next();
+ }
+
+ delete iter;
+ };
+
+ ROCKSDB_NAMESPACE::port::Thread writer_thread(writer_func);
+ ROCKSDB_NAMESPACE::port::Thread reader_thread(reader_func);
+
+ writer_thread.join();
+ reader_thread.join();
+}
+
+TEST_F(DBMergeOperatorTest, TailingIteratorMemtableUnrefedBySomeoneElse) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ DestroyAndReopen(options);
+
+ // Overview of the test:
+ // * There are two merge operands for the same key: one in an sst file,
+ // another in a memtable.
+ // * Seek a tailing iterator to this key.
+ // * As part of the seek, the iterator will:
+ // (a) first visit the operand in the memtable and tell ForwardIterator
+ // to pin this operand, then
+ // (b) move on to the operand in the sst file, then pass both operands
+ // to merge operator.
+ // * The memtable may get flushed and unreferenced by another thread between
+ // (a) and (b). The test simulates it by flushing the memtable inside a
+ // SyncPoint callback located between (a) and (b).
+ // * In this case it's ForwardIterator's responsibility to keep the memtable
+ // pinned until (b) is complete. There used to be a bug causing
+ // ForwardIterator to not pin it in some circumstances. This test
+ // reproduces it.
+
+ db_->Merge(WriteOptions(), "key", "sst");
+ db_->Flush(FlushOptions()); // Switch to SuperVersion A
+ db_->Merge(WriteOptions(), "key", "memtable");
+
+ // Pin SuperVersion A
+ std::unique_ptr<Iterator> someone_else(db_->NewIterator(ReadOptions()));
+
+ bool pushed_first_operand = false;
+ bool stepped_to_next_operand = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBIter::MergeValuesNewToOld:PushedFirstOperand", [&](void*) {
+ EXPECT_FALSE(pushed_first_operand);
+ pushed_first_operand = true;
+ db_->Flush(FlushOptions()); // Switch to SuperVersion B
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBIter::MergeValuesNewToOld:SteppedToNextOperand", [&](void*) {
+ EXPECT_FALSE(stepped_to_next_operand);
+ stepped_to_next_operand = true;
+ someone_else.reset(); // Unpin SuperVersion A
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ReadOptions ro;
+ ro.tailing = true;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+ iter->Seek("key");
+
+ ASSERT_TRUE(iter->status().ok());
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(std::string("sst,memtable"), iter->value().ToString());
+ EXPECT_TRUE(pushed_first_operand);
+ EXPECT_TRUE(stepped_to_next_operand);
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBMergeOperatorTest, SnapshotCheckerAndReadCallback) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ DestroyAndReopen(options);
+
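+  // A checker with hard-coded visibility rules: writes at seq 2 and seq 4 are
+  // not visible to snapshots taken at those sequence numbers, and anything at
+  // seq >= 5 is treated as uncommitted.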
+ class TestSnapshotChecker : public SnapshotChecker {
+ public:
+ SnapshotCheckerResult CheckInSnapshot(
+ SequenceNumber seq, SequenceNumber snapshot_seq) const override {
+ return IsInSnapshot(seq, snapshot_seq)
+ ? SnapshotCheckerResult::kInSnapshot
+ : SnapshotCheckerResult::kNotInSnapshot;
+ }
+
+ bool IsInSnapshot(SequenceNumber seq, SequenceNumber snapshot_seq) const {
+ switch (snapshot_seq) {
+ case 0:
+ return seq == 0;
+ case 1:
+ return seq <= 1;
+ case 2:
+ // seq = 2 not visible to snapshot with seq = 2
+ return seq <= 1;
+ case 3:
+ return seq <= 3;
+ case 4:
+          // seq = 4 not visible to snapshot with seq = 4
+ return seq <= 3;
+ default:
+          // seq >= 5 is uncommitted
+ return seq <= 4;
+ };
+ }
+ };
+ TestSnapshotChecker* snapshot_checker = new TestSnapshotChecker();
+ dbfull()->SetSnapshotChecker(snapshot_checker);
+
+ std::string value;
+ ASSERT_OK(Merge("foo", "v1"));
+ ASSERT_EQ(1, db_->GetLatestSequenceNumber());
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo"));
+ ASSERT_OK(Merge("foo", "v2"));
+ ASSERT_EQ(2, db_->GetLatestSequenceNumber());
+ // v2 is not visible to latest snapshot, which has seq = 2.
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo"));
+ // Take a snapshot with seq = 2.
+ const Snapshot* snapshot1 = db_->GetSnapshot();
+ ASSERT_EQ(2, snapshot1->GetSequenceNumber());
+ // v2 is not visible to snapshot1, which has seq = 2
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1));
+
+ // Verify flush doesn't alter the result.
+ ASSERT_OK(Flush());
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1));
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo"));
+
+ ASSERT_OK(Merge("foo", "v3"));
+ ASSERT_EQ(3, db_->GetLatestSequenceNumber());
+ ASSERT_EQ("v1,v2,v3", GetWithReadCallback(snapshot_checker, "foo"));
+ ASSERT_OK(Merge("foo", "v4"));
+ ASSERT_EQ(4, db_->GetLatestSequenceNumber());
+ // v4 is not visible to latest snapshot, which has seq = 4.
+ ASSERT_EQ("v1,v2,v3", GetWithReadCallback(snapshot_checker, "foo"));
+ const Snapshot* snapshot2 = db_->GetSnapshot();
+ ASSERT_EQ(4, snapshot2->GetSequenceNumber());
+ // v4 is not visible to snapshot2, which has seq = 4.
+ ASSERT_EQ("v1,v2,v3",
+ GetWithReadCallback(snapshot_checker, "foo", snapshot2));
+
+ // Verify flush doesn't alter the result.
+ ASSERT_OK(Flush());
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1));
+ ASSERT_EQ("v1,v2,v3",
+ GetWithReadCallback(snapshot_checker, "foo", snapshot2));
+ ASSERT_EQ("v1,v2,v3", GetWithReadCallback(snapshot_checker, "foo"));
+
+ ASSERT_OK(Merge("foo", "v5"));
+ ASSERT_EQ(5, db_->GetLatestSequenceNumber());
+ // v5 is uncommitted
+ ASSERT_EQ("v1,v2,v3,v4", GetWithReadCallback(snapshot_checker, "foo"));
+
+ // full manual compaction.
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // Verify compaction doesn't alter the result.
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1));
+ ASSERT_EQ("v1,v2,v3",
+ GetWithReadCallback(snapshot_checker, "foo", snapshot2));
+ ASSERT_EQ("v1,v2,v3,v4", GetWithReadCallback(snapshot_checker, "foo"));
+
+ db_->ReleaseSnapshot(snapshot1);
+ db_->ReleaseSnapshot(snapshot2);
+}
+
+class PerConfigMergeOperatorPinningTest
+ : public DBMergeOperatorTest,
+ public testing::WithParamInterface<std::tuple<bool, int>> {
+ public:
+ PerConfigMergeOperatorPinningTest() {
+ std::tie(disable_block_cache_, option_config_) = GetParam();
+ }
+
+ bool disable_block_cache_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+ MergeOperatorPinningTest, PerConfigMergeOperatorPinningTest,
+ ::testing::Combine(::testing::Bool(),
+ ::testing::Range(static_cast<int>(DBTestBase::kDefault),
+ static_cast<int>(DBTestBase::kEnd))));
+
+TEST_P(PerConfigMergeOperatorPinningTest, Randomized) {
+ if (ShouldSkipOptions(option_config_, kSkipMergePut)) {
+ return;
+ }
+
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreateMaxOperator();
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = disable_block_cache_;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ std::map<std::string, std::string> true_data;
+
+ const int kTotalMerges = 5000;
+ // Every key gets ~10 operands
+ const int kKeyRange = kTotalMerges / 10;
+ const int kOperandSize = 20;
+ const int kNumPutBefore = kKeyRange / 10; // 10% value
+ const int kNumPutAfter = kKeyRange / 10; // 10% overwrite
+ const int kNumDelete = kKeyRange / 10; // 10% delete
+
+ // kNumPutBefore keys will have base values
+ for (int i = 0; i < kNumPutBefore; i++) {
+ std::string key = Key(rnd.Next() % kKeyRange);
+ std::string value = RandomString(&rnd, kOperandSize);
+ ASSERT_OK(db_->Put(WriteOptions(), key, value));
+
+ true_data[key] = value;
+ }
+
+ // Do kTotalMerges merges
+ for (int i = 0; i < kTotalMerges; i++) {
+ std::string key = Key(rnd.Next() % kKeyRange);
+ std::string value = RandomString(&rnd, kOperandSize);
+ ASSERT_OK(db_->Merge(WriteOptions(), key, value));
+
+ if (true_data[key] < value) {
+ true_data[key] = value;
+ }
+ }
+
+ // Overwrite random kNumPutAfter keys
+ for (int i = 0; i < kNumPutAfter; i++) {
+ std::string key = Key(rnd.Next() % kKeyRange);
+ std::string value = RandomString(&rnd, kOperandSize);
+ ASSERT_OK(db_->Put(WriteOptions(), key, value));
+
+ true_data[key] = value;
+ }
+
+ // Delete random kNumDelete keys
+ for (int i = 0; i < kNumDelete; i++) {
+ std::string key = Key(rnd.Next() % kKeyRange);
+ ASSERT_OK(db_->Delete(WriteOptions(), key));
+
+ true_data.erase(key);
+ }
+
+ VerifyDBFromMap(true_data);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_options_test.cc b/src/rocksdb/db/db_options_test.cc
new file mode 100644
index 000000000..383f66cbf
--- /dev/null
+++ b/src/rocksdb/db/db_options_test.cc
@@ -0,0 +1,870 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <limits>
+#include <string>
+#include <unordered_map>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "options/options_helper.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/stats_history.h"
+#include "test_util/sync_point.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBOptionsTest : public DBTestBase {
+ public:
+ DBOptionsTest() : DBTestBase("/db_options_test") {}
+
+#ifndef ROCKSDB_LITE
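+  // Helpers that serialize the options to a string, parse the string back into
+  // a map, and keep only the mutable, non-deprecated entries.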
+ std::unordered_map<std::string, std::string> GetMutableDBOptionsMap(
+ const DBOptions& options) {
+ std::string options_str;
+ GetStringFromDBOptions(&options_str, options);
+ std::unordered_map<std::string, std::string> options_map;
+ StringToMap(options_str, &options_map);
+ std::unordered_map<std::string, std::string> mutable_map;
+ for (const auto opt : db_options_type_info) {
+ if (opt.second.is_mutable &&
+ opt.second.verification != OptionVerificationType::kDeprecated) {
+ mutable_map[opt.first] = options_map[opt.first];
+ }
+ }
+ return mutable_map;
+ }
+
+ std::unordered_map<std::string, std::string> GetMutableCFOptionsMap(
+ const ColumnFamilyOptions& options) {
+ std::string options_str;
+ GetStringFromColumnFamilyOptions(&options_str, options);
+ std::unordered_map<std::string, std::string> options_map;
+ StringToMap(options_str, &options_map);
+ std::unordered_map<std::string, std::string> mutable_map;
+ for (const auto opt : cf_options_type_info) {
+ if (opt.second.is_mutable &&
+ opt.second.verification != OptionVerificationType::kDeprecated) {
+ mutable_map[opt.first] = options_map[opt.first];
+ }
+ }
+ return mutable_map;
+ }
+
+ std::unordered_map<std::string, std::string> GetRandomizedMutableCFOptionsMap(
+ Random* rnd) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ ImmutableDBOptions db_options(options);
+ test::RandomInitCFOptions(&options, options, rnd);
+ auto sanitized_options = SanitizeOptions(db_options, options);
+ auto opt_map = GetMutableCFOptionsMap(sanitized_options);
+ delete options.compaction_filter;
+ return opt_map;
+ }
+
+ std::unordered_map<std::string, std::string> GetRandomizedMutableDBOptionsMap(
+ Random* rnd) {
+ DBOptions db_options;
+ test::RandomInitDBOptions(&db_options, rnd);
+ auto sanitized_options = SanitizeOptions(dbname_, db_options);
+ return GetMutableDBOptionsMap(sanitized_options);
+ }
+#endif // ROCKSDB_LITE
+};
+
+// RocksDB lite doesn't support dynamic options.
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBOptionsTest, GetLatestDBOptions) {
+ // GetOptions should be able to get latest option changed by SetOptions.
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_;
+ Random rnd(228);
+ Reopen(options);
+ auto new_options = GetRandomizedMutableDBOptionsMap(&rnd);
+ ASSERT_OK(dbfull()->SetDBOptions(new_options));
+ ASSERT_EQ(new_options, GetMutableDBOptionsMap(dbfull()->GetDBOptions()));
+}
+
+TEST_F(DBOptionsTest, GetLatestCFOptions) {
+ // GetOptions should be able to get latest option changed by SetOptions.
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_;
+ Random rnd(228);
+ Reopen(options);
+ CreateColumnFamilies({"foo"}, options);
+ ReopenWithColumnFamilies({"default", "foo"}, options);
+ auto options_default = GetRandomizedMutableCFOptionsMap(&rnd);
+ auto options_foo = GetRandomizedMutableCFOptionsMap(&rnd);
+ ASSERT_OK(dbfull()->SetOptions(handles_[0], options_default));
+ ASSERT_OK(dbfull()->SetOptions(handles_[1], options_foo));
+ ASSERT_EQ(options_default,
+ GetMutableCFOptionsMap(dbfull()->GetOptions(handles_[0])));
+ ASSERT_EQ(options_foo,
+ GetMutableCFOptionsMap(dbfull()->GetOptions(handles_[1])));
+}
+
+TEST_F(DBOptionsTest, SetBytesPerSync) {
+ const size_t kValueSize = 1024 * 1024; // 1MB
+ Options options;
+ options.create_if_missing = true;
+ options.bytes_per_sync = 1024 * 1024;
+ options.use_direct_reads = false;
+ options.write_buffer_size = 400 * kValueSize;
+ options.disable_auto_compactions = true;
+ options.compression = kNoCompression;
+ options.env = env_;
+ Reopen(options);
+ int counter = 0;
+ int low_bytes_per_sync = 0;
+ int i = 0;
+ const std::string kValue(kValueSize, 'v');
+ ASSERT_EQ(options.bytes_per_sync, dbfull()->GetDBOptions().bytes_per_sync);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::RangeSync:0", [&](void* /*arg*/) { counter++; });
+
+ WriteOptions write_opts;
+ // should sync approximately 40MB/1MB ~= 40 times.
+ for (i = 0; i < 40; i++) {
+ Put(Key(i), kValue, write_opts);
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ low_bytes_per_sync = counter;
+ ASSERT_GT(low_bytes_per_sync, 35);
+ ASSERT_LT(low_bytes_per_sync, 45);
+
+ counter = 0;
+ // 8388608 = 8 * 1024 * 1024
+ ASSERT_OK(dbfull()->SetDBOptions({{"bytes_per_sync", "8388608"}}));
+ ASSERT_EQ(8388608, dbfull()->GetDBOptions().bytes_per_sync);
+ // should sync approximately 40MB*2/8MB ~= 10 times.
+ // data will be 40*2MB because of previous Puts too.
+ for (i = 0; i < 40; i++) {
+ Put(Key(i), kValue, write_opts);
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_GT(counter, 5);
+ ASSERT_LT(counter, 15);
+
+ // Redundant assert. But leaving it here just to get the point across that
+ // low_bytes_per_sync > counter.
+ ASSERT_GT(low_bytes_per_sync, counter);
+}
+
+TEST_F(DBOptionsTest, SetWalBytesPerSync) {
+ const size_t kValueSize = 1024 * 1024 * 3;
+ Options options;
+ options.create_if_missing = true;
+ options.wal_bytes_per_sync = 512;
+ options.write_buffer_size = 100 * kValueSize;
+ options.disable_auto_compactions = true;
+ options.compression = kNoCompression;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_EQ(512, dbfull()->GetDBOptions().wal_bytes_per_sync);
+ int counter = 0;
+ int low_bytes_per_sync = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::RangeSync:0", [&](void* /*arg*/) { counter++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ const std::string kValue(kValueSize, 'v');
+ int i = 0;
+ for (; i < 10; i++) {
+ Put(Key(i), kValue);
+ }
+ // Do not flush. If we flush here, SwitchWAL will reuse the old WAL file since it's
+ // empty and will not get the new wal_bytes_per_sync value.
+ low_bytes_per_sync = counter;
+ // 5242880 = 1024 * 1024 * 5
+ ASSERT_OK(dbfull()->SetDBOptions({{"wal_bytes_per_sync", "5242880"}}));
+ ASSERT_EQ(5242880, dbfull()->GetDBOptions().wal_bytes_per_sync);
+ counter = 0;
+ i = 0;
+ for (; i < 10; i++) {
+ Put(Key(i), kValue);
+ }
+ ASSERT_GT(counter, 0);
+ ASSERT_GT(low_bytes_per_sync, 0);
+ ASSERT_GT(low_bytes_per_sync, counter);
+}
+
+TEST_F(DBOptionsTest, WritableFileMaxBufferSize) {
+ Options options;
+ options.create_if_missing = true;
+ options.writable_file_max_buffer_size = 1024 * 1024;
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_manifest_file_size = 1;
+ options.env = env_;
+ int buffer_size = 1024 * 1024;
+ Reopen(options);
+ ASSERT_EQ(buffer_size,
+ dbfull()->GetDBOptions().writable_file_max_buffer_size);
+
+ std::atomic<int> match_cnt(0);
+ std::atomic<int> unmatch_cnt(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::WritableFileWriter:0", [&](void* arg) {
+ int value = static_cast<int>(reinterpret_cast<uintptr_t>(arg));
+ if (value == buffer_size) {
+ match_cnt++;
+ } else {
+ unmatch_cnt++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ int i = 0;
+ for (; i < 3; i++) {
+ ASSERT_OK(Put("foo", ToString(i)));
+ ASSERT_OK(Put("bar", ToString(i)));
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(unmatch_cnt, 0);
+ ASSERT_GE(match_cnt, 11);
+
+ ASSERT_OK(
+ dbfull()->SetDBOptions({{"writable_file_max_buffer_size", "524288"}}));
+ buffer_size = 512 * 1024;
+ match_cnt = 0;
+ unmatch_cnt = 0; // SetDBOptions() will create a WritableFileWriter
+
+ ASSERT_EQ(buffer_size,
+ dbfull()->GetDBOptions().writable_file_max_buffer_size);
+ i = 0;
+ for (; i < 3; i++) {
+ ASSERT_OK(Put("foo", ToString(i)));
+ ASSERT_OK(Put("bar", ToString(i)));
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(unmatch_cnt, 0);
+ ASSERT_GE(match_cnt, 11);
+}
+
+TEST_F(DBOptionsTest, SetOptionsAndReopen) {
+ Random rnd(1044);
+ auto rand_opts = GetRandomizedMutableCFOptionsMap(&rnd);
+ ASSERT_OK(dbfull()->SetOptions(rand_opts));
+ // Verify that the DB can be reopened after setting options.
+ Options options;
+ options.env = env_;
+ ASSERT_OK(TryReopen(options));
+}
+
+TEST_F(DBOptionsTest, EnableAutoCompactionAndTriggerStall) {
+ const std::string kValue(1024, 'v');
+ for (int method_type = 0; method_type < 2; method_type++) {
+ for (int option_type = 0; option_type < 4; option_type++) {
+ Options options;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 1024 * 1024 * 10;
+ options.compression = CompressionType::kNoCompression;
+ options.level0_file_num_compaction_trigger = 1;
+ options.level0_stop_writes_trigger = std::numeric_limits<int>::max();
+ options.level0_slowdown_writes_trigger = std::numeric_limits<int>::max();
+ options.hard_pending_compaction_bytes_limit =
+ std::numeric_limits<uint64_t>::max();
+ options.soft_pending_compaction_bytes_limit =
+ std::numeric_limits<uint64_t>::max();
+ options.env = env_;
+
+ DestroyAndReopen(options);
+ int i = 0;
+ for (; i < 1024; i++) {
+ Put(Key(i), kValue);
+ }
+ Flush();
+ for (; i < 1024 * 2; i++) {
+ Put(Key(i), kValue);
+ }
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ uint64_t l0_size = SizeAtLevel(0);
+
+ switch (option_type) {
+ case 0:
+ // test with level0_stop_writes_trigger
+ options.level0_stop_writes_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ break;
+ case 1:
+ options.level0_slowdown_writes_trigger = 2;
+ break;
+ case 2:
+ options.hard_pending_compaction_bytes_limit = l0_size;
+ options.soft_pending_compaction_bytes_limit = l0_size;
+ break;
+ case 3:
+ options.soft_pending_compaction_bytes_limit = l0_size;
+ break;
+ }
+ Reopen(options);
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped());
+ ASSERT_FALSE(dbfull()->TEST_write_controler().NeedsDelay());
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBOptionsTest::EnableAutoCompactionAndTriggerStall:1",
+ "BackgroundCallCompaction:0"},
+ {"DBImpl::BackgroundCompaction():BeforePickCompaction",
+ "DBOptionsTest::EnableAutoCompactionAndTriggerStall:2"},
+ {"DBOptionsTest::EnableAutoCompactionAndTriggerStall:3",
+ "DBImpl::BackgroundCompaction():AfterPickCompaction"}});
+ // Block background compaction.
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ switch (method_type) {
+ case 0:
+ ASSERT_OK(
+ dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+ break;
+ case 1:
+ ASSERT_OK(dbfull()->EnableAutoCompaction(
+ {dbfull()->DefaultColumnFamily()}));
+ break;
+ }
+ TEST_SYNC_POINT("DBOptionsTest::EnableAutoCompactionAndTriggerStall:1");
+ // Wait for the stall condition to be recalculated.
+ TEST_SYNC_POINT("DBOptionsTest::EnableAutoCompactionAndTriggerStall:2");
+
+ switch (option_type) {
+ case 0:
+ ASSERT_TRUE(dbfull()->TEST_write_controler().IsStopped());
+ break;
+ case 1:
+ ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ break;
+ case 2:
+ ASSERT_TRUE(dbfull()->TEST_write_controler().IsStopped());
+ break;
+ case 3:
+ ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ break;
+ }
+ TEST_SYNC_POINT("DBOptionsTest::EnableAutoCompactionAndTriggerStall:3");
+
+ // Background compaction executed.
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped());
+ ASSERT_FALSE(dbfull()->TEST_write_controler().NeedsDelay());
+ }
+ }
+}
+
+TEST_F(DBOptionsTest, SetOptionsMayTriggerCompaction) {
+ Options options;
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 1000;
+ options.env = env_;
+ Reopen(options);
+ for (int i = 0; i < 3; i++) {
+ // Need to insert two keys to avoid trivial move.
+ ASSERT_OK(Put("foo", ToString(i)));
+ ASSERT_OK(Put("bar", ToString(i)));
+ Flush();
+ }
+ ASSERT_EQ("3", FilesPerLevel());
+ ASSERT_OK(
+ dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "3"}}));
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("0,1", FilesPerLevel());
+}
+
+TEST_F(DBOptionsTest, SetBackgroundCompactionThreads) {
+ Options options;
+ options.create_if_missing = true;
+ options.max_background_compactions = 1; // default value
+ options.env = env_;
+ Reopen(options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+ ASSERT_OK(dbfull()->SetDBOptions({{"max_background_compactions", "3"}}));
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+ auto stop_token = dbfull()->TEST_write_controler().GetStopToken();
+ ASSERT_EQ(3, dbfull()->TEST_BGCompactionsAllowed());
+}
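+
+// Illustrative sketch (assuming "db" is an already-open DB*) of raising
+// compaction parallelism at runtime, as the test above exercises:
+//
+//   Status s = db->SetDBOptions({{"max_background_compactions", "3"}});
+//   assert(s.ok());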
+
+TEST_F(DBOptionsTest, SetBackgroundJobs) {
+ Options options;
+ options.create_if_missing = true;
+ options.max_background_jobs = 8;
+ options.env = env_;
+ Reopen(options);
+
+ for (int i = 0; i < 2; ++i) {
+ if (i > 0) {
+ options.max_background_jobs = 12;
+ ASSERT_OK(dbfull()->SetDBOptions(
+ {{"max_background_jobs",
+ std::to_string(options.max_background_jobs)}}));
+ }
+
+ const int expected_max_flushes = options.max_background_jobs / 4;
+
+ ASSERT_EQ(expected_max_flushes, dbfull()->TEST_BGFlushesAllowed());
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ auto stop_token = dbfull()->TEST_write_controler().GetStopToken();
+
+ const int expected_max_compactions = 3 * expected_max_flushes;
+
+ ASSERT_EQ(expected_max_flushes, dbfull()->TEST_BGFlushesAllowed());
+ ASSERT_EQ(expected_max_compactions, dbfull()->TEST_BGCompactionsAllowed());
+
+ ASSERT_EQ(expected_max_flushes,
+ env_->GetBackgroundThreads(Env::Priority::HIGH));
+ ASSERT_EQ(expected_max_compactions,
+ env_->GetBackgroundThreads(Env::Priority::LOW));
+ }
+}
+
+TEST_F(DBOptionsTest, AvoidFlushDuringShutdown) {
+ Options options;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.env = env_;
+ WriteOptions write_without_wal;
+ write_without_wal.disableWAL = true;
+
+ ASSERT_FALSE(options.avoid_flush_during_shutdown);
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "v1", write_without_wal));
+ Reopen(options);
+ ASSERT_EQ("v1", Get("foo"));
+ ASSERT_EQ("1", FilesPerLevel());
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "v2", write_without_wal));
+ ASSERT_OK(dbfull()->SetDBOptions({{"avoid_flush_during_shutdown", "true"}}));
+ Reopen(options);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ("", FilesPerLevel());
+}
+
+TEST_F(DBOptionsTest, SetDelayedWriteRateOption) {
+ Options options;
+ options.create_if_missing = true;
+ options.delayed_write_rate = 2 * 1024U * 1024U;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_EQ(2 * 1024U * 1024U,
+ dbfull()->TEST_write_controler().max_delayed_write_rate());
+
+ ASSERT_OK(dbfull()->SetDBOptions({{"delayed_write_rate", "20000"}}));
+ ASSERT_EQ(20000, dbfull()->TEST_write_controler().max_delayed_write_rate());
+}
+
+TEST_F(DBOptionsTest, MaxTotalWalSizeChange) {
+ Random rnd(1044);
+ const auto value_size = size_t(1024);
+ std::string value;
+ test::RandomString(&rnd, value_size, &value);
+
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_;
+ CreateColumnFamilies({"1", "2", "3"}, options);
+ ReopenWithColumnFamilies({"default", "1", "2", "3"}, options);
+
+ WriteOptions write_options;
+
+ const int key_count = 100;
+ for (int i = 0; i < key_count; ++i) {
+ for (size_t cf = 0; cf < handles_.size(); ++cf) {
+ ASSERT_OK(Put(static_cast<int>(cf), Key(i), value));
+ }
+ }
+ ASSERT_OK(dbfull()->SetDBOptions({{"max_total_wal_size", "10"}}));
+
+ for (size_t cf = 0; cf < handles_.size(); ++cf) {
+ dbfull()->TEST_WaitForFlushMemTable(handles_[cf]);
+ ASSERT_EQ("1", FilesPerLevel(static_cast<int>(cf)));
+ }
+}
+
+TEST_F(DBOptionsTest, SetStatsDumpPeriodSec) {
+ Options options;
+ options.create_if_missing = true;
+ options.stats_dump_period_sec = 5;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_dump_period_sec);
+
+ for (int i = 0; i < 20; i++) {
+ unsigned int num = rand() % 5000 + 1;
+ ASSERT_OK(
+ dbfull()->SetDBOptions({{"stats_dump_period_sec", ToString(num)}}));
+ ASSERT_EQ(num, dbfull()->GetDBOptions().stats_dump_period_sec);
+ }
+ Close();
+}
+
+TEST_F(DBOptionsTest, SetOptionsStatsPersistPeriodSec) {
+ Options options;
+ options.create_if_missing = true;
+ options.stats_persist_period_sec = 5;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_persist_period_sec);
+
+ ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "12345"}}));
+ ASSERT_EQ(12345u, dbfull()->GetDBOptions().stats_persist_period_sec);
+ ASSERT_NOK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "abcde"}}));
+ ASSERT_EQ(12345u, dbfull()->GetDBOptions().stats_persist_period_sec);
+}
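+
+// Sketch of the error handling checked above (assuming an open DB* "db"):
+// a value that fails to parse is rejected and the previous setting is kept.
+//
+//   Status s = db->SetDBOptions({{"stats_persist_period_sec", "abcde"}});
+//   assert(!s.ok());  // parse failure; the option keeps its old value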
+
+static void assert_candidate_files_empty(DBImpl* dbfull, const bool empty) {
+ dbfull->TEST_LockMutex();
+ JobContext job_context(0);
+ dbfull->FindObsoleteFiles(&job_context, false);
+ ASSERT_EQ(empty, job_context.full_scan_candidate_files.empty());
+ dbfull->TEST_UnlockMutex();
+ if (job_context.HaveSomethingToDelete()) {
+ // fulfill the contract of FindObsoleteFiles by calling PurgeObsoleteFiles
+ // afterwards; otherwise the test may hang on shutdown
+ dbfull->PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+}
+
+TEST_F(DBOptionsTest, DeleteObsoleteFilesPeriodChange) {
+ SpecialEnv env(env_);
+ env.time_elapse_only_sleep_ = true;
+ Options options;
+ options.env = &env;
+ options.create_if_missing = true;
+ ASSERT_OK(TryReopen(options));
+
+ // Verify that the candidate files set is empty when no full scan is requested.
+ assert_candidate_files_empty(dbfull(), true);
+
+ ASSERT_OK(
+ dbfull()->SetDBOptions({{"delete_obsolete_files_period_micros", "0"}}));
+
+ // After delete_obsolete_files_period_micros is updated to 0, the next call
+ // to FindObsoleteFiles should make a full scan
+ assert_candidate_files_empty(dbfull(), false);
+
+ ASSERT_OK(
+ dbfull()->SetDBOptions({{"delete_obsolete_files_period_micros", "20"}}));
+
+ assert_candidate_files_empty(dbfull(), true);
+
+ env.addon_time_.store(20);
+ assert_candidate_files_empty(dbfull(), true);
+
+ env.addon_time_.store(21);
+ assert_candidate_files_empty(dbfull(), false);
+
+ Close();
+}
+
+TEST_F(DBOptionsTest, MaxOpenFilesChange) {
+ SpecialEnv env(env_);
+ Options options;
+ options.env = CurrentOptions().env;
+ options.max_open_files = -1;
+
+ Reopen(options);
+
+ Cache* tc = dbfull()->TEST_table_cache();
+
+ ASSERT_EQ(-1, dbfull()->GetDBOptions().max_open_files);
+ ASSERT_LT(2000, tc->GetCapacity());
+ ASSERT_OK(dbfull()->SetDBOptions({{"max_open_files", "1024"}}));
+ ASSERT_EQ(1024, dbfull()->GetDBOptions().max_open_files);
+ // examine the table cache (actual size should be 1014)
+ ASSERT_GT(1500, tc->GetCapacity());
+ Close();
+}
+
+TEST_F(DBOptionsTest, SanitizeDelayedWriteRate) {
+ Options options;
+ options.delayed_write_rate = 0;
+ Reopen(options);
+ ASSERT_EQ(16 * 1024 * 1024, dbfull()->GetDBOptions().delayed_write_rate);
+
+ options.rate_limiter.reset(NewGenericRateLimiter(31 * 1024 * 1024));
+ Reopen(options);
+ ASSERT_EQ(31 * 1024 * 1024, dbfull()->GetDBOptions().delayed_write_rate);
+}
+
+TEST_F(DBOptionsTest, SanitizeUniversalTTLCompaction) {
+ Options options;
+ options.compaction_style = kCompactionStyleUniversal;
+
+ options.ttl = 0;
+ options.periodic_compaction_seconds = 0;
+ Reopen(options);
+ ASSERT_EQ(0, dbfull()->GetOptions().ttl);
+ ASSERT_EQ(0, dbfull()->GetOptions().periodic_compaction_seconds);
+
+ options.ttl = 0;
+ options.periodic_compaction_seconds = 100;
+ Reopen(options);
+ ASSERT_EQ(0, dbfull()->GetOptions().ttl);
+ ASSERT_EQ(100, dbfull()->GetOptions().periodic_compaction_seconds);
+
+ options.ttl = 100;
+ options.periodic_compaction_seconds = 0;
+ Reopen(options);
+ ASSERT_EQ(100, dbfull()->GetOptions().ttl);
+ ASSERT_EQ(100, dbfull()->GetOptions().periodic_compaction_seconds);
+
+ options.ttl = 100;
+ options.periodic_compaction_seconds = 500;
+ Reopen(options);
+ ASSERT_EQ(100, dbfull()->GetOptions().ttl);
+ ASSERT_EQ(100, dbfull()->GetOptions().periodic_compaction_seconds);
+}
+
+TEST_F(DBOptionsTest, SanitizeTtlDefault) {
+ Options options;
+ Reopen(options);
+ ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl);
+
+ options.compaction_style = kCompactionStyleLevel;
+ options.ttl = 0;
+ Reopen(options);
+ ASSERT_EQ(0, dbfull()->GetOptions().ttl);
+
+ options.ttl = 100;
+ Reopen(options);
+ ASSERT_EQ(100, dbfull()->GetOptions().ttl);
+}
+
+TEST_F(DBOptionsTest, SanitizeFIFOPeriodicCompaction) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.ttl = 0;
+ Reopen(options);
+ ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl);
+
+ options.ttl = 100;
+ Reopen(options);
+ ASSERT_EQ(100, dbfull()->GetOptions().ttl);
+
+ options.ttl = 100 * 24 * 60 * 60;
+ Reopen(options);
+ ASSERT_EQ(100 * 24 * 60 * 60, dbfull()->GetOptions().ttl);
+
+ options.ttl = 200;
+ options.periodic_compaction_seconds = 300;
+ Reopen(options);
+ ASSERT_EQ(200, dbfull()->GetOptions().ttl);
+
+ options.ttl = 500;
+ options.periodic_compaction_seconds = 300;
+ Reopen(options);
+ ASSERT_EQ(300, dbfull()->GetOptions().ttl);
+}
+
+TEST_F(DBOptionsTest, SetFIFOCompactionOptions) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.write_buffer_size = 10 << 10; // 10KB
+ options.arena_block_size = 4096;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options.compaction_options_fifo.allow_compaction = false;
+ env_->time_elapse_only_sleep_ = false;
+ options.env = env_;
+
+ // Test dynamically changing ttl.
+ env_->addon_time_.store(0);
+ options.ttl = 1 * 60 * 60; // 1 hour
+ ASSERT_OK(TryReopen(options));
+
+ Random rnd(301);
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // Add 61 seconds to the time.
+ env_->addon_time_.fetch_add(61);
+
+ // No files should be compacted as ttl is set to 1 hour.
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 3600);
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // Set ttl to 1 minute. So all files should get deleted.
+ ASSERT_OK(dbfull()->SetOptions({{"ttl", "60"}}));
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 60);
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+ // Test dynamically changing compaction_options_fifo.max_table_files_size
+ env_->addon_time_.store(0);
+ options.compaction_options_fifo.max_table_files_size = 500 << 10; // 500KB
+ options.ttl = 0;
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // No files should be compacted as max_table_files_size is set to 500 KB.
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 500 << 10);
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // Set max_table_files_size to 12 KB. So only 1 file should remain now.
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo", "{max_table_files_size=12288;}"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 12 << 10);
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+
+ // Test dynamically changing compaction_options_fifo.allow_compaction
+ options.compaction_options_fifo.max_table_files_size = 500 << 10; // 500KB
+ options.ttl = 0;
+ options.compaction_options_fifo.allow_compaction = false;
+ options.level0_file_num_compaction_trigger = 6;
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // No files should be compacted as max_table_files_size is set to 500 KB and
+ // allow_compaction is false
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ false);
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // Set allow_compaction to true, so the number of files should be between 1 and 5.
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo", "{allow_compaction=true;}"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ true);
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_GE(NumTableFilesAtLevel(0), 1);
+ ASSERT_LE(NumTableFilesAtLevel(0), 5);
+}
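+
+// The nested option-string syntax used above, shown standalone (assuming an
+// open DB* "db"); fields inside the braces are separated by ';':
+//
+//   Status s = db->SetOptions(
+//       {{"compaction_options_fifo",
+//         "{max_table_files_size=1073741824;allow_compaction=true;}"}});
+//   assert(s.ok());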
+
+TEST_F(DBOptionsTest, CompactionReadaheadSizeChange) {
+ SpecialEnv env(env_);
+ Options options;
+ options.env = &env;
+
+ options.compaction_readahead_size = 0;
+ options.new_table_reader_for_compaction_inputs = true;
+ options.level0_file_num_compaction_trigger = 2;
+ const std::string kValue(1024, 'v');
+ Reopen(options);
+
+ ASSERT_EQ(0, dbfull()->GetDBOptions().compaction_readahead_size);
+ ASSERT_OK(dbfull()->SetDBOptions({{"compaction_readahead_size", "256"}}));
+ ASSERT_EQ(256, dbfull()->GetDBOptions().compaction_readahead_size);
+ for (int i = 0; i < 1024; i++) {
+ Put(Key(i), kValue);
+ }
+ Flush();
+ for (int i = 0; i < 1024 * 2; i++) {
+ Put(Key(i), kValue);
+ }
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(256, env_->compaction_readahead_size_);
+ Close();
+}
+
+TEST_F(DBOptionsTest, FIFOTtlBackwardCompatible) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.write_buffer_size = 10 << 10; // 10KB
+ options.create_if_missing = true;
+
+ ASSERT_OK(TryReopen(options));
+
+ Random rnd(301);
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // In release 6.0, ttl was promoted from a secondary level option under
+ // compaction_options_fifo to a top level option under ColumnFamilyOptions.
+ // We still need to handle old SetOptions calls but should ignore
+ // ttl under compaction_options_fifo.
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo",
+ "{allow_compaction=true;max_table_files_size=1024;ttl=731;}"},
+ {"ttl", "60"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ true);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 1024);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 60);
+
+ // Put ttl as the first option inside compaction_options_fifo. That works as
+ // it doesn't overwrite any other option.
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo",
+ "{ttl=985;allow_compaction=true;max_table_files_size=1024;}"},
+ {"ttl", "191"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ true);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 1024);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 191);
+}
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_properties_test.cc b/src/rocksdb/db/db_properties_test.cc
new file mode 100644
index 000000000..50dc3efef
--- /dev/null
+++ b/src/rocksdb/db/db_properties_test.cc
@@ -0,0 +1,1711 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <stdio.h>
+
+#include <algorithm>
+#include <string>
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/perf_level.h"
+#include "rocksdb/table.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBPropertiesTest : public DBTestBase {
+ public:
+ DBPropertiesTest() : DBTestBase("/db_properties_test") {}
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBPropertiesTest, Empty) {
+ do {
+ Options options;
+ options.env = env_;
+ options.write_buffer_size = 100000; // Small write buffer
+ options.allow_concurrent_memtable_write = false;
+ options = CurrentOptions(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ std::string num;
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+ ASSERT_EQ("0", num);
+
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+ ASSERT_EQ("1", num);
+
+ // Block sync calls
+ env_->delay_sstable_sync_.store(true, std::memory_order_release);
+ Put(1, "k1", std::string(100000, 'x')); // Fill memtable
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+ ASSERT_EQ("2", num);
+
+ Put(1, "k2", std::string(100000, 'y')); // Trigger compaction
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+ ASSERT_EQ("1", num);
+
+ ASSERT_EQ("v1", Get(1, "foo"));
+ // Release sync calls
+ env_->delay_sstable_sync_.store(false, std::memory_order_release);
+
+ ASSERT_OK(db_->DisableFileDeletions());
+ ASSERT_TRUE(
+ dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+ ASSERT_EQ("0", num);
+
+ ASSERT_OK(db_->DisableFileDeletions());
+ ASSERT_TRUE(
+ dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+ ASSERT_EQ("0", num);
+
+ ASSERT_OK(db_->DisableFileDeletions());
+ ASSERT_TRUE(
+ dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+ ASSERT_EQ("0", num);
+
+ ASSERT_OK(db_->EnableFileDeletions(false));
+ ASSERT_TRUE(
+ dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+ ASSERT_EQ("0", num);
+
+ ASSERT_OK(db_->EnableFileDeletions());
+ ASSERT_TRUE(
+ dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+ ASSERT_EQ("1", num);
+ } while (ChangeOptions());
+}
+
+TEST_F(DBPropertiesTest, CurrentVersionNumber) {
+ uint64_t v1, v2, v3;
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v1));
+ Put("12345678", "");
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v2));
+ Flush();
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v3));
+
+ ASSERT_EQ(v1, v2);
+ ASSERT_GT(v3, v2);
+}
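+
+// Sketch of the integer-property variant used above (assuming an open DB*
+// "db" and the default column family):
+//
+//   uint64_t v = 0;
+//   bool found =
+//       db->GetIntProperty("rocksdb.current-super-version-number", &v);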
+
+TEST_F(DBPropertiesTest, GetAggregatedIntPropertyTest) {
+ const int kKeySize = 100;
+ const int kValueSize = 500;
+ const int kKeyNum = 100;
+
+ Options options;
+ options.env = env_;
+ options.create_if_missing = true;
+ options.write_buffer_size = (kKeySize + kValueSize) * kKeyNum / 10;
+ // Make them never flush
+ options.min_write_buffer_number_to_merge = 1000;
+ options.max_write_buffer_number = 1000;
+ options = CurrentOptions(options);
+ CreateAndReopenWithCF({"one", "two", "three", "four"}, options);
+
+ Random rnd(301);
+ for (auto* handle : handles_) {
+ for (int i = 0; i < kKeyNum; ++i) {
+ db_->Put(WriteOptions(), handle, RandomString(&rnd, kKeySize),
+ RandomString(&rnd, kValueSize));
+ }
+ }
+
+ uint64_t manual_sum = 0;
+ uint64_t api_sum = 0;
+ uint64_t value = 0;
+ for (auto* handle : handles_) {
+ ASSERT_TRUE(
+ db_->GetIntProperty(handle, DB::Properties::kSizeAllMemTables, &value));
+ manual_sum += value;
+ }
+ ASSERT_TRUE(db_->GetAggregatedIntProperty(DB::Properties::kSizeAllMemTables,
+ &api_sum));
+ ASSERT_GT(manual_sum, 0);
+ ASSERT_EQ(manual_sum, api_sum);
+
+ ASSERT_FALSE(db_->GetAggregatedIntProperty(DB::Properties::kDBStats, &value));
+
+ uint64_t before_flush_trm;
+ uint64_t after_flush_trm;
+ for (auto* handle : handles_) {
+ ASSERT_TRUE(db_->GetAggregatedIntProperty(
+ DB::Properties::kEstimateTableReadersMem, &before_flush_trm));
+
+ // Issue flush and expect larger memory usage of table readers.
+ db_->Flush(FlushOptions(), handle);
+
+ ASSERT_TRUE(db_->GetAggregatedIntProperty(
+ DB::Properties::kEstimateTableReadersMem, &after_flush_trm));
+ ASSERT_GT(after_flush_trm, before_flush_trm);
+ }
+}
+
+namespace {
+void ResetTableProperties(TableProperties* tp) {
+ tp->data_size = 0;
+ tp->index_size = 0;
+ tp->filter_size = 0;
+ tp->raw_key_size = 0;
+ tp->raw_value_size = 0;
+ tp->num_data_blocks = 0;
+ tp->num_entries = 0;
+ tp->num_deletions = 0;
+ tp->num_merge_operands = 0;
+ tp->num_range_deletions = 0;
+}
+
+void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) {
+ double dummy_double;
+ std::replace(tp_string.begin(), tp_string.end(), ';', ' ');
+ std::replace(tp_string.begin(), tp_string.end(), '=', ' ');
+ ResetTableProperties(tp);
+ sscanf(tp_string.c_str(),
+ "# data blocks %" SCNu64 " # entries %" SCNu64 " # deletions %" SCNu64
+ " # merge operands %" SCNu64 " # range deletions %" SCNu64
+ " raw key size %" SCNu64
+ " raw average key size %lf "
+ " raw value size %" SCNu64
+ " raw average value size %lf "
+ " data block size %" SCNu64 " index block size (user-key? %" SCNu64
+ ", delta-value? %" SCNu64 ") %" SCNu64 " filter block size %" SCNu64,
+ &tp->num_data_blocks, &tp->num_entries, &tp->num_deletions,
+ &tp->num_merge_operands, &tp->num_range_deletions, &tp->raw_key_size,
+ &dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size,
+ &tp->index_key_is_user_key, &tp->index_value_is_delta_encoded,
+ &tp->index_size, &tp->filter_size);
+}
+
+void VerifySimilar(uint64_t a, uint64_t b, double bias) {
+ ASSERT_EQ(a == 0U, b == 0U);
+ if (a == 0) {
+ return;
+ }
+ double dbl_a = static_cast<double>(a);
+ double dbl_b = static_cast<double>(b);
+ if (dbl_a > dbl_b) {
+ ASSERT_LT(static_cast<double>(dbl_a - dbl_b) / (dbl_a + dbl_b), bias);
+ } else {
+ ASSERT_LT(static_cast<double>(dbl_b - dbl_a) / (dbl_a + dbl_b), bias);
+ }
+}
+
+void VerifyTableProperties(
+ const TableProperties& base_tp, const TableProperties& new_tp,
+ double filter_size_bias = CACHE_LINE_SIZE >= 256 ? 0.15 : 0.1,
+ double index_size_bias = 0.1, double data_size_bias = 0.1,
+ double num_data_blocks_bias = 0.05) {
+ VerifySimilar(base_tp.data_size, new_tp.data_size, data_size_bias);
+ VerifySimilar(base_tp.index_size, new_tp.index_size, index_size_bias);
+ VerifySimilar(base_tp.filter_size, new_tp.filter_size, filter_size_bias);
+ VerifySimilar(base_tp.num_data_blocks, new_tp.num_data_blocks,
+ num_data_blocks_bias);
+
+ ASSERT_EQ(base_tp.raw_key_size, new_tp.raw_key_size);
+ ASSERT_EQ(base_tp.raw_value_size, new_tp.raw_value_size);
+ ASSERT_EQ(base_tp.num_entries, new_tp.num_entries);
+ ASSERT_EQ(base_tp.num_deletions, new_tp.num_deletions);
+ ASSERT_EQ(base_tp.num_range_deletions, new_tp.num_range_deletions);
+
+ // Merge operands may become Puts, so we only have an upper bound on the exact
+ // number of merge operands.
+ ASSERT_GE(base_tp.num_merge_operands, new_tp.num_merge_operands);
+}
+
+void GetExpectedTableProperties(
+ TableProperties* expected_tp, const int kKeySize, const int kValueSize,
+ const int kPutsPerTable, const int kDeletionsPerTable,
+ const int kMergeOperandsPerTable, const int kRangeDeletionsPerTable,
+ const int kTableCount, const int kBloomBitsPerKey, const size_t kBlockSize,
+ const bool index_key_is_user_key, const bool value_delta_encoding) {
+ const int kKeysPerTable =
+ kPutsPerTable + kDeletionsPerTable + kMergeOperandsPerTable;
+ const int kPutCount = kTableCount * kPutsPerTable;
+ const int kDeletionCount = kTableCount * kDeletionsPerTable;
+ const int kMergeCount = kTableCount * kMergeOperandsPerTable;
+ const int kRangeDeletionCount = kTableCount * kRangeDeletionsPerTable;
+ const int kKeyCount = kPutCount + kDeletionCount + kMergeCount + kRangeDeletionCount;
+ const int kAvgSuccessorSize = kKeySize / 5;
+ const int kEncodingSavePerKey = kKeySize / 4;
+ expected_tp->raw_key_size = kKeyCount * (kKeySize + 8);
+ expected_tp->raw_value_size =
+ (kPutCount + kMergeCount + kRangeDeletionCount) * kValueSize;
+ expected_tp->num_entries = kKeyCount;
+ expected_tp->num_deletions = kDeletionCount + kRangeDeletionCount;
+ expected_tp->num_merge_operands = kMergeCount;
+ expected_tp->num_range_deletions = kRangeDeletionCount;
+ expected_tp->num_data_blocks =
+ kTableCount * (kKeysPerTable * (kKeySize - kEncodingSavePerKey + kValueSize)) /
+ kBlockSize;
+ expected_tp->data_size =
+ kTableCount * (kKeysPerTable * (kKeySize + 8 + kValueSize));
+ expected_tp->index_size =
+ expected_tp->num_data_blocks *
+ (kAvgSuccessorSize + (index_key_is_user_key ? 0 : 8) -
+ // discount 1 byte as value size is not encoded in value delta encoding
+ (value_delta_encoding ? 1 : 0));
+ expected_tp->filter_size =
+ kTableCount * ((kKeysPerTable * kBloomBitsPerKey + 7) / 8 +
+ /*average-ish overhead*/ CACHE_LINE_SIZE / 2);
+}
+} // anonymous namespace
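+
+// A small sketch (not part of the tests) of how the aggregated table
+// properties string parsed by the helpers above is obtained from a DB:
+//
+//   std::string prop;
+//   db->GetProperty(DB::Properties::kAggregatedTableProperties, &prop);
+//   // "prop" is a human-readable "name value; name value; ..." string that
+//   // ParseTablePropertiesString() above turns back into a TableProperties.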
+
+TEST_F(DBPropertiesTest, ValidatePropertyInfo) {
+ for (const auto& ppt_name_and_info : InternalStats::ppt_name_to_info) {
+ // If C++ gets a std::string_literal, this would be better to check at
+ // compile-time using static_assert.
+ ASSERT_TRUE(ppt_name_and_info.first.empty() ||
+ !isdigit(ppt_name_and_info.first.back()));
+
+ int count = 0;
+ count += (ppt_name_and_info.second.handle_string == nullptr) ? 0 : 1;
+ count += (ppt_name_and_info.second.handle_int == nullptr) ? 0 : 1;
+ count += (ppt_name_and_info.second.handle_string_dbimpl == nullptr) ? 0 : 1;
+ ASSERT_TRUE(count == 1);
+ }
+}
+
+TEST_F(DBPropertiesTest, ValidateSampleNumber) {
+ // When "max_open_files" is -1, we read all the files for
+ // "rocksdb.estimate-num-keys" computation, which is the ground truth.
+ // Otherwise, we sample the 20 newest files to make an estimate.
+ // Formula: latest_20_files_active_key_ratio * total_files
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.level0_stop_writes_trigger = 1000;
+ DestroyAndReopen(options);
+ int key = 0;
+ for (int files = 20; files >= 10; files -= 10) {
+ for (int i = 0; i < files; i++) {
+ int rows = files / 10;
+ for (int j = 0; j < rows; j++) {
+ db_->Put(WriteOptions(), std::to_string(++key), "foo");
+ }
+ db_->Flush(FlushOptions());
+ }
+ }
+ std::string num;
+ Reopen(options);
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+ ASSERT_EQ("45", num);
+ options.max_open_files = -1;
+ Reopen(options);
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+ ASSERT_EQ("50", num);
+}
+
+TEST_F(DBPropertiesTest, AggregatedTableProperties) {
+ for (int kTableCount = 40; kTableCount <= 100; kTableCount += 30) {
+ const int kDeletionsPerTable = 5;
+ const int kMergeOperandsPerTable = 15;
+ const int kRangeDeletionsPerTable = 5;
+ const int kPutsPerTable = 100;
+ const int kKeySize = 80;
+ const int kValueSize = 200;
+ const int kBloomBitsPerKey = 20;
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 8;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options.preserve_deletes = true;
+ options.merge_operator.reset(new TestPutOperator());
+
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(
+ NewBloomFilterPolicy(kBloomBitsPerKey, false));
+ table_options.block_size = 1024;
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+
+ // Hold open a snapshot to prevent range tombstones from being compacted
+ // away.
+ ManagedSnapshot snapshot(db_);
+
+ Random rnd(5632);
+ for (int table = 1; table <= kTableCount; ++table) {
+ for (int i = 0; i < kPutsPerTable; ++i) {
+ db_->Put(WriteOptions(), RandomString(&rnd, kKeySize),
+ RandomString(&rnd, kValueSize));
+ }
+ for (int i = 0; i < kDeletionsPerTable; i++) {
+ db_->Delete(WriteOptions(), RandomString(&rnd, kKeySize));
+ }
+ for (int i = 0; i < kMergeOperandsPerTable; i++) {
+ db_->Merge(WriteOptions(), RandomString(&rnd, kKeySize),
+ RandomString(&rnd, kValueSize));
+ }
+ for (int i = 0; i < kRangeDeletionsPerTable; i++) {
+ std::string start = RandomString(&rnd, kKeySize);
+ std::string end = start;
+ end.resize(kValueSize);
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end);
+ }
+ db_->Flush(FlushOptions());
+ }
+ std::string property;
+ db_->GetProperty(DB::Properties::kAggregatedTableProperties, &property);
+ TableProperties output_tp;
+ ParseTablePropertiesString(property, &output_tp);
+ bool index_key_is_user_key = output_tp.index_key_is_user_key > 0;
+ bool value_is_delta_encoded = output_tp.index_value_is_delta_encoded > 0;
+
+ TableProperties expected_tp;
+ GetExpectedTableProperties(
+ &expected_tp, kKeySize, kValueSize, kPutsPerTable, kDeletionsPerTable,
+ kMergeOperandsPerTable, kRangeDeletionsPerTable, kTableCount,
+ kBloomBitsPerKey, table_options.block_size, index_key_is_user_key,
+ value_is_delta_encoded);
+
+ VerifyTableProperties(expected_tp, output_tp);
+ }
+}
+
+TEST_F(DBPropertiesTest, ReadLatencyHistogramByLevel) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 110 << 10;
+ options.level0_file_num_compaction_trigger = 6;
+ options.num_levels = 4;
+ options.compression = kNoCompression;
+ options.max_bytes_for_level_base = 4500 << 10;
+ options.target_file_size_base = 98 << 10;
+ options.max_write_buffer_number = 2;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.max_open_files = 11; // Make sure there is no preloading of table readers
+
+ // RocksDB sanitizes max_open_files to at least 20. Modify it back.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = static_cast<int*>(arg);
+ *max_open_files = 11;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+ int key_index = 0;
+ Random rnd(301);
+ for (int num = 0; num < 8; num++) {
+ Put("foo", "bar");
+ GenerateNewFile(&rnd, &key_index);
+ dbfull()->TEST_WaitForCompact();
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ std::string prop;
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop));
+
+ // Get() after flushes; see that the latency histogram is tracked.
+ for (int key = 0; key < key_index; key++) {
+ Get(Key(key));
+ }
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cfstats", &prop));
+ ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+ // Reopen and issue Get(). See the latency tracked.
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ dbfull()->TEST_WaitForCompact();
+ for (int key = 0; key < key_index; key++) {
+ Get(Key(key));
+ }
+
+ // Test for getting immutable_db_options_.statistics
+ ASSERT_TRUE(dbfull()->GetProperty(dbfull()->DefaultColumnFamily(),
+ "rocksdb.options-statistics", &prop));
+ ASSERT_NE(std::string::npos, prop.find("rocksdb.block.cache.miss"));
+ ASSERT_EQ(std::string::npos, prop.find("rocksdb.db.f.micros"));
+
+ ASSERT_TRUE(dbfull()->GetProperty(dbfull()->DefaultColumnFamily(),
+ "rocksdb.cf-file-histogram", &prop));
+ ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+ // Reopen and iterate. See the latency tracked.
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cf-file-histogram", &prop));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ for (iter->Seek(Key(0)); iter->Valid(); iter->Next()) {
+ }
+ }
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cf-file-histogram", &prop));
+ ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+ // CF 1 should show no histogram.
+ ASSERT_TRUE(
+ dbfull()->GetProperty(handles_[1], "rocksdb.cf-file-histogram", &prop));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+ // Put something and read it back; CF 1 should show a histogram.
+ Put(1, "foo", "bar");
+ Flush(1);
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("bar", Get(1, "foo"));
+
+ ASSERT_TRUE(
+ dbfull()->GetProperty(handles_[1], "rocksdb.cf-file-histogram", &prop));
+ ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+ // Setting max_open_files to -1 preloads all table readers.
+ options.max_open_files = -1;
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_TRUE(dbfull()->GetProperty(dbfull()->DefaultColumnFamily(),
+ "rocksdb.cf-file-histogram", &prop));
+ ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+ for (int key = 0; key < key_index; key++) {
+ Get(Key(key));
+ }
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cfstats", &prop));
+ ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+ // Clear internal stats
+ dbfull()->ResetStats();
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cfstats", &prop));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+}
+
+TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) {
+ const int kTableCount = 100;
+ const int kDeletionsPerTable = 2;
+ const int kMergeOperandsPerTable = 2;
+ const int kRangeDeletionsPerTable = 2;
+ const int kPutsPerTable = 10;
+ const int kKeySize = 50;
+ const int kValueSize = 400;
+ const int kMaxLevel = 7;
+ const int kBloomBitsPerKey = 20;
+ Random rnd(301);
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 8;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.target_file_size_base = 8192;
+ options.max_bytes_for_level_base = 10000;
+ options.max_bytes_for_level_multiplier = 2;
+ // This ensures no compaction is happening when we call GetProperty().
+ options.disable_auto_compactions = true;
+ options.preserve_deletes = true;
+ options.merge_operator.reset(new TestPutOperator());
+
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(
+ NewBloomFilterPolicy(kBloomBitsPerKey, false));
+ table_options.block_size = 1024;
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+
+ // Hold open a snapshot to prevent range tombstones from being compacted away.
+ ManagedSnapshot snapshot(db_);
+
+ std::string level_tp_strings[kMaxLevel];
+ std::string tp_string;
+ TableProperties level_tps[kMaxLevel];
+ TableProperties tp, sum_tp, expected_tp;
+ for (int table = 1; table <= kTableCount; ++table) {
+ for (int i = 0; i < kPutsPerTable; ++i) {
+ db_->Put(WriteOptions(), RandomString(&rnd, kKeySize),
+ RandomString(&rnd, kValueSize));
+ }
+ for (int i = 0; i < kDeletionsPerTable; i++) {
+ db_->Delete(WriteOptions(), RandomString(&rnd, kKeySize));
+ }
+ for (int i = 0; i < kMergeOperandsPerTable; i++) {
+ db_->Merge(WriteOptions(), RandomString(&rnd, kKeySize),
+ RandomString(&rnd, kValueSize));
+ }
+ for (int i = 0; i < kRangeDeletionsPerTable; i++) {
+ std::string start = RandomString(&rnd, kKeySize);
+ std::string end = start;
+ end.resize(kValueSize);
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end);
+ }
+ db_->Flush(FlushOptions());
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ResetTableProperties(&sum_tp);
+ for (int level = 0; level < kMaxLevel; ++level) {
+ db_->GetProperty(
+ DB::Properties::kAggregatedTablePropertiesAtLevel + ToString(level),
+ &level_tp_strings[level]);
+ ParseTablePropertiesString(level_tp_strings[level], &level_tps[level]);
+ sum_tp.data_size += level_tps[level].data_size;
+ sum_tp.index_size += level_tps[level].index_size;
+ sum_tp.filter_size += level_tps[level].filter_size;
+ sum_tp.raw_key_size += level_tps[level].raw_key_size;
+ sum_tp.raw_value_size += level_tps[level].raw_value_size;
+ sum_tp.num_data_blocks += level_tps[level].num_data_blocks;
+ sum_tp.num_entries += level_tps[level].num_entries;
+ sum_tp.num_deletions += level_tps[level].num_deletions;
+ sum_tp.num_merge_operands += level_tps[level].num_merge_operands;
+ sum_tp.num_range_deletions += level_tps[level].num_range_deletions;
+ }
+ db_->GetProperty(DB::Properties::kAggregatedTableProperties, &tp_string);
+ ParseTablePropertiesString(tp_string, &tp);
+ bool index_key_is_user_key = tp.index_key_is_user_key > 0;
+ bool value_is_delta_encoded = tp.index_value_is_delta_encoded > 0;
+ ASSERT_EQ(sum_tp.data_size, tp.data_size);
+ ASSERT_EQ(sum_tp.index_size, tp.index_size);
+ ASSERT_EQ(sum_tp.filter_size, tp.filter_size);
+ ASSERT_EQ(sum_tp.raw_key_size, tp.raw_key_size);
+ ASSERT_EQ(sum_tp.raw_value_size, tp.raw_value_size);
+ ASSERT_EQ(sum_tp.num_data_blocks, tp.num_data_blocks);
+ ASSERT_EQ(sum_tp.num_entries, tp.num_entries);
+ ASSERT_EQ(sum_tp.num_deletions, tp.num_deletions);
+ ASSERT_EQ(sum_tp.num_merge_operands, tp.num_merge_operands);
+ ASSERT_EQ(sum_tp.num_range_deletions, tp.num_range_deletions);
+ if (table > 3) {
+ GetExpectedTableProperties(
+ &expected_tp, kKeySize, kValueSize, kPutsPerTable, kDeletionsPerTable,
+ kMergeOperandsPerTable, kRangeDeletionsPerTable, table,
+ kBloomBitsPerKey, table_options.block_size, index_key_is_user_key,
+ value_is_delta_encoded);
+ // Gives larger bias here as index block size, filter block size,
+ // and data block size become much harder to estimate in this test.
+ VerifyTableProperties(expected_tp, tp, 0.5, 0.4, 0.4, 0.25);
+ }
+ }
+}
+
+TEST_F(DBPropertiesTest, NumImmutableMemTable) {
+ do {
+ Options options = CurrentOptions();
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 3;
+ options.write_buffer_size = 1000000;
+ options.max_write_buffer_size_to_maintain =
+ 5 * static_cast<int64_t>(options.write_buffer_size);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ std::string big_value(1000000 * 2, 'x');
+ std::string num;
+ uint64_t value;
+ SetPerfLevel(kEnableTime);
+ ASSERT_TRUE(GetPerfLevel() == kEnableTime);
+
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k1", big_value));
+ ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+ "rocksdb.num-immutable-mem-table", &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], DB::Properties::kNumImmutableMemTableFlushed, &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+ ASSERT_EQ(num, "1");
+ get_perf_context()->Reset();
+ Get(1, "k1");
+ ASSERT_EQ(1, static_cast<int>(get_perf_context()->get_from_memtable_count));
+
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value));
+ ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+ "rocksdb.num-immutable-mem-table", &num));
+ ASSERT_EQ(num, "1");
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+ ASSERT_EQ(num, "1");
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-imm-mem-tables", &num));
+ ASSERT_EQ(num, "1");
+
+ get_perf_context()->Reset();
+ Get(1, "k1");
+ ASSERT_EQ(2, static_cast<int>(get_perf_context()->get_from_memtable_count));
+ get_perf_context()->Reset();
+ Get(1, "k2");
+ ASSERT_EQ(1, static_cast<int>(get_perf_context()->get_from_memtable_count));
+
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", big_value));
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.cur-size-active-mem-table", &num));
+ ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+ "rocksdb.num-immutable-mem-table", &num));
+ ASSERT_EQ(num, "2");
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+ ASSERT_EQ(num, "1");
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-imm-mem-tables", &num));
+ ASSERT_EQ(num, "2");
+ get_perf_context()->Reset();
+ Get(1, "k2");
+ ASSERT_EQ(2, static_cast<int>(get_perf_context()->get_from_memtable_count));
+ get_perf_context()->Reset();
+ Get(1, "k3");
+ ASSERT_EQ(1, static_cast<int>(get_perf_context()->get_from_memtable_count));
+ get_perf_context()->Reset();
+ Get(1, "k1");
+ ASSERT_EQ(3, static_cast<int>(get_perf_context()->get_from_memtable_count));
+
+ ASSERT_OK(Flush(1));
+ ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+ "rocksdb.num-immutable-mem-table", &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], DB::Properties::kNumImmutableMemTableFlushed, &num));
+ ASSERT_EQ(num, "3");
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.cur-size-active-mem-table", &value));
+ // "192" is the size of the metadata of two empty skiplists, this would
+ // break if we change the default skiplist implementation
+ ASSERT_GE(value, 192);
+
+ uint64_t int_num;
+ uint64_t base_total_size;
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.estimate-num-keys", &base_total_size));
+
+ ASSERT_OK(dbfull()->Delete(writeOpt, handles_[1], "k2"));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", ""));
+ ASSERT_OK(dbfull()->Delete(writeOpt, handles_[1], "k3"));
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.num-deletes-active-mem-table", &int_num));
+ ASSERT_EQ(int_num, 2U);
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &int_num));
+ ASSERT_EQ(int_num, 3U);
+
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value));
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.num-entries-imm-mem-tables", &int_num));
+ ASSERT_EQ(int_num, 4U);
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.num-deletes-imm-mem-tables", &int_num));
+ ASSERT_EQ(int_num, 2U);
+
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.estimate-num-keys", &int_num));
+ ASSERT_EQ(int_num, base_total_size + 1);
+
+ SetPerfLevel(kDisable);
+ ASSERT_TRUE(GetPerfLevel() == kDisable);
+ } while (ChangeCompactOptions());
+}
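+
+// Minimal sketch of the perf-context pattern used in the test above (assuming
+// an open DB* "db"); the perf level must be raised (the test uses kEnableTime)
+// for get_from_memtable_count to be populated:
+//
+//   SetPerfLevel(kEnableTime);
+//   get_perf_context()->Reset();
+//   std::string value;
+//   db->Get(ReadOptions(), "k1", &value);
+//   uint64_t memtable_gets = get_perf_context()->get_from_memtable_count;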
+
+// TODO(techdept) : Disabled flaky test #12863555
+TEST_F(DBPropertiesTest, DISABLED_GetProperty) {
+ // Set sizes to both background thread pool to be 1 and block them.
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ env_->SetBackgroundThreads(1, Env::LOW);
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ test::SleepingBackgroundTask sleeping_task_high;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_high, Env::Priority::HIGH);
+
+ Options options = CurrentOptions();
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ options.compaction_style = kCompactionStyleUniversal;
+ options.level0_file_num_compaction_trigger = 1;
+ options.compaction_options_universal.size_ratio = 50;
+ options.max_background_compactions = 1;
+ options.max_background_flushes = 1;
+ options.max_write_buffer_number = 10;
+ options.min_write_buffer_number_to_merge = 1;
+ options.max_write_buffer_size_to_maintain = 0;
+ options.write_buffer_size = 1000000;
+ Reopen(options);
+
+ std::string big_value(1000000 * 2, 'x');
+ std::string num;
+ uint64_t int_num;
+ SetPerfLevel(kEnableTime);
+
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_EQ(int_num, 0U);
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.estimate-live-data-size", &int_num));
+ ASSERT_EQ(int_num, 0U);
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value));
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+ ASSERT_EQ(num, "1");
+ get_perf_context()->Reset();
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value));
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
+ ASSERT_EQ(num, "1");
+ ASSERT_OK(dbfull()->Delete(writeOpt, "k-non-existing"));
+ ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value));
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
+ ASSERT_EQ(num, "2");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num));
+ ASSERT_EQ(num, "1");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+ ASSERT_EQ(num, "2");
+ // Verify the same set of properties through GetIntProperty
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.num-immutable-mem-table", &int_num));
+ ASSERT_EQ(int_num, 2U);
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.mem-table-flush-pending", &int_num));
+ ASSERT_EQ(int_num, 1U);
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.compaction-pending", &int_num));
+ ASSERT_EQ(int_num, 0U);
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &int_num));
+ ASSERT_EQ(int_num, 2U);
+
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_EQ(int_num, 0U);
+
+ sleeping_task_high.WakeUp();
+ sleeping_task_high.WaitUntilDone();
+ dbfull()->TEST_WaitForFlushMemTable();
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k4", big_value));
+ ASSERT_OK(dbfull()->Put(writeOpt, "k5", big_value));
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
+ ASSERT_EQ(num, "1");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+ ASSERT_EQ(num, "4");
+
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_GT(int_num, 0U);
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+
+ // Wait for compaction to be done. This is important because otherwise RocksDB
+ // might schedule a compaction when reopening the database, failing assertion
+ // (A) as a result.
+ dbfull()->TEST_WaitForCompact();
+ options.max_open_files = 10;
+ Reopen(options);
+ // After reopening, no table reader is loaded, so no memory for table readers
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_EQ(int_num, 0U); // (A)
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &int_num));
+ ASSERT_GT(int_num, 0U);
+
+ // After reading a key, at least one table reader is loaded.
+ Get("k5");
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_GT(int_num, 0U);
+
+ // Test rocksdb.num-live-versions
+ {
+ options.level0_file_num_compaction_trigger = 20;
+ Reopen(options);
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+ ASSERT_EQ(int_num, 1U);
+
+ // Use an iterator to hold current version
+ std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k6", big_value));
+ Flush();
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+ ASSERT_EQ(int_num, 2U);
+
+ // Use an iterator to hold current version
+ std::unique_ptr<Iterator> iter2(dbfull()->NewIterator(ReadOptions()));
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k7", big_value));
+ Flush();
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+ ASSERT_EQ(int_num, 3U);
+
+ iter2.reset();
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+ ASSERT_EQ(int_num, 2U);
+
+ iter1.reset();
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+ ASSERT_EQ(int_num, 1U);
+ }
+}
+
+TEST_F(DBPropertiesTest, ApproximateMemoryUsage) {
+ const int kNumRounds = 10;
+ // TODO(noetzli) kFlushesPerRound does not really correlate with how many
+ // flushes happen.
+ const int kFlushesPerRound = 10;
+ const int kWritesPerFlush = 10;
+ const int kKeySize = 100;
+ const int kValueSize = 1000;
+ Options options;
+ options.write_buffer_size = 1000; // small write buffer
+ options.min_write_buffer_number_to_merge = 4;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+
+ std::vector<Iterator*> iters;
+
+ uint64_t active_mem;
+ uint64_t unflushed_mem;
+ uint64_t all_mem;
+ uint64_t prev_all_mem;
+
+ // Phase 0. Verify that the initial values of all these properties are the
+ // same, as we have no mem-tables.
+ dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+ dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+ dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+ ASSERT_EQ(all_mem, active_mem);
+ ASSERT_EQ(all_mem, unflushed_mem);
+
+ // Phase 1. Simply issue Put() and expect "cur-size-all-mem-tables" to equal
+ // "size-all-mem-tables".
+ for (int r = 0; r < kNumRounds; ++r) {
+ for (int f = 0; f < kFlushesPerRound; ++f) {
+ for (int w = 0; w < kWritesPerFlush; ++w) {
+ Put(RandomString(&rnd, kKeySize), RandomString(&rnd, kValueSize));
+ }
+ }
+ // Make sure that there is no flush between getting the two properties.
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+ dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+ // With no iterators, these two numbers should be the same.
+ ASSERT_EQ(unflushed_mem, all_mem);
+ }
+ prev_all_mem = all_mem;
+
+ // Phase 2. Keep issuing Put() but also create new iterators. This time we
+ // expect "size-all-mem-tables" > "cur-size-all-mem-tables".
+ for (int r = 0; r < kNumRounds; ++r) {
+ iters.push_back(db_->NewIterator(ReadOptions()));
+ for (int f = 0; f < kFlushesPerRound; ++f) {
+ for (int w = 0; w < kWritesPerFlush; ++w) {
+ Put(RandomString(&rnd, kKeySize), RandomString(&rnd, kValueSize));
+ }
+ }
+ // Force a flush to prevent one from happening between getting the two
+ // properties, or after getting them and before the next round.
+ Flush();
+
+ // Iterators created in this phase pin flushed memtables, so expect
+ // "size-all-mem-tables" to exceed the other two counters.
+ dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+ dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+ dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+ ASSERT_GT(all_mem, active_mem);
+ ASSERT_GT(all_mem, unflushed_mem);
+ ASSERT_GT(all_mem, prev_all_mem);
+ prev_all_mem = all_mem;
+ }
+
+ // Phase 3. Delete iterators and expect "size-all-mem-tables" to shrink
+ // whenever we release an iterator.
+ for (auto* iter : iters) {
+ delete iter;
+ dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+ // Expect the size to shrink
+ ASSERT_LT(all_mem, prev_all_mem);
+ prev_all_mem = all_mem;
+ }
+
+ // Phase 4. After all iterators are released, expect all three counters to be
+ // the same.
+ dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+ dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+ dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+ ASSERT_EQ(active_mem, unflushed_mem);
+ ASSERT_EQ(unflushed_mem, all_mem);
+
+ // Phase 5. Reopen, and expect all three counters to be the same again.
+ Reopen(options);
+ dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+ dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+ dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+ ASSERT_EQ(active_mem, unflushed_mem);
+ ASSERT_EQ(unflushed_mem, all_mem);
+}
+
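+// A minimal usage sketch (illustrative only, not part of the upstream test
+// above): querying the three mem-table size properties the test exercises.
+// The test relies on the ordering cur-size-active-mem-table <=
+// cur-size-all-mem-tables <= size-all-mem-tables, since the last property
+// also counts flushed memtables still pinned by outstanding iterators.
+// `MemTableSizes` and `GetMemTableSizes` are hypothetical helper names.
+struct MemTableSizes {
+ uint64_t active = 0;     // mutable memtable only
+ uint64_t unflushed = 0;  // mutable + immutable (unflushed) memtables
+ uint64_t all = 0;        // unflushed + memtables pinned by iterators
+};
+
+inline MemTableSizes GetMemTableSizes(DB* db) {
+ MemTableSizes sizes;
+ db->GetIntProperty("rocksdb.cur-size-active-mem-table", &sizes.active);
+ db->GetIntProperty("rocksdb.cur-size-all-mem-tables", &sizes.unflushed);
+ db->GetIntProperty("rocksdb.size-all-mem-tables", &sizes.all);
+ return sizes;
+}
+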
+TEST_F(DBPropertiesTest, EstimatePendingCompBytes) {
+ // Set the size of each background thread pool to 1 and block the
+ // low-priority pool.
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ env_->SetBackgroundThreads(1, Env::LOW);
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ Options options = CurrentOptions();
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ options.compaction_style = kCompactionStyleLevel;
+ options.level0_file_num_compaction_trigger = 2;
+ options.max_background_compactions = 1;
+ options.max_background_flushes = 1;
+ options.max_write_buffer_number = 10;
+ options.min_write_buffer_number_to_merge = 1;
+ options.max_write_buffer_size_to_maintain = 0;
+ options.write_buffer_size = 1000000;
+ Reopen(options);
+
+ std::string big_value(1000000 * 2, 'x');
+ std::string num;
+ uint64_t int_num;
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value));
+ Flush();
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ "rocksdb.estimate-pending-compaction-bytes", &int_num));
+ ASSERT_EQ(int_num, 0U);
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value));
+ Flush();
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ "rocksdb.estimate-pending-compaction-bytes", &int_num));
+ ASSERT_GT(int_num, 0U);
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value));
+ Flush();
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ "rocksdb.estimate-pending-compaction-bytes", &int_num));
+ ASSERT_GT(int_num, 0U);
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ "rocksdb.estimate-pending-compaction-bytes", &int_num));
+ ASSERT_EQ(int_num, 0U);
+}
+
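+// Illustrative sketch (not part of the upstream test above): an application
+// could poll the same property to decide whether to back off writes while
+// compaction debt is high. `PendingCompactionBytesAbove` and the soft limit
+// are hypothetical, not RocksDB APIs or options.
+inline bool PendingCompactionBytesAbove(DB* db, uint64_t soft_limit_bytes) {
+ uint64_t pending_bytes = 0;
+ // GetIntProperty() returns false if the property is unavailable for this
+ // DB configuration; treat that as "not above the limit".
+ if (!db->GetIntProperty("rocksdb.estimate-pending-compaction-bytes",
+ &pending_bytes)) {
+ return false;
+ }
+ return pending_bytes > soft_limit_bytes;
+}
+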
+TEST_F(DBPropertiesTest, EstimateCompressionRatio) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+ const int kNumL0Files = 3;
+ const int kNumEntriesPerFile = 1000;
+
+ Options options = CurrentOptions();
+ options.compression_per_level = {kNoCompression, kSnappyCompression};
+ options.disable_auto_compactions = true;
+ options.num_levels = 2;
+ Reopen(options);
+
+ // The compression ratio is -1.0 when there are no files at the level.
+ ASSERT_EQ(CompressionRatioAtLevel(0), -1.0);
+
+ const std::string kVal(100, 'a');
+ for (int i = 0; i < kNumL0Files; ++i) {
+ for (int j = 0; j < kNumEntriesPerFile; ++j) {
+ // Put the common data ("key") at the end to prevent delta encoding from
+ // compressing the keys effectively
+ std::string key = ToString(i) + ToString(j) + "key";
+ ASSERT_OK(dbfull()->Put(WriteOptions(), key, kVal));
+ }
+ Flush();
+ }
+
+ // no compression at L0, so ratio is less than one
+ ASSERT_LT(CompressionRatioAtLevel(0), 1.0);
+ ASSERT_GT(CompressionRatioAtLevel(0), 0.0);
+ ASSERT_EQ(CompressionRatioAtLevel(1), -1.0);
+
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+
+ ASSERT_EQ(CompressionRatioAtLevel(0), -1.0);
+ // Data at L1 should be highly compressed thanks to Snappy and redundant data
+ // in values (ratio is 12.846 as of 4/19/2016).
+ ASSERT_GT(CompressionRatioAtLevel(1), 10.0);
+}
+
+#endif // ROCKSDB_LITE
+
+class CountingUserTblPropCollector : public TablePropertiesCollector {
+ public:
+ const char* Name() const override { return "CountingUserTblPropCollector"; }
+
+ Status Finish(UserCollectedProperties* properties) override {
+ std::string encoded;
+ PutVarint32(&encoded, count_);
+ *properties = UserCollectedProperties{
+ {"CountingUserTblPropCollector", message_}, {"Count", encoded},
+ };
+ return Status::OK();
+ }
+
+ Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/,
+ EntryType /*type*/, SequenceNumber /*seq*/,
+ uint64_t /*file_size*/) override {
+ ++count_;
+ return Status::OK();
+ }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties{};
+ }
+
+ private:
+ std::string message_ = "Rocksdb";
+ uint32_t count_ = 0;
+};
+
+class CountingUserTblPropCollectorFactory
+ : public TablePropertiesCollectorFactory {
+ public:
+ explicit CountingUserTblPropCollectorFactory(
+ uint32_t expected_column_family_id)
+ : expected_column_family_id_(expected_column_family_id),
+ num_created_(0) {}
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context context) override {
+ EXPECT_EQ(expected_column_family_id_, context.column_family_id);
+ num_created_++;
+ return new CountingUserTblPropCollector();
+ }
+ const char* Name() const override {
+ return "CountingUserTblPropCollectorFactory";
+ }
+ void set_expected_column_family_id(uint32_t v) {
+ expected_column_family_id_ = v;
+ }
+ uint32_t expected_column_family_id_;
+ uint32_t num_created_;
+};
+
+class CountingDeleteTabPropCollector : public TablePropertiesCollector {
+ public:
+ const char* Name() const override { return "CountingDeleteTabPropCollector"; }
+
+ Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/,
+ EntryType type, SequenceNumber /*seq*/,
+ uint64_t /*file_size*/) override {
+ if (type == kEntryDelete) {
+ num_deletes_++;
+ }
+ return Status::OK();
+ }
+
+ bool NeedCompact() const override { return num_deletes_ > 10; }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties{};
+ }
+
+ Status Finish(UserCollectedProperties* properties) override {
+ *properties =
+ UserCollectedProperties{{"num_delete", ToString(num_deletes_)}};
+ return Status::OK();
+ }
+
+ private:
+ uint32_t num_deletes_ = 0;
+};
+
+class CountingDeleteTabPropCollectorFactory
+ : public TablePropertiesCollectorFactory {
+ public:
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context /*context*/) override {
+ return new CountingDeleteTabPropCollector();
+ }
+ const char* Name() const override {
+ return "CountingDeleteTabPropCollectorFactory";
+ }
+};
+
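+// Illustrative sketch (not part of the upstream tests): how a collector
+// factory such as the ones above is wired into Options, mirroring what the
+// tests below do explicitly. `WithCollectorFactory` is a hypothetical helper.
+inline Options WithCollectorFactory(
+ Options options,
+ std::shared_ptr<TablePropertiesCollectorFactory> factory) {
+ options.table_properties_collector_factories.resize(1);
+ options.table_properties_collector_factories[0] = factory;
+ return options;
+}
+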
+#ifndef ROCKSDB_LITE
+TEST_F(DBPropertiesTest, GetUserDefinedTableProperties) {
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = (1 << 30);
+ options.table_properties_collector_factories.resize(1);
+ std::shared_ptr<CountingUserTblPropCollectorFactory> collector_factory =
+ std::make_shared<CountingUserTblPropCollectorFactory>(0);
+ options.table_properties_collector_factories[0] = collector_factory;
+ Reopen(options);
+ // Create 4 tables
+ for (int table = 0; table < 4; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ db_->Put(WriteOptions(), ToString(table * 100 + i), "val");
+ }
+ db_->Flush(FlushOptions());
+ }
+
+ TablePropertiesCollection props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+ ASSERT_EQ(4U, props.size());
+ uint32_t sum = 0;
+ for (const auto& item : props) {
+ auto& user_collected = item.second->user_collected_properties;
+ ASSERT_TRUE(user_collected.find("CountingUserTblPropCollector") !=
+ user_collected.end());
+ ASSERT_EQ(user_collected.at("CountingUserTblPropCollector"), "Rocksdb");
+ ASSERT_TRUE(user_collected.find("Count") != user_collected.end());
+ Slice key(user_collected.at("Count"));
+ uint32_t count;
+ ASSERT_TRUE(GetVarint32(&key, &count));
+ sum += count;
+ }
+ ASSERT_EQ(10u + 11u + 12u + 13u, sum);
+
+ ASSERT_GT(collector_factory->num_created_, 0U);
+ collector_factory->num_created_ = 0;
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+ ASSERT_GT(collector_factory->num_created_, 0U);
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBPropertiesTest, UserDefinedTablePropertiesContext) {
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 3;
+ options.table_properties_collector_factories.resize(1);
+ std::shared_ptr<CountingUserTblPropCollectorFactory> collector_factory =
+ std::make_shared<CountingUserTblPropCollectorFactory>(1);
+ options.table_properties_collector_factories[0] = collector_factory;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ // Create 2 files
+ for (int table = 0; table < 2; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ Put(1, ToString(table * 100 + i), "val");
+ }
+ Flush(1);
+ }
+ ASSERT_GT(collector_factory->num_created_, 0U);
+
+ collector_factory->num_created_ = 0;
+ // Trigger automatic compactions.
+ for (int table = 0; table < 3; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ Put(1, ToString(table * 100 + i), "val");
+ }
+ Flush(1);
+ dbfull()->TEST_WaitForCompact();
+ }
+ ASSERT_GT(collector_factory->num_created_, 0U);
+
+ collector_factory->num_created_ = 0;
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+ ASSERT_GT(collector_factory->num_created_, 0U);
+
+ // Come back to write to default column family
+ collector_factory->num_created_ = 0;
+ collector_factory->set_expected_column_family_id(0); // default CF
+ // Create 2 tables in the default column family
+ for (int table = 0; table < 2; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ Put(ToString(table * 100 + i), "val");
+ }
+ Flush();
+ }
+ ASSERT_GT(collector_factory->num_created_, 0U);
+
+ collector_factory->num_created_ = 0;
+ // Trigger automatic compactions.
+ for (int table = 0; table < 3; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ Put(ToString(table * 100 + i), "val");
+ }
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ }
+ ASSERT_GT(collector_factory->num_created_, 0U);
+
+ collector_factory->num_created_ = 0;
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+ ASSERT_GT(collector_factory->num_created_, 0U);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBPropertiesTest, TablePropertiesNeedCompactTest) {
+ Random rnd(301);
+
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = 4096;
+ options.max_write_buffer_number = 8;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 4;
+ options.target_file_size_base = 2048;
+ options.max_bytes_for_level_base = 10240;
+ options.max_bytes_for_level_multiplier = 4;
+ options.soft_pending_compaction_bytes_limit = 1024 * 1024;
+ options.num_levels = 8;
+ options.env = env_;
+
+ std::shared_ptr<TablePropertiesCollectorFactory> collector_factory =
+ std::make_shared<CountingDeleteTabPropCollectorFactory>();
+ options.table_properties_collector_factories.resize(1);
+ options.table_properties_collector_factories[0] = collector_factory;
+
+ DestroyAndReopen(options);
+
+ const int kMaxKey = 1000;
+ for (int i = 0; i < kMaxKey; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 102)));
+ ASSERT_OK(Put(Key(kMaxKey + i), RandomString(&rnd, 102)));
+ }
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ if (NumTableFilesAtLevel(0) == 1) {
+ // Clear Level 0 so that when we later flush a file with deletions, we don't
+ // trigger an organic compaction.
+ ASSERT_OK(Put(Key(0), ""));
+ ASSERT_OK(Put(Key(kMaxKey * 2), ""));
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+ {
+ int c = 0;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ iter->Seek(Key(kMaxKey - 100));
+ while (iter->Valid() && iter->key().compare(Key(kMaxKey + 100)) < 0) {
+ iter->Next();
+ ++c;
+ }
+ ASSERT_EQ(c, 200);
+ }
+
+ Delete(Key(0));
+ for (int i = kMaxKey - 100; i < kMaxKey + 100; i++) {
+ Delete(Key(i));
+ }
+ Delete(Key(kMaxKey * 2));
+
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+
+ {
+ SetPerfLevel(kEnableCount);
+ get_perf_context()->Reset();
+ int c = 0;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ iter->Seek(Key(kMaxKey - 100));
+ while (iter->Valid() && iter->key().compare(Key(kMaxKey + 100)) < 0) {
+ // Count keys that remain visible in the deleted range; expect none below.
+ iter->Next();
+ ++c;
+ }
+ ASSERT_EQ(c, 0);
+ ASSERT_LT(get_perf_context()->internal_delete_skipped_count, 30u);
+ ASSERT_LT(get_perf_context()->internal_key_skipped_count, 30u);
+ SetPerfLevel(kDisable);
+ }
+}
+
+TEST_F(DBPropertiesTest, NeedCompactHintPersistentTest) {
+ Random rnd(301);
+
+ Options options;
+ options.create_if_missing = true;
+ options.max_write_buffer_number = 8;
+ options.level0_file_num_compaction_trigger = 10;
+ options.level0_slowdown_writes_trigger = 10;
+ options.level0_stop_writes_trigger = 10;
+ options.disable_auto_compactions = true;
+ options.env = env_;
+
+ std::shared_ptr<TablePropertiesCollectorFactory> collector_factory =
+ std::make_shared<CountingDeleteTabPropCollectorFactory>();
+ options.table_properties_collector_factories.resize(1);
+ options.table_properties_collector_factories[0] = collector_factory;
+
+ DestroyAndReopen(options);
+
+ const int kMaxKey = 100;
+ for (int i = 0; i < kMaxKey; i++) {
+ ASSERT_OK(Put(Key(i), ""));
+ }
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+
+ for (int i = 1; i < kMaxKey - 1; i++) {
+ Delete(Key(i));
+ }
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+
+ // Restart the DB. Although the number of files didn't reach
+ // options.level0_file_num_compaction_trigger, compaction should
+ // still be triggered because of the need-compaction hint.
+ options.disable_auto_compactions = false;
+ Reopen(options);
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ {
+ SetPerfLevel(kEnableCount);
+ get_perf_context()->Reset();
+ int c = 0;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ for (iter->Seek(Key(0)); iter->Valid(); iter->Next()) {
+ c++;
+ }
+ ASSERT_EQ(c, 2);
+ ASSERT_EQ(get_perf_context()->internal_delete_skipped_count, 0);
+ // We iterate every key twice. Is it a bug?
+ ASSERT_LE(get_perf_context()->internal_key_skipped_count, 2);
+ SetPerfLevel(kDisable);
+ }
+}
+
+TEST_F(DBPropertiesTest, EstimateNumKeysUnderflow) {
+ Options options;
+ Reopen(options);
+ Put("foo", "bar");
+ Delete("foo");
+ Delete("foo");
+ uint64_t num_keys = 0;
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &num_keys));
+ ASSERT_EQ(0, num_keys);
+}
+
+TEST_F(DBPropertiesTest, EstimateOldestKeyTime) {
+ std::unique_ptr<MockTimeEnv> mock_env(new MockTimeEnv(Env::Default()));
+ uint64_t oldest_key_time = 0;
+ Options options;
+ options.env = mock_env.get();
+
+ // "rocksdb.estimate-oldest-key-time" only available to fifo compaction.
+ mock_env->set_current_time(100);
+ for (auto compaction : {kCompactionStyleLevel, kCompactionStyleUniversal,
+ kCompactionStyleNone}) {
+ options.compaction_style = compaction;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_FALSE(dbfull()->GetIntProperty(
+ DB::Properties::kEstimateOldestKeyTime, &oldest_key_time));
+ }
+
+ options.compaction_style = kCompactionStyleFIFO;
+ options.ttl = 300;
+ options.compaction_options_fifo.allow_compaction = false;
+ DestroyAndReopen(options);
+
+ mock_env->set_current_time(100);
+ ASSERT_OK(Put("k1", "v1"));
+ ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time));
+ ASSERT_EQ(100, oldest_key_time);
+ ASSERT_OK(Flush());
+ ASSERT_EQ("1", FilesPerLevel());
+ ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time));
+ ASSERT_EQ(100, oldest_key_time);
+
+ mock_env->set_current_time(200);
+ ASSERT_OK(Put("k2", "v2"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("2", FilesPerLevel());
+ ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time));
+ ASSERT_EQ(100, oldest_key_time);
+
+ mock_env->set_current_time(300);
+ ASSERT_OK(Put("k3", "v3"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("3", FilesPerLevel());
+ ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time));
+ ASSERT_EQ(100, oldest_key_time);
+
+ mock_env->set_current_time(450);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("2", FilesPerLevel());
+ ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time));
+ ASSERT_EQ(200, oldest_key_time);
+
+ mock_env->set_current_time(550);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("1", FilesPerLevel());
+ ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time));
+ ASSERT_EQ(300, oldest_key_time);
+
+ mock_env->set_current_time(650);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("", FilesPerLevel());
+ ASSERT_FALSE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time));
+
+ // Close before mock_env destructs.
+ Close();
+}
+
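+// Illustrative sketch (not part of the upstream test above): deriving an
+// approximate age of the oldest data from the property exercised above. As
+// the test shows, the property is only populated for FIFO compaction, so a
+// false return simply means "unknown". `ApproximateOldestDataAge` is a
+// hypothetical helper name.
+inline bool ApproximateOldestDataAge(DB* db, Env* env, uint64_t* age_seconds) {
+ uint64_t oldest_key_time = 0;
+ if (!db->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time)) {
+ return false;
+ }
+ int64_t now = 0;
+ if (!env->GetCurrentTime(&now).ok()) {
+ return false;
+ }
+ *age_seconds = static_cast<uint64_t>(now) > oldest_key_time
+ ? static_cast<uint64_t>(now) - oldest_key_time
+ : 0;
+ return true;
+}
+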
+TEST_F(DBPropertiesTest, SstFilesSize) {
+ struct TestListener : public EventListener {
+ void OnCompactionCompleted(DB* db,
+ const CompactionJobInfo& /*info*/) override {
+ assert(callback_triggered == false);
+ assert(size_before_compaction > 0);
+ callback_triggered = true;
+ uint64_t total_sst_size = 0;
+ uint64_t live_sst_size = 0;
+ bool ok = db->GetIntProperty(DB::Properties::kTotalSstFilesSize,
+ &total_sst_size);
+ ASSERT_TRUE(ok);
+ // total_sst_size includes files both before and after compaction.
+ ASSERT_GT(total_sst_size, size_before_compaction);
+ ok =
+ db->GetIntProperty(DB::Properties::kLiveSstFilesSize, &live_sst_size);
+ ASSERT_TRUE(ok);
+ // live_sst_size only includes files after compaction.
+ ASSERT_GT(live_sst_size, 0);
+ ASSERT_LT(live_sst_size, size_before_compaction);
+ }
+
+ uint64_t size_before_compaction = 0;
+ bool callback_triggered = false;
+ };
+ std::shared_ptr<TestListener> listener = std::make_shared<TestListener>();
+
+ Options options;
+ options.disable_auto_compactions = true;
+ options.listeners.push_back(listener);
+ Reopen(options);
+
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put("key" + ToString(i), std::string(1000, 'v')));
+ }
+ ASSERT_OK(Flush());
+ for (int i = 0; i < 5; i++) {
+ ASSERT_OK(Delete("key" + ToString(i)));
+ }
+ ASSERT_OK(Flush());
+ uint64_t sst_size;
+ bool ok = db_->GetIntProperty(DB::Properties::kTotalSstFilesSize, &sst_size);
+ ASSERT_TRUE(ok);
+ ASSERT_GT(sst_size, 0);
+ listener->size_before_compaction = sst_size;
+ // Compact to clean all keys and trigger listener.
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_TRUE(listener->callback_triggered);
+}
+
+TEST_F(DBPropertiesTest, MinObsoleteSstNumberToKeep) {
+ class TestListener : public EventListener {
+ public:
+ void OnTableFileCreated(const TableFileCreationInfo& info) override {
+ if (info.reason == TableFileCreationReason::kCompaction) {
+ // Verify the property indicates that SSTs created by a running
+ // compaction cannot be deleted.
+ uint64_t created_file_num;
+ FileType created_file_type;
+ std::string filename =
+ info.file_path.substr(info.file_path.rfind('/') + 1);
+ ASSERT_TRUE(
+ ParseFileName(filename, &created_file_num, &created_file_type));
+ ASSERT_EQ(kTableFile, created_file_type);
+
+ uint64_t keep_sst_lower_bound;
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kMinObsoleteSstNumberToKeep,
+ &keep_sst_lower_bound));
+
+ ASSERT_LE(keep_sst_lower_bound, created_file_num);
+ validated_ = true;
+ }
+ }
+
+ void SetDB(DB* db) { db_ = db; }
+
+ int GetNumCompactions() { return num_compactions_; }
+
+ // True if we've verified the property for at least one output file
+ bool Validated() { return validated_; }
+
+ private:
+ int num_compactions_ = 0;
+ bool validated_ = false;
+ DB* db_ = nullptr;
+ };
+
+ const int kNumL0Files = 4;
+
+ std::shared_ptr<TestListener> listener = std::make_shared<TestListener>();
+
+ Options options = CurrentOptions();
+ options.listeners.push_back(listener);
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ DestroyAndReopen(options);
+ listener->SetDB(db_);
+
+ for (int i = 0; i < kNumL0Files; ++i) {
+ // Make sure they overlap in keyspace to prevent trivial move
+ Put("key1", "val");
+ Put("key2", "val");
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_TRUE(listener->Validated());
+}
+
+TEST_F(DBPropertiesTest, BlockCacheProperties) {
+ Options options;
+ uint64_t value;
+
+ // Block cache properties are not available for table formats other than
+ // block-based table.
+ options.table_factory.reset(NewPlainTableFactory());
+ Reopen(options);
+ ASSERT_FALSE(
+ db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_FALSE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ ASSERT_FALSE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+
+ options.table_factory.reset(NewCuckooTableFactory());
+ Reopen(options);
+ ASSERT_FALSE(
+ db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_FALSE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ ASSERT_FALSE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+
+ // Block cache properties are not available if block cache is not used.
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_FALSE(
+ db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_FALSE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ ASSERT_FALSE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+
+ // Test with empty block cache.
+ constexpr size_t kCapacity = 100;
+ LRUCacheOptions co;
+ co.capacity = kCapacity;
+ co.num_shard_bits = 0;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ auto block_cache = NewLRUCache(co);
+ table_options.block_cache = block_cache;
+ table_options.no_block_cache = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ ASSERT_EQ(0, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+ ASSERT_EQ(0, value);
+
+ // Insert unpinned item to the cache and check size.
+ constexpr size_t kSize1 = 50;
+ block_cache->Insert("item1", nullptr /*value*/, kSize1, nullptr /*deleter*/);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ ASSERT_EQ(kSize1, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+ ASSERT_EQ(0, value);
+
+ // Insert pinned item to the cache and check size.
+ constexpr size_t kSize2 = 30;
+ Cache::Handle* item2 = nullptr;
+ block_cache->Insert("item2", nullptr /*value*/, kSize2, nullptr /*deleter*/,
+ &item2);
+ ASSERT_NE(nullptr, item2);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ ASSERT_EQ(kSize1 + kSize2, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+ ASSERT_EQ(kSize2, value);
+
+ // Insert another pinned item to make the cache over-sized.
+ constexpr size_t kSize3 = 80;
+ Cache::Handle* item3 = nullptr;
+ block_cache->Insert("item3", nullptr /*value*/, kSize3, nullptr /*deleter*/,
+ &item3);
+ ASSERT_NE(nullptr, item3);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ // Item 1 is evicted.
+ ASSERT_EQ(kSize2 + kSize3, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+ ASSERT_EQ(kSize2 + kSize3, value);
+
+ // Check size after release.
+ block_cache->Release(item2);
+ block_cache->Release(item3);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ // item2 will be evicted, while item3 remains in the cache after release.
+ ASSERT_EQ(kSize3, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+ ASSERT_EQ(0, value);
+}
+
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_range_del_test.cc b/src/rocksdb/db/db_range_del_test.cc
new file mode 100644
index 000000000..15225875d
--- /dev/null
+++ b/src/rocksdb/db/db_range_del_test.cc
@@ -0,0 +1,1660 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "test_util/testutil.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBRangeDelTest : public DBTestBase {
+ public:
+ DBRangeDelTest() : DBTestBase("/db_range_del_test") {}
+
+ std::string GetNumericStr(int key) {
+ uint64_t uint64_key = static_cast<uint64_t>(key);
+ std::string str;
+ str.resize(8);
+ memcpy(&str[0], static_cast<void*>(&uint64_key), 8);
+ return str;
+ }
+};
+
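+// Illustrative sketch (not part of the upstream tests below): the basic
+// DeleteRange call these tests exercise. A range tombstone covers keys in
+// [begin, end), so the end key itself is not deleted. `DeleteKeyRange` is a
+// hypothetical wrapper, shown only to make the API shape explicit.
+inline Status DeleteKeyRange(DB* db, const Slice& begin, const Slice& end) {
+ return db->DeleteRange(WriteOptions(), db->DefaultColumnFamily(), begin,
+ end);
+}
+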
+// PlainTableFactory, WriteBatchWithIndex, and NumTableFilesAtLevel() are not
+// supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(DBRangeDelTest, NonBlockBasedTableNotSupported) {
+ // TODO: figure out why MmapReads trips the iterator pinning assertion in
+ // RangeDelAggregator. Ideally it would be supported; otherwise it should at
+ // least be explicitly unsupported.
+ for (auto config : {kPlainTableAllBytesPrefix, /* kWalDirAndMmapReads */}) {
+ option_config_ = config;
+ DestroyAndReopen(CurrentOptions());
+ ASSERT_TRUE(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ "dr1", "dr1")
+ .IsNotSupported());
+ }
+}
+
+TEST_F(DBRangeDelTest, WriteBatchWithIndexNotSupported) {
+ WriteBatchWithIndex indexedBatch{};
+ ASSERT_TRUE(indexedBatch.DeleteRange(db_->DefaultColumnFamily(), "dr1", "dr1")
+ .IsNotSupported());
+ ASSERT_TRUE(indexedBatch.DeleteRange("dr1", "dr1").IsNotSupported());
+}
+
+TEST_F(DBRangeDelTest, FlushOutputHasOnlyRangeTombstones) {
+ do {
+ DestroyAndReopen(CurrentOptions());
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ "dr1", "dr2"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ } while (ChangeOptions(kRangeDelSkipConfigs));
+}
+
+TEST_F(DBRangeDelTest, CompactionOutputHasOnlyRangeTombstone) {
+ do {
+ Options opts = CurrentOptions();
+ opts.disable_auto_compactions = true;
+ opts.statistics = CreateDBStatistics();
+ DestroyAndReopen(opts);
+
+ // The snapshot protects the range tombstone from being dropped as obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot();
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z");
+ db_->Flush(FlushOptions());
+
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */);
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ ASSERT_EQ(0, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE));
+ db_->ReleaseSnapshot(snapshot);
+ // Skip cuckoo memtables, which do not support snapshots. Skip non-leveled
+ // compactions as the above assertions about the number of files in a level
+ // do not hold true.
+ } while (ChangeOptions(kRangeDelSkipConfigs | kSkipUniversalCompaction |
+ kSkipFIFOCompaction));
+}
+
+TEST_F(DBRangeDelTest, CompactionOutputFilesExactlyFilled) {
+ // regression test for exactly filled compaction output files. Previously
+ // another file would be generated containing all range deletions, which
+ // could invalidate the non-overlapping file boundary invariant.
+ const int kNumPerFile = 4, kNumFiles = 2, kFileBytes = 9 << 10;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.level0_file_num_compaction_trigger = kNumFiles;
+ options.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
+ options.num_levels = 2;
+ options.target_file_size_base = kFileBytes;
+ BlockBasedTableOptions table_options;
+ table_options.block_size_deviation = 50; // each block holds two keys
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+
+ // The snapshot protects the range tombstone from being dropped as obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot();
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), Key(1));
+
+ Random rnd(301);
+ for (int i = 0; i < kNumFiles; ++i) {
+ std::vector<std::string> values;
+ // Write 12K (4 values, each 3K)
+ for (int j = 0; j < kNumPerFile; j++) {
+ values.push_back(RandomString(&rnd, 3 << 10));
+ ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
+ if (j == 0 && i > 0) {
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ }
+ }
+ // put extra key to trigger final flush
+ ASSERT_OK(Put("", ""));
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */);
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, MaxCompactionBytesCutsOutputFiles) {
+ // Ensures that when a range deletion spans multiple compaction output files
+ // cut by max_compaction_bytes, the output files still have non-overlapping
+ // key-ranges.
+ // https://github.com/facebook/rocksdb/issues/1778
+ const int kNumFiles = 2, kNumPerFile = 1 << 8, kBytesPerVal = 1 << 12;
+ Options opts = CurrentOptions();
+ opts.comparator = test::Uint64Comparator();
+ opts.disable_auto_compactions = true;
+ opts.level0_file_num_compaction_trigger = kNumFiles;
+ opts.max_compaction_bytes = kNumPerFile * kBytesPerVal;
+ opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
+ // Want max_compaction_bytes to trigger the end of compaction output file, not
+ // target_file_size_base, so make the latter much bigger
+ opts.target_file_size_base = 100 * opts.max_compaction_bytes;
+ Reopen(opts);
+
+ // The snapshot protects the range tombstone from being dropped as obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+ // It spans the whole key-range, thus will be included in all output files
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ GetNumericStr(0),
+ GetNumericStr(kNumFiles * kNumPerFile - 1)));
+ Random rnd(301);
+ for (int i = 0; i < kNumFiles; ++i) {
+ std::vector<std::string> values;
+ // Write 1MB (256 values, each 4K)
+ for (int j = 0; j < kNumPerFile; j++) {
+ values.push_back(RandomString(&rnd, kBytesPerVal));
+ ASSERT_OK(Put(GetNumericStr(kNumPerFile * i + j), values[j]));
+ }
+ // extra entry to trigger SpecialSkipListFactory's flush
+ ASSERT_OK(Put(GetNumericStr(kNumPerFile), ""));
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+ }
+
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */);
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GE(NumTableFilesAtLevel(1), 2);
+
+ std::vector<std::vector<FileMetaData>> files;
+ dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
+
+ for (size_t i = 0; i < files[1].size() - 1; ++i) {
+ ASSERT_TRUE(InternalKeyComparator(opts.comparator)
+ .Compare(files[1][i].largest, files[1][i + 1].smallest) <
+ 0);
+ }
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, SentinelsOmittedFromOutputFile) {
+ // Regression test for bug where sentinel range deletions (i.e., ones with
+ // sequence number of zero) were included in output files.
+ // The snapshot protects the range tombstones from being dropped as obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+ // Gaps between ranges create sentinels in our internal representation.
+ std::vector<std::pair<std::string, std::string>> range_dels = {
+ {"a", "b"}, {"c", "d"}, {"e", "f"}};
+ for (const auto& range_del : range_dels) {
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ range_del.first, range_del.second));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ std::vector<std::vector<FileMetaData>> files;
+ dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
+ ASSERT_GT(files[0][0].fd.smallest_seqno, 0);
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, FlushRangeDelsSameStartKey) {
+ db_->Put(WriteOptions(), "b1", "val");
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c"));
+ db_->Put(WriteOptions(), "b2", "val");
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b"));
+ // first iteration verifies query correctness in memtable, second verifies
+ // query correctness for a single SST file
+ for (int i = 0; i < 2; ++i) {
+ if (i > 0) {
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ }
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), "b1", &value).IsNotFound());
+ ASSERT_OK(db_->Get(ReadOptions(), "b2", &value));
+ }
+}
+
+TEST_F(DBRangeDelTest, CompactRangeDelsSameStartKey) {
+ db_->Put(WriteOptions(), "unused", "val"); // prevents empty after compaction
+ db_->Put(WriteOptions(), "b1", "val");
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(3, NumTableFilesAtLevel(0));
+
+ for (int i = 0; i < 2; ++i) {
+ if (i > 0) {
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */);
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ }
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), "b1", &value).IsNotFound());
+ }
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBRangeDelTest, FlushRemovesCoveredKeys) {
+ const int kNum = 300, kRangeBegin = 50, kRangeEnd = 250;
+ Options opts = CurrentOptions();
+ opts.comparator = test::Uint64Comparator();
+ Reopen(opts);
+
+ // Write a third before snapshot, a third between snapshot and tombstone, and
+ // a third after the tombstone. Keys older than snapshot or newer than the
+ // tombstone should be preserved.
+ const Snapshot* snapshot = nullptr;
+ for (int i = 0; i < kNum; ++i) {
+ if (i == kNum / 3) {
+ snapshot = db_->GetSnapshot();
+ } else if (i == 2 * kNum / 3) {
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd));
+ }
+ db_->Put(WriteOptions(), GetNumericStr(i), "val");
+ }
+ db_->Flush(FlushOptions());
+
+ for (int i = 0; i < kNum; ++i) {
+ ReadOptions read_opts;
+ read_opts.ignore_range_deletions = true;
+ std::string value;
+ if (i < kRangeBegin || i > kRangeEnd || i < kNum / 3 || i >= 2 * kNum / 3) {
+ ASSERT_OK(db_->Get(read_opts, GetNumericStr(i), &value));
+ } else {
+ ASSERT_TRUE(db_->Get(read_opts, GetNumericStr(i), &value).IsNotFound());
+ }
+ }
+ db_->ReleaseSnapshot(snapshot);
+}
+
+// NumTableFilesAtLevel() is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(DBRangeDelTest, CompactionRemovesCoveredKeys) {
+ const int kNumPerFile = 100, kNumFiles = 4;
+ Options opts = CurrentOptions();
+ opts.comparator = test::Uint64Comparator();
+ opts.disable_auto_compactions = true;
+ opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
+ opts.num_levels = 2;
+ opts.statistics = CreateDBStatistics();
+ Reopen(opts);
+
+ for (int i = 0; i < kNumFiles; ++i) {
+ if (i > 0) {
+ // range tombstone covers first half of the previous file
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ GetNumericStr((i - 1) * kNumPerFile),
+ GetNumericStr((i - 1) * kNumPerFile + kNumPerFile / 2));
+ }
+ // Make sure a given key appears in each file so compaction won't be able to
+ // use trivial move, which would happen if the ranges were non-overlapping.
+ // Also, we need an extra element since flush is only triggered when the
+ // number of keys is one greater than SpecialSkipListFactory's limit.
+ // We choose a key outside the key-range used by the test to avoid conflict.
+ db_->Put(WriteOptions(), GetNumericStr(kNumPerFile * kNumFiles), "val");
+
+ for (int j = 0; j < kNumPerFile; ++j) {
+ db_->Put(WriteOptions(), GetNumericStr(i * kNumPerFile + j), "val");
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+ }
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(1), 0);
+ ASSERT_EQ((kNumFiles - 1) * kNumPerFile / 2,
+ TestGetTickerCount(opts, COMPACTION_KEY_DROP_RANGE_DEL));
+
+ for (int i = 0; i < kNumFiles; ++i) {
+ for (int j = 0; j < kNumPerFile; ++j) {
+ ReadOptions read_opts;
+ read_opts.ignore_range_deletions = true;
+ std::string value;
+ if (i == kNumFiles - 1 || j >= kNumPerFile / 2) {
+ ASSERT_OK(
+ db_->Get(read_opts, GetNumericStr(i * kNumPerFile + j), &value));
+ } else {
+ ASSERT_TRUE(
+ db_->Get(read_opts, GetNumericStr(i * kNumPerFile + j), &value)
+ .IsNotFound());
+ }
+ }
+ }
+}
+
+TEST_F(DBRangeDelTest, ValidLevelSubcompactionBoundaries) {
+ const int kNumPerFile = 100, kNumFiles = 4, kFileBytes = 100 << 10;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.level0_file_num_compaction_trigger = kNumFiles;
+ options.max_bytes_for_level_base = 2 * kFileBytes;
+ options.max_subcompactions = 4;
+ options.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
+ options.num_levels = 3;
+ options.target_file_size_base = kFileBytes;
+ options.target_file_size_multiplier = 1;
+ Reopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < kNumFiles; ++j) {
+ if (i > 0) {
+ // delete [95,105) in two files, [295,305) in next two
+ int mid = (j + (1 - j % 2)) * kNumPerFile;
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(mid - 5), Key(mid + 5));
+ }
+ std::vector<std::string> values;
+ // Write 100KB (100 values, each 1K)
+ for (int k = 0; k < kNumPerFile; k++) {
+ values.push_back(RandomString(&rnd, 990));
+ ASSERT_OK(Put(Key(j * kNumPerFile + k), values[k]));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put("", ""));
+ dbfull()->TEST_WaitForFlushMemTable();
+ if (j < kNumFiles - 1) {
+ // background compaction may happen early for kNumFiles'th file
+ ASSERT_EQ(NumTableFilesAtLevel(0), j + 1);
+ }
+ if (j == options.level0_file_num_compaction_trigger - 1) {
+ // When i == 1, compaction will output some files to L1, at which point
+ // L1 is not bottommost so range deletions cannot be compacted away. The
+ // new L1 files must be generated with non-overlapping key ranges even
+ // though multiple subcompactions see the same ranges deleted, else an
+ // assertion will fail.
+ //
+ // Only enable auto-compactions when we're ready; otherwise, the
+ // oversized L0 (relative to base_level) causes the compaction to run
+ // earlier.
+ ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()}));
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(),
+ {{"disable_auto_compactions", "true"}}));
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_GT(NumTableFilesAtLevel(1), 0);
+ ASSERT_GT(NumTableFilesAtLevel(2), 0);
+ }
+ }
+ }
+}
+
+TEST_F(DBRangeDelTest, ValidUniversalSubcompactionBoundaries) {
+ const int kNumPerFile = 100, kFilesPerLevel = 4, kNumLevels = 4;
+ Options options = CurrentOptions();
+ options.compaction_options_universal.min_merge_width = kFilesPerLevel;
+ options.compaction_options_universal.max_merge_width = kFilesPerLevel;
+ options.compaction_options_universal.size_ratio = 10;
+ options.compaction_style = kCompactionStyleUniversal;
+ options.level0_file_num_compaction_trigger = kFilesPerLevel;
+ options.max_subcompactions = 4;
+ options.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
+ options.num_levels = kNumLevels;
+ options.target_file_size_base = kNumPerFile << 10;
+ options.target_file_size_multiplier = 1;
+ Reopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < kNumLevels - 1; ++i) {
+ for (int j = 0; j < kFilesPerLevel; ++j) {
+ if (i == kNumLevels - 2) {
+ // insert range deletions [95,105) in two files, [295,305) in next two
+ // to prepare L1 for later manual compaction.
+ int mid = (j + (1 - j % 2)) * kNumPerFile;
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(mid - 5), Key(mid + 5));
+ }
+ std::vector<std::string> values;
+ // Write 100KB (100 values, each 1K)
+ for (int k = 0; k < kNumPerFile; k++) {
+ values.push_back(RandomString(&rnd, 990));
+ ASSERT_OK(Put(Key(j * kNumPerFile + k), values[k]));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put("", ""));
+ dbfull()->TEST_WaitForFlushMemTable();
+ if (j < kFilesPerLevel - 1) {
+ // background compaction may happen early for kFilesPerLevel'th file
+ ASSERT_EQ(NumTableFilesAtLevel(0), j + 1);
+ }
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_GT(NumTableFilesAtLevel(kNumLevels - 1 - i), kFilesPerLevel - 1);
+ }
+ // Now L1-L3 are full. When we compact L1->L2, we should see (1) subcompactions
+ // happen since the input level > 0, and (2) range deletions not dropped since
+ // the output level is not bottommost. If no file boundary assertion fails,
+ // universal compaction + subcompaction + range deletion are probably
+ // compatible.
+ ASSERT_OK(dbfull()->RunManualCompaction(
+ reinterpret_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())
+ ->cfd(),
+ 1 /* input_level */, 2 /* output_level */, CompactRangeOptions(),
+ nullptr /* begin */, nullptr /* end */, true /* exclusive */,
+ true /* disallow_trivial_move */,
+ port::kMaxUint64 /* max_file_num_to_ignore */));
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBRangeDelTest, CompactionRemovesCoveredMergeOperands) {
+ const int kNumPerFile = 3, kNumFiles = 3;
+ Options opts = CurrentOptions();
+ opts.disable_auto_compactions = true;
+ opts.memtable_factory.reset(new SpecialSkipListFactory(2 * kNumPerFile));
+ opts.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ opts.num_levels = 2;
+ Reopen(opts);
+
+ // Iterates kNumFiles * kNumPerFile + 1 times since flushing the last file
+ // requires an extra entry.
+ for (int i = 0; i <= kNumFiles * kNumPerFile; ++i) {
+ if (i % kNumPerFile == 0 && i / kNumPerFile == kNumFiles - 1) {
+ // Delete merge operands from all but the last file
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key",
+ "key_");
+ }
+ std::string val;
+ PutFixed64(&val, i);
+ db_->Merge(WriteOptions(), "key", val);
+ // we need to prevent trivial move using Puts so compaction will actually
+ // process the merge operands.
+ db_->Put(WriteOptions(), "prevent_trivial_move", "");
+ if (i > 0 && i % kNumPerFile == 0) {
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ }
+
+ ReadOptions read_opts;
+ read_opts.ignore_range_deletions = true;
+ std::string expected, actual;
+ ASSERT_OK(db_->Get(read_opts, "key", &actual));
+ PutFixed64(&expected, 45); // 1+2+...+9
+ ASSERT_EQ(expected, actual);
+
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ expected.clear();
+ ASSERT_OK(db_->Get(read_opts, "key", &actual));
+ uint64_t tmp;
+ Slice tmp2(actual);
+ GetFixed64(&tmp2, &tmp);
+ PutFixed64(&expected, 30); // 6+7+8+9 (earlier operands covered by tombstone)
+ ASSERT_EQ(expected, actual);
+}
+
+TEST_F(DBRangeDelTest, PutDeleteRangeMergeFlush) {
+ // Test the sequence of operations: (1) Put, (2) DeleteRange, (3) Merge, (4)
+ // Flush. The `CompactionIterator` previously had a bug where we forgot to
+ // check for covering range tombstones when processing the (1) Put, causing
+ // it to reappear after the flush.
+ Options opts = CurrentOptions();
+ opts.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ Reopen(opts);
+
+ std::string val;
+ PutFixed64(&val, 1);
+ ASSERT_OK(db_->Put(WriteOptions(), "key", val));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ "key", "key_"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", val));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ ReadOptions read_opts;
+ std::string expected, actual;
+ ASSERT_OK(db_->Get(read_opts, "key", &actual));
+ PutFixed64(&expected, 1);
+ ASSERT_EQ(expected, actual);
+}
+
+// NumTableFilesAtLevel() is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(DBRangeDelTest, ObsoleteTombstoneCleanup) {
+ // During compaction to bottommost level, verify range tombstones older than
+ // the oldest snapshot are removed, while others are preserved.
+ Options opts = CurrentOptions();
+ opts.disable_auto_compactions = true;
+ opts.num_levels = 2;
+ opts.statistics = CreateDBStatistics();
+ Reopen(opts);
+
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1",
+ "dr10"); // obsolete after compaction
+ db_->Put(WriteOptions(), "key", "val");
+ db_->Flush(FlushOptions());
+ const Snapshot* snapshot = db_->GetSnapshot();
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr2",
+ "dr20"); // protected by snapshot
+ db_->Put(WriteOptions(), "key", "val");
+ db_->Flush(FlushOptions());
+
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ ASSERT_EQ(1, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE));
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, TableEvictedDuringScan) {
+ // The RangeDelAggregator holds pointers into range deletion blocks created by
+ // table readers. This test ensures the aggregator can still access those
+ // blocks even if it outlives the table readers that created them.
+ //
+ // DBIter always keeps readers open for L0 files. So, in order to test
+ // aggregator outliving reader, we need to have deletions in L1 files, which
+ // are opened/closed on-demand during the scan. This is accomplished by
+ // setting kNumRanges > level0_stop_writes_trigger, which prevents deletions
+ // from all lingering in L0 (there is at most one range deletion per L0 file).
+ //
+ // The first L1 file will contain a range deletion since its begin key is 0.
+ // SeekToFirst() references that table's reader and adds its range tombstone
+ // to the aggregator. Upon advancing beyond that table's key-range via Next(),
+ // the table reader will be unreferenced by the iterator. Since we manually
+ // call Evict() on all readers before the full scan, this unreference causes
+ // the reader's refcount to drop to zero and thus be destroyed.
+ //
+ // When it is destroyed, we do not remove its range deletions from the
+ // aggregator. So, subsequent calls to Next() must be able to use these
+ // deletions to decide whether a key is covered. This will work as long as
+ // the aggregator properly references the range deletion block.
+ const int kNum = 25, kRangeBegin = 0, kRangeEnd = 7, kNumRanges = 5;
+ Options opts = CurrentOptions();
+ opts.comparator = test::Uint64Comparator();
+ opts.level0_file_num_compaction_trigger = 4;
+ opts.level0_stop_writes_trigger = 4;
+ opts.memtable_factory.reset(new SpecialSkipListFactory(1));
+ opts.num_levels = 2;
+ BlockBasedTableOptions bbto;
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.block_cache = NewLRUCache(8 << 20);
+ opts.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(opts);
+
+ // Hold a snapshot so range deletions can't become obsolete during compaction
+ // to bottommost level (i.e., L1).
+ const Snapshot* snapshot = db_->GetSnapshot();
+ for (int i = 0; i < kNum; ++i) {
+ db_->Put(WriteOptions(), GetNumericStr(i), "val");
+ if (i > 0) {
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ if (i >= kNum / 2 && i < kNum / 2 + kNumRanges) {
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd));
+ }
+ }
+ // Must be > 1 so the first L1 file can be closed before scan finishes
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_GT(NumTableFilesAtLevel(1), 1);
+ std::vector<uint64_t> file_numbers = ListTableFiles(env_, dbname_);
+
+ ReadOptions read_opts;
+ auto* iter = db_->NewIterator(read_opts);
+ int expected = kRangeEnd;
+ iter->SeekToFirst();
+ for (auto file_number : file_numbers) {
+ // This puts table caches in the state of being externally referenced only
+ // so they are destroyed immediately upon iterator unreferencing.
+ TableCache::Evict(dbfull()->TEST_table_cache(), file_number);
+ }
+ for (; iter->Valid(); iter->Next()) {
+ ASSERT_EQ(GetNumericStr(expected), iter->key());
+ ++expected;
+ // Keep clearing block cache's LRU so range deletion block can be freed as
+ // soon as its refcount drops to zero.
+ bbto.block_cache->EraseUnRefEntries();
+ }
+ ASSERT_EQ(kNum, expected);
+ delete iter;
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, GetCoveredKeyFromMutableMemtable) {
+ do {
+ DestroyAndReopen(CurrentOptions());
+ db_->Put(WriteOptions(), "key", "val");
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+
+ ReadOptions read_opts;
+ std::string value;
+ ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound());
+ } while (ChangeOptions(kRangeDelSkipConfigs));
+}
+
+TEST_F(DBRangeDelTest, GetCoveredKeyFromImmutableMemtable) {
+ do {
+ Options opts = CurrentOptions();
+ opts.max_write_buffer_number = 3;
+ opts.min_write_buffer_number_to_merge = 2;
+ // SpecialSkipListFactory lets us specify the maximum number of elements the
+ // memtable can hold. It switches the active memtable to immutable (flush is
+ // prevented by the above options) upon inserting an element that would
+ // overflow the memtable.
+ opts.memtable_factory.reset(new SpecialSkipListFactory(1));
+ DestroyAndReopen(opts);
+
+ db_->Put(WriteOptions(), "key", "val");
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ db_->Put(WriteOptions(), "blah", "val");
+
+ ReadOptions read_opts;
+ std::string value;
+ ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound());
+ } while (ChangeOptions(kRangeDelSkipConfigs));
+}
+
+TEST_F(DBRangeDelTest, GetCoveredKeyFromSst) {
+ do {
+ DestroyAndReopen(CurrentOptions());
+ db_->Put(WriteOptions(), "key", "val");
+ // snapshot prevents key from being deleted during flush
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ ReadOptions read_opts;
+ std::string value;
+ ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound());
+ db_->ReleaseSnapshot(snapshot);
+ } while (ChangeOptions(kRangeDelSkipConfigs));
+}
+
+TEST_F(DBRangeDelTest, GetCoveredMergeOperandFromMemtable) {
+ const int kNumMergeOps = 10;
+ Options opts = CurrentOptions();
+ opts.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ Reopen(opts);
+
+ for (int i = 0; i < kNumMergeOps; ++i) {
+ std::string val;
+ PutFixed64(&val, i);
+ db_->Merge(WriteOptions(), "key", val);
+ if (i == kNumMergeOps / 2) {
+ // deletes [0, 5]
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key",
+ "key_");
+ }
+ }
+
+ ReadOptions read_opts;
+ std::string expected, actual;
+ ASSERT_OK(db_->Get(read_opts, "key", &actual));
+ PutFixed64(&expected, 30); // 6+7+8+9
+ ASSERT_EQ(expected, actual);
+
+ expected.clear();
+ read_opts.ignore_range_deletions = true;
+ ASSERT_OK(db_->Get(read_opts, "key", &actual));
+ PutFixed64(&expected, 45); // 0+1+2+...+9
+ ASSERT_EQ(expected, actual);
+}
+
+TEST_F(DBRangeDelTest, GetIgnoresRangeDeletions) {
+ Options opts = CurrentOptions();
+ opts.max_write_buffer_number = 4;
+ opts.min_write_buffer_number_to_merge = 3;
+ opts.memtable_factory.reset(new SpecialSkipListFactory(1));
+ Reopen(opts);
+
+ db_->Put(WriteOptions(), "sst_key", "val");
+ // snapshot prevents key from being deleted during flush
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ db_->Put(WriteOptions(), "imm_key", "val");
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ db_->Put(WriteOptions(), "mem_key", "val");
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+
+ ReadOptions read_opts;
+ read_opts.ignore_range_deletions = true;
+ for (std::string key : {"sst_key", "imm_key", "mem_key"}) {
+ std::string value;
+ ASSERT_OK(db_->Get(read_opts, key, &value));
+ }
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, IteratorRemovesCoveredKeys) {
+ const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25;
+ Options opts = CurrentOptions();
+ opts.comparator = test::Uint64Comparator();
+ opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
+ Reopen(opts);
+
+ // Write half of the keys before the tombstone and half after the tombstone.
+ // Only covered keys (i.e., within the range and older than the tombstone)
+ // should be deleted.
+ for (int i = 0; i < kNum; ++i) {
+ if (i == kNum / 2) {
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd));
+ }
+ db_->Put(WriteOptions(), GetNumericStr(i), "val");
+ }
+ ReadOptions read_opts;
+ auto* iter = db_->NewIterator(read_opts);
+
+ int expected = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_EQ(GetNumericStr(expected), iter->key());
+ if (expected == kRangeBegin - 1) {
+ expected = kNum / 2;
+ } else {
+ ++expected;
+ }
+ }
+ ASSERT_EQ(kNum, expected);
+ delete iter;
+}
+
+TEST_F(DBRangeDelTest, IteratorOverUserSnapshot) {
+ const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25;
+ Options opts = CurrentOptions();
+ opts.comparator = test::Uint64Comparator();
+ opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
+ Reopen(opts);
+
+ const Snapshot* snapshot = nullptr;
+ // Put a snapshot before the range tombstone, verify an iterator using that
+ // snapshot sees all inserted keys.
+ for (int i = 0; i < kNum; ++i) {
+ if (i == kNum / 2) {
+ snapshot = db_->GetSnapshot();
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd));
+ }
+ db_->Put(WriteOptions(), GetNumericStr(i), "val");
+ }
+ ReadOptions read_opts;
+ read_opts.snapshot = snapshot;
+ auto* iter = db_->NewIterator(read_opts);
+
+ int expected = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_EQ(GetNumericStr(expected), iter->key());
+ ++expected;
+ }
+ ASSERT_EQ(kNum / 2, expected);
+ delete iter;
+ db_->ReleaseSnapshot(snapshot);
+}
+
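+// Same layout as GetIgnoresRangeDeletions, but verifies that an iterator
+// created with ignore_range_deletions sees all three covered keys.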
+TEST_F(DBRangeDelTest, IteratorIgnoresRangeDeletions) {
+ Options opts = CurrentOptions();
+ opts.max_write_buffer_number = 4;
+ opts.min_write_buffer_number_to_merge = 3;
+ opts.memtable_factory.reset(new SpecialSkipListFactory(1));
+ Reopen(opts);
+
+ db_->Put(WriteOptions(), "sst_key", "val");
+ // snapshot prevents key from being deleted during flush
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ db_->Put(WriteOptions(), "imm_key", "val");
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ db_->Put(WriteOptions(), "mem_key", "val");
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+
+ ReadOptions read_opts;
+ read_opts.ignore_range_deletions = true;
+ auto* iter = db_->NewIterator(read_opts);
+ int i = 0;
+ std::string expected[] = {"imm_key", "mem_key", "sst_key"};
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++i) {
+ std::string key;
+ ASSERT_EQ(expected[i], iter->key());
+ }
+ ASSERT_EQ(3, i);
+ delete iter;
+ db_->ReleaseSnapshot(snapshot);
+}
+
+#ifndef ROCKSDB_UBSAN_RUN
+TEST_F(DBRangeDelTest, TailingIteratorRangeTombstoneUnsupported) {
+ db_->Put(WriteOptions(), "key", "val");
+ // snapshot prevents key from being deleted during flush
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+
+ // Each iteration checks that tailing iterators are unsupported while the
+ // range tombstone lives in the memtable, then L0, then L1.
+ for (int i = 0; i < 3; ++i) {
+ ReadOptions read_opts;
+ read_opts.tailing = true;
+ auto* iter = db_->NewIterator(read_opts);
+ if (i == 2) {
+ // For L1+, iterators over files are created on demand, so a seek is
+ // needed before the error status surfaces.
+ iter->SeekToFirst();
+ }
+ ASSERT_TRUE(iter->status().IsNotSupported());
+ delete iter;
+ if (i == 0) {
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ } else if (i == 1) {
+ MoveFilesToLevel(1);
+ }
+ }
+ db_->ReleaseSnapshot(snapshot);
+}
+
+#endif // !ROCKSDB_UBSAN_RUN
+
+TEST_F(DBRangeDelTest, SubcompactionHasEmptyDedicatedRangeDelFile) {
+ const int kNumFiles = 2, kNumKeysPerFile = 4;
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.level0_file_num_compaction_trigger = kNumFiles;
+ options.max_subcompactions = 2;
+ options.num_levels = 2;
+ options.target_file_size_base = 4096;
+ Reopen(options);
+
+ // Need an L1 file for a subcompaction to be triggered.
+ ASSERT_OK(
+ db_->Put(WriteOptions(), db_->DefaultColumnFamily(), Key(0), "val"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+
+ // put enough keys to fill up the first subcompaction, and later range-delete
+ // them so that the first subcompaction outputs no key-values. In that case
+ // it'll consider making an SST file dedicated to range deletions.
+ for (int i = 0; i < kNumKeysPerFile; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), Key(i),
+ std::string(1024, 'a')));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+ Key(kNumKeysPerFile)));
+
+ // the above range tombstone can be dropped, so that one alone won't cause a
+ // dedicated file to be opened. We can make one protected by snapshot that
+ // must be considered. Make its range outside the first subcompaction's range
+ // to exercise the tricky part of the code.
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(kNumKeysPerFile + 1),
+ Key(kNumKeysPerFile + 2)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+ db_->EnableAutoCompaction({db_->DefaultColumnFamily()});
+ dbfull()->TEST_WaitForCompact();
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, MemtableBloomFilter) {
+ // Regression test for #2743: range deletion tombstones in the memtable must
+ // still be applied even when Get() skips the point-key search due to a
+ // prefix bloom filter miss.
+ const int kMemtableSize = 1 << 20; // 1MB
+ const int kMemtablePrefixFilterSize = 1 << 13; // 8KB
+ const int kNumKeys = 1000;
+ const int kPrefixLen = 8;
+ Options options = CurrentOptions();
+ options.memtable_prefix_bloom_size_ratio =
+ static_cast<double>(kMemtablePrefixFilterSize) / kMemtableSize;
+ options.prefix_extractor.reset(
+ ROCKSDB_NAMESPACE::NewFixedPrefixTransform(kPrefixLen));
+ options.write_buffer_size = kMemtableSize;
+ Reopen(options);
+
+ for (int i = 0; i < kNumKeys; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ Flush();
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+ Key(kNumKeys)));
+ for (int i = 0; i < kNumKeys; ++i) {
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound());
+ }
+}
+
+TEST_F(DBRangeDelTest, CompactionTreatsSplitInputLevelDeletionAtomically) {
+ // This test originally verified that compaction treated files containing a
+ // split range deletion in the input level as an atomic unit. I.e.,
+ // compacting any input-level file(s) containing a portion of the range
+ // deletion causes all other input-level files containing portions of that
+ // same range deletion to be included in the compaction. Range deletion
+ // tombstones are now truncated to sstable boundaries which removed the need
+ // for that behavior (which could lead to excessively large
+ // compactions).
+ const int kNumFilesPerLevel = 4, kValueBytes = 4 << 10;
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = kNumFilesPerLevel;
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(2 /* num_entries_flush */));
+ options.target_file_size_base = kValueBytes;
+ // i == 0: CompactFiles
+ // i == 1: CompactRange
+ // i == 2: automatic compaction
+ for (int i = 0; i < 3; ++i) {
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), ""));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // snapshot protects range tombstone from dropping due to becoming obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot();
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+ Key(2 * kNumFilesPerLevel));
+
+ Random rnd(301);
+ std::string value = RandomString(&rnd, kValueBytes);
+ for (int j = 0; j < kNumFilesPerLevel; ++j) {
+ // give files overlapping key-ranges to prevent trivial move
+ ASSERT_OK(Put(Key(j), value));
+ ASSERT_OK(Put(Key(2 * kNumFilesPerLevel - 1 - j), value));
+ if (j > 0) {
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(j, NumTableFilesAtLevel(0));
+ }
+ }
+ // put extra key to trigger final flush
+ ASSERT_OK(Put("", ""));
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(kNumFilesPerLevel, NumTableFilesAtLevel(1));
+
+ ColumnFamilyMetaData meta;
+ db_->GetColumnFamilyMetaData(&meta);
+ if (i == 0) {
+ ASSERT_OK(db_->CompactFiles(
+ CompactionOptions(), {meta.levels[1].files[0].name}, 2 /* level */));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+ } else if (i == 1) {
+ auto begin_str = Key(0), end_str = Key(1);
+ Slice begin = begin_str, end = end_str;
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &begin, &end));
+ ASSERT_EQ(3, NumTableFilesAtLevel(1));
+ } else if (i == 2) {
+ ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(),
+ {{"max_bytes_for_level_base", "10000"}}));
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ }
+ ASSERT_GT(NumTableFilesAtLevel(2), 0);
+
+ db_->ReleaseSnapshot(snapshot);
+ }
+}
+
+TEST_F(DBRangeDelTest, RangeTombstoneEndKeyAsSstableUpperBound) {
+ // Test the handling of the range-tombstone end-key as the
+ // upper-bound for an sstable.
+
+ const int kNumFilesPerLevel = 2, kValueBytes = 4 << 10;
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = kNumFilesPerLevel;
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(2 /* num_entries_flush */));
+ options.target_file_size_base = kValueBytes;
+ options.disable_auto_compactions = true;
+
+ DestroyAndReopen(options);
+
+ // Create an initial sstable at L2:
+ // [key000000#1,1, key000000#1,1]
+ ASSERT_OK(Put(Key(0), ""));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // A snapshot protects the range tombstone from dropping due to
+ // becoming obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot();
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(0), Key(2 * kNumFilesPerLevel));
+
+ // Create 2 additional sstables in L0. Note that the first sstable
+ // contains the range tombstone.
+ // [key000000#3,1, key000004#72057594037927935,15]
+ // [key000001#5,1, key000002#6,1]
+ Random rnd(301);
+ std::string value = RandomString(&rnd, kValueBytes);
+ for (int j = 0; j < kNumFilesPerLevel; ++j) {
+ // Give files overlapping key-ranges to prevent a trivial move when we
+ // compact from L0 to L1.
+ ASSERT_OK(Put(Key(j), value));
+ ASSERT_OK(Put(Key(2 * kNumFilesPerLevel - 1 - j), value));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(j + 1, NumTableFilesAtLevel(0));
+ }
+ // Compact the 2 L0 sstables to L1, resulting in the following LSM. There
+ // are 2 sstables generated in L1 due to the target_file_size_base setting.
+ // L1:
+ // [key000000#3,1, key000002#72057594037927935,15]
+ // [key000002#6,1, key000004#72057594037927935,15]
+ // L2:
+ // [key000000#1,1, key000000#1,1]
+ MoveFilesToLevel(1);
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+ {
+ // Compact the second sstable in L1:
+ // L1:
+ // [key000000#3,1, key000002#72057594037927935,15]
+ // L2:
+ // [key000000#1,1, key000000#1,1]
+ // [key000002#6,1, key000004#72057594037927935,15]
+ //
+ // At the same time, verify the compaction does not cause the key at the
+ // endpoint (key000002#6,1) to disappear.
+ ASSERT_EQ(value, Get(Key(2)));
+ auto begin_str = Key(3);
+ const ROCKSDB_NAMESPACE::Slice begin = begin_str;
+ dbfull()->TEST_CompactRange(1, &begin, nullptr);
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ ASSERT_EQ(2, NumTableFilesAtLevel(2));
+ ASSERT_EQ(value, Get(Key(2)));
+ }
+
+ {
+ // Compact the first sstable in L1. This should be copacetic, but
+ // previously resulted in overlapping sstables in L2 due to
+ // mishandling of the range tombstone end-key when used as the
+ // largest key for an sstable. The resulting LSM structure should
+ // be:
+ //
+ // L2:
+ // [key000000#1,1, key000001#72057594037927935,15]
+ // [key000001#5,1, key000002#72057594037927935,15]
+ // [key000002#6,1, key000004#72057594037927935,15]
+ auto begin_str = Key(0);
+ const ROCKSDB_NAMESPACE::Slice begin = begin_str;
+ dbfull()->TEST_CompactRange(1, &begin, &begin);
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+ ASSERT_EQ(3, NumTableFilesAtLevel(2));
+ }
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, UnorderedTombstones) {
+ // Regression test for #2752. Range delete tombstones between
+ // different snapshot stripes are not stored in order, so the first
+ // tombstone of each snapshot stripe should be checked as a smallest
+ // candidate.
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+
+ auto cf = db_->DefaultColumnFamily();
+
+ ASSERT_OK(db_->Put(WriteOptions(), cf, "a", "a"));
+ ASSERT_OK(db_->Flush(FlushOptions(), cf));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), cf, "b", "c"));
+ // Hold a snapshot to separate these two delete ranges.
+ auto snapshot = db_->GetSnapshot();
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), cf, "a", "b"));
+ ASSERT_OK(db_->Flush(FlushOptions(), cf));
+ db_->ReleaseSnapshot(snapshot);
+
+ std::vector<std::vector<FileMetaData>> files;
+ dbfull()->TEST_GetFilesMetaData(cf, &files);
+ ASSERT_EQ(1, files[0].size());
+ ASSERT_EQ("a", files[0][0].smallest.user_key());
+ ASSERT_EQ("c", files[0][0].largest.user_key());
+
+ std::string v;
+ auto s = db_->Get(ReadOptions(), "a", &v);
+ ASSERT_TRUE(s.IsNotFound());
+}
+
+class MockMergeOperator : public MergeOperator {
+ // Mock non-associative operator. Non-associativity is expressed by lack of
+ // implementation for any `PartialMerge*` functions.
+ public:
+ bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override {
+ assert(merge_out != nullptr);
+ merge_out->new_value = merge_in.operand_list.back().ToString();
+ return true;
+ }
+
+ const char* Name() const override { return "MockMergeOperator"; }
+};
+
+TEST_F(DBRangeDelTest, KeyAtOverlappingEndpointReappears) {
+ // This test uses a non-associative merge operator since that is a convenient
+ // way to get compaction to write out files with overlapping user-keys at the
+ // endpoints. Note, however, overlapping endpoints can also occur with other
+ // value types (Put, etc.), assuming the right snapshots are present.
+ const int kFileBytes = 1 << 20;
+ const int kValueBytes = 1 << 10;
+ const int kNumFiles = 4;
+
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.merge_operator.reset(new MockMergeOperator());
+ options.target_file_size_base = kFileBytes;
+ Reopen(options);
+
+ // Push dummy data to L3 so that our actual test files on L0-L2 are not on
+ // the bottommost level; otherwise compaction may prevent us from creating
+ // overlapping user keys, since MergeHelper can combine all operands on the
+ // bottommost level.
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "dummy"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(3);
+
+ Random rnd(301);
+ const Snapshot* snapshot = nullptr;
+ for (int i = 0; i < kNumFiles; ++i) {
+ for (int j = 0; j < kFileBytes / kValueBytes; ++j) {
+ auto value = RandomString(&rnd, kValueBytes);
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", value));
+ }
+ if (i == kNumFiles - 1) {
+ // Take snapshot to prevent covered merge operands from being dropped by
+ // compaction.
+ snapshot = db_->GetSnapshot();
+ // The DeleteRange is the last write so all merge operands are covered.
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ "key", "key_"));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+ ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound());
+
+ dbfull()->TEST_CompactRange(0 /* level */, nullptr /* begin */,
+ nullptr /* end */, nullptr /* column_family */,
+ true /* disallow_trivial_move */);
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ // Now we have multiple files at L1 all containing a single user key, thus
+ // guaranteeing overlap in the file endpoints.
+ ASSERT_GT(NumTableFilesAtLevel(1), 1);
+
+ // Verify no merge operands reappeared after the compaction.
+ ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound());
+
+ // Compact and verify again. It's worthwhile because now the files have
+ // tighter endpoints, so we can verify that doesn't mess anything up.
+ dbfull()->TEST_CompactRange(1 /* level */, nullptr /* begin */,
+ nullptr /* end */, nullptr /* column_family */,
+ true /* disallow_trivial_move */);
+ ASSERT_GT(NumTableFilesAtLevel(2), 1);
+ ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound());
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, UntruncatedTombstoneDoesNotDeleteNewerKey) {
+ // Verify a key newer than a range tombstone cannot be deleted by being
+ // compacted to the bottom level (and thus having its seqnum zeroed) before
+ // the range tombstone. This used to happen when range tombstones were
+ // untruncated on reads such that they extended past their file boundaries.
+ //
+ // Test summary:
+ //
+ // - L1 is bottommost.
+ // - A couple snapshots are strategically taken to prevent seqnums from being
+ // zeroed, range tombstone from being dropped, merge operands from being
+ // dropped, and merge operands from being combined.
+ // - Left half of files in L1 all have same user key, ensuring their file
+ // boundaries overlap. In the past this would cause range tombstones to be
+ // untruncated.
+ // - Right half of L1 files all have different keys, ensuring no overlap.
+ // - A range tombstone spans all L1 keys, so it is stored in every L1 file.
+ // - Keys in the right side of the key-range are overwritten. These are
+ // compacted down to L1 after releasing snapshots such that their seqnums
+ // will be zeroed.
+ // - A full range scan is performed. If the tombstone in the left L1 files
+ // were untruncated, it would now cover keys newer than it (but with zeroed
+ // seqnums) in the right L1 files.
+ const int kFileBytes = 1 << 20;
+ const int kValueBytes = 1 << 10;
+ const int kNumFiles = 4;
+ const int kMaxKey = kNumFiles * kFileBytes / kValueBytes;
+ const int kKeysOverwritten = 10;
+
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.merge_operator.reset(new MockMergeOperator());
+ options.num_levels = 2;
+ options.target_file_size_base = kFileBytes;
+ Reopen(options);
+
+ Random rnd(301);
+ // - snapshots[0] prevents merge operands from being combined during
+ // compaction.
+ // - snapshots[1] prevents merge operands from being dropped due to the
+ // covering range tombstone.
+ const Snapshot* snapshots[] = {nullptr, nullptr};
+ for (int i = 0; i < kNumFiles; ++i) {
+ for (int j = 0; j < kFileBytes / kValueBytes; ++j) {
+ auto value = RandomString(&rnd, kValueBytes);
+ std::string key;
+ if (i < kNumFiles / 2) {
+ key = Key(0);
+ } else {
+ key = Key(1 + i * kFileBytes / kValueBytes + j);
+ }
+ ASSERT_OK(db_->Merge(WriteOptions(), key, value));
+ }
+ if (i == 0) {
+ snapshots[0] = db_->GetSnapshot();
+ }
+ if (i == kNumFiles - 1) {
+ snapshots[1] = db_->GetSnapshot();
+ // The DeleteRange is the last write so all merge operands are covered.
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(0), Key(kMaxKey + 1)));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+ ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+
+ auto get_key_count = [this]() -> int {
+ auto* iter = db_->NewIterator(ReadOptions());
+ iter->SeekToFirst();
+ int keys_found = 0;
+ for (; iter->Valid(); iter->Next()) {
+ ++keys_found;
+ }
+ delete iter;
+ return keys_found;
+ };
+
+ // All keys should be covered
+ ASSERT_EQ(0, get_key_count());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr /* begin_key */,
+ nullptr /* end_key */));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ // Roughly the left half of L1 files should have overlapping boundary keys,
+ // while the right half should not.
+ ASSERT_GE(NumTableFilesAtLevel(1), kNumFiles);
+
+ // Now overwrite a few keys that are in L1 files that definitely don't have
+ // overlapping boundary keys.
+ for (int i = kMaxKey; i > kMaxKey - kKeysOverwritten; --i) {
+ auto value = RandomString(&rnd, kValueBytes);
+ ASSERT_OK(db_->Merge(WriteOptions(), Key(i), value));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ // The overwritten keys are in L0 now, so clearly aren't covered by the range
+ // tombstone in L1.
+ ASSERT_EQ(kKeysOverwritten, get_key_count());
+
+ // Release snapshots so seqnums can be zeroed when L0->L1 happens.
+ db_->ReleaseSnapshot(snapshots[0]);
+ db_->ReleaseSnapshot(snapshots[1]);
+
+ auto begin_key_storage = Key(kMaxKey - kKeysOverwritten + 1);
+ auto end_key_storage = Key(kMaxKey);
+ Slice begin_key(begin_key_storage);
+ Slice end_key(end_key_storage);
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &begin_key, &end_key));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GE(NumTableFilesAtLevel(1), kNumFiles);
+
+ ASSERT_EQ(kKeysOverwritten, get_key_count());
+}
+
+TEST_F(DBRangeDelTest, DeletedMergeOperandReappearsIterPrev) {
+ // Exposes a bug where we were using
+ // `RangeDelPositioningMode::kBackwardTraversal` while scanning merge operands
+ // in the forward direction. Confusingly, this case happened during
+ // `DBIter::Prev`. It could cause assertion failure, or reappearing keys.
+ const int kFileBytes = 1 << 20;
+ const int kValueBytes = 1 << 10;
+ // Need multiple keys so we can get results when calling `Prev()` after
+ // `SeekToLast()`.
+ const int kNumKeys = 3;
+ const int kNumFiles = 4;
+
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.merge_operator.reset(new MockMergeOperator());
+ options.target_file_size_base = kFileBytes;
+ Reopen(options);
+
+ Random rnd(301);
+ const Snapshot* snapshot = nullptr;
+ for (int i = 0; i < kNumFiles; ++i) {
+ for (int j = 0; j < kFileBytes / kValueBytes; ++j) {
+ auto value = RandomString(&rnd, kValueBytes);
+ ASSERT_OK(db_->Merge(WriteOptions(), Key(j % kNumKeys), value));
+ if (i == 0 && j == kNumKeys) {
+ // Take snapshot to prevent covered merge operands from being dropped or
+ // merged by compaction.
+ snapshot = db_->GetSnapshot();
+ // Do a DeleteRange near the beginning so only the oldest merge operand
+ // for each key is covered. This ensures the sequence of events:
+ //
+ // - `DBIter::Prev()` is called
+ // - After several same versions of the same user key are encountered,
+ // it decides to seek using `DBIter::FindValueForCurrentKeyUsingSeek`.
+ // - Binary searches to the newest version of the key, which is in the
+ // leftmost file containing the user key.
+ // - Scans forwards to collect all merge operands. Eventually reaches
+ // the rightmost file containing the oldest merge operand, which
+ // should be covered by the `DeleteRange`. If `RangeDelAggregator`
+ // were not properly using `kForwardTraversal` here, that operand
+ // would reappear.
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(0), Key(kNumKeys + 1)));
+ }
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+ ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr /* begin_key */,
+ nullptr /* end_key */));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(1), 1);
+
+ auto* iter = db_->NewIterator(ReadOptions());
+ iter->SeekToLast();
+ int keys_found = 0;
+ for (; iter->Valid(); iter->Prev()) {
+ ++keys_found;
+ }
+ delete iter;
+ ASSERT_EQ(kNumKeys, keys_found);
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
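+// An iterator reading at a snapshot taken before the range deletion should
+// still see the covered key, even after the memtable is flushed.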
+TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeys) {
+ const int kFileBytes = 1 << 20;
+
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = kFileBytes;
+ Reopen(options);
+
+ ASSERT_OK(Put(Key(0), "a"));
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+ Key(10)));
+
+ db_->Flush(FlushOptions());
+
+ ReadOptions read_opts;
+ read_opts.snapshot = snapshot;
+ auto* iter = db_->NewIterator(read_opts);
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(Key(0), iter->key());
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ delete iter;
+ db_->ReleaseSnapshot(snapshot);
+}
+
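+// Same as above, except the flush is held back by a sync point so the covered
+// key and the range tombstone stay pinned in an immutable memtable.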
+TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeysInImmMemTables) {
+ const int kFileBytes = 1 << 20;
+
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = kFileBytes;
+ Reopen(options);
+
+ // block flush thread -> pin immutable memtables in memory
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"SnapshotPreventsDroppedKeysInImmMemTables:AfterNewIterator",
+ "DBImpl::BGWorkFlush"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(Key(0), "a"));
+ std::unique_ptr<const Snapshot, std::function<void(const Snapshot*)>>
+ snapshot(db_->GetSnapshot(),
+ [this](const Snapshot* s) { db_->ReleaseSnapshot(s); });
+
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+ Key(10)));
+
+ ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+
+ ReadOptions read_opts;
+ read_opts.snapshot = snapshot.get();
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+
+ TEST_SYNC_POINT("SnapshotPreventsDroppedKeysInImmMemTables:AfterNewIterator");
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(Key(0), iter->key());
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+}
+
+TEST_F(DBRangeDelTest, RangeTombstoneWrittenToMinimalSsts) {
+ // Adapted from
+ // https://github.com/cockroachdb/cockroach/blob/de8b3ea603dd1592d9dc26443c2cc92c356fbc2f/pkg/storage/engine/rocksdb_test.go#L1267-L1398.
+ // Regression test for issue where range tombstone was written to more files
+ // than necessary when it began exactly at the begin key in the next
+ // compaction output file.
+ const int kFileBytes = 1 << 20;
+ const int kValueBytes = 4 << 10;
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ // Have a bit of slack in the size limits but we enforce them more strictly
+ // when manually flushing/compacting.
+ options.max_compaction_bytes = 2 * kFileBytes;
+ options.target_file_size_base = 2 * kFileBytes;
+ options.write_buffer_size = 2 * kFileBytes;
+ Reopen(options);
+
+ Random rnd(301);
+ for (char first_char : {'a', 'b', 'c'}) {
+ for (int i = 0; i < kFileBytes / kValueBytes; ++i) {
+ std::string key(1, first_char);
+ key.append(Key(i));
+ std::string value = RandomString(&rnd, kValueBytes);
+ ASSERT_OK(Put(key, value));
+ }
+ db_->Flush(FlushOptions());
+ MoveFilesToLevel(2);
+ }
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(3, NumTableFilesAtLevel(2));
+
+ // Populate the memtable lightly while spanning the whole key-space. The
+ // setting of `max_compaction_bytes` will cause the L0->L1 to output multiple
+ // files to prevent a large L1->L2 compaction later.
+ ASSERT_OK(Put("a", "val"));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ "c" + Key(1), "d"));
+ // Our compaction output file cutting logic currently only considers point
+ // keys. So, in order for the range tombstone to have a chance at landing at
+ // the start of a new file, we need a point key at the range tombstone's
+ // start.
+ // TODO(ajkr): remove this `Put` after file cutting accounts for range
+ // tombstones (#3977).
+ ASSERT_OK(Put("c" + Key(1), "value"));
+ db_->Flush(FlushOptions());
+
+ // Ensure manual L0->L1 compaction cuts the outputs before the range tombstone
+ // and the range tombstone is only placed in the second SST.
+ std::string begin_key_storage("c" + Key(1));
+ Slice begin_key(begin_key_storage);
+ std::string end_key_storage("d");
+ Slice end_key(end_key_storage);
+ dbfull()->TEST_CompactRange(0 /* level */, &begin_key /* begin */,
+ &end_key /* end */, nullptr /* column_family */,
+ true /* disallow_trivial_move */);
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+ std::vector<LiveFileMetaData> all_metadata;
+ std::vector<LiveFileMetaData> l1_metadata;
+ db_->GetLiveFilesMetaData(&all_metadata);
+ for (const auto& metadata : all_metadata) {
+ if (metadata.level == 1) {
+ l1_metadata.push_back(metadata);
+ }
+ }
+ std::sort(l1_metadata.begin(), l1_metadata.end(),
+ [&](const LiveFileMetaData& a, const LiveFileMetaData& b) {
+ return options.comparator->Compare(a.smallestkey, b.smallestkey) <
+ 0;
+ });
+ ASSERT_EQ("a", l1_metadata[0].smallestkey);
+ ASSERT_EQ("a", l1_metadata[0].largestkey);
+ ASSERT_EQ("c" + Key(1), l1_metadata[1].smallestkey);
+ ASSERT_EQ("d", l1_metadata[1].largestkey);
+
+ TablePropertiesCollection all_table_props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&all_table_props));
+ int64_t num_range_deletions = 0;
+ for (const auto& name_and_table_props : all_table_props) {
+ const auto& name = name_and_table_props.first;
+ const auto& table_props = name_and_table_props.second;
+ // The range tombstone should only be output to the second L1 SST.
+ if (name.size() >= l1_metadata[1].name.size() &&
+ name.substr(name.size() - l1_metadata[1].name.size()).compare(l1_metadata[1].name) == 0) {
+ ASSERT_EQ(1, table_props->num_range_deletions);
+ ++num_range_deletions;
+ } else {
+ ASSERT_EQ(0, table_props->num_range_deletions);
+ }
+ }
+ ASSERT_EQ(1, num_range_deletions);
+}
+
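+// A range tombstone overlapping both L2 files should end up in a single L1
+// output file rather than being split to respect max_compaction_bytes.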
+TEST_F(DBRangeDelTest, OverlappedTombstones) {
+ const int kNumPerFile = 4, kNumFiles = 2;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.max_compaction_bytes = 9 * 1024;
+ DestroyAndReopen(options);
+ Random rnd(301);
+ for (int i = 0; i < kNumFiles; ++i) {
+ std::vector<std::string> values;
+ // Write 12K (4 values, each 3K)
+ for (int j = 0; j < kNumPerFile; j++) {
+ values.push_back(RandomString(&rnd, 3 << 10));
+ ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
+ }
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(2, NumTableFilesAtLevel(2));
+
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(1),
+ Key((kNumFiles)*kNumPerFile + 1)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */);
+
+ // The range tombstone is kept in a single L1 SST rather than being broken
+ // up, even though that may incur a large compaction with L2.
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ std::vector<std::vector<FileMetaData>> files;
+ dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+}
+
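+// Companion to OverlappedTombstones: when the same key range is overwritten
+// with point keys instead of a range tombstone, the L0->L1 compaction output
+// is split into three SSTs to limit the future compaction with L2.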
+TEST_F(DBRangeDelTest, OverlappedKeys) {
+ const int kNumPerFile = 4, kNumFiles = 2;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.max_compaction_bytes = 9 * 1024;
+ DestroyAndReopen(options);
+ Random rnd(301);
+ for (int i = 0; i < kNumFiles; ++i) {
+ std::vector<std::string> values;
+ // Write 12K (4 values, each 3K)
+ for (int j = 0; j < kNumPerFile; j++) {
+ values.push_back(RandomString(&rnd, 3 << 10));
+ ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
+ }
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(2, NumTableFilesAtLevel(2));
+
+ for (int i = 1; i < kNumFiles * kNumPerFile + 1; i++) {
+ ASSERT_OK(Put(Key(i), "0x123"));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ // The key range is broken up into three SSTs to avoid a large future
+ // compaction with the grandparent level (L2).
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */);
+ ASSERT_EQ(3, NumTableFilesAtLevel(1));
+
+ std::vector<std::vector<FileMetaData>> files;
+ dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+}
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_sst_test.cc b/src/rocksdb/db/db_sst_test.cc
new file mode 100644
index 000000000..e0ecfb641
--- /dev/null
+++ b/src/rocksdb/db/db_sst_test.cc
@@ -0,0 +1,1227 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/sst_file_manager.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBSSTTest : public DBTestBase {
+ public:
+ DBSSTTest() : DBTestBase("/db_sst_test") {}
+};
+
+#ifndef ROCKSDB_LITE
+// A class which remembers the name of each flushed file.
+class FlushedFileCollector : public EventListener {
+ public:
+ FlushedFileCollector() {}
+ ~FlushedFileCollector() override {}
+
+ void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+ std::lock_guard<std::mutex> lock(mutex_);
+ flushed_files_.push_back(info.file_path);
+ }
+
+ std::vector<std::string> GetFlushedFiles() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ std::vector<std::string> result;
+ for (auto fname : flushed_files_) {
+ result.push_back(fname);
+ }
+ return result;
+ }
+ void ClearFlushedFiles() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ flushed_files_.clear();
+ }
+
+ private:
+ std::vector<std::string> flushed_files_;
+ std::mutex mutex_;
+};
+#endif // ROCKSDB_LITE
+
+TEST_F(DBSSTTest, DontDeletePendingOutputs) {
+ Options options;
+ options.env = env_;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ // Every time we write to a table file, call FindObsoleteFiles() and
+ // PurgeObsoleteFiles() (FOF/POF) with a full DB scan. This makes sure our
+ // pending_outputs_ protection works correctly.
+ std::function<void()> purge_obsolete_files_function = [&]() {
+ JobContext job_context(0);
+ dbfull()->TEST_LockMutex();
+ dbfull()->FindObsoleteFiles(&job_context, true /*force*/);
+ dbfull()->TEST_UnlockMutex();
+ dbfull()->PurgeObsoleteFiles(job_context);
+ job_context.Clean();
+ };
+
+ env_->table_write_callback_ = &purge_obsolete_files_function;
+
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_OK(Put("a", "begin"));
+ ASSERT_OK(Put("z", "end"));
+ ASSERT_OK(Flush());
+ }
+
+ // If pending output guard does not work correctly, PurgeObsoleteFiles() will
+ // delete the file that Compaction is trying to create, causing this: error
+ // db/db_test.cc:975: IO error:
+ // /tmp/rocksdbtest-1552237650/db_test/000009.sst: No such file or directory
+ Compact("a", "b");
+}
+
+// 1. Create some SST files by inserting K-V pairs into the DB
+// 2. Close the DB and change the suffix from ".sst" to ".ldb" for every other
+//    SST file
+// 3. Reopen the DB and check that all keys can be read
+TEST_F(DBSSTTest, SSTsWithLdbSuffixHandling) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.num_levels = 4;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ int key_id = 0;
+ for (int i = 0; i < 10; ++i) {
+ GenerateNewFile(&rnd, &key_id, false);
+ }
+ Flush();
+ Close();
+ int const num_files = GetSstFileCount(dbname_);
+ ASSERT_GT(num_files, 0);
+
+ Reopen(options);
+ std::vector<std::string> values;
+ values.reserve(key_id);
+ for (int k = 0; k < key_id; ++k) {
+ values.push_back(Get(Key(k)));
+ }
+ Close();
+
+ std::vector<std::string> filenames;
+ GetSstFiles(env_, dbname_, &filenames);
+ int num_ldb_files = 0;
+ for (size_t i = 0; i < filenames.size(); ++i) {
+ if (i & 1) {
+ continue;
+ }
+ std::string const rdb_name = dbname_ + "/" + filenames[i];
+ std::string const ldb_name = Rocks2LevelTableFileName(rdb_name);
+ ASSERT_TRUE(env_->RenameFile(rdb_name, ldb_name).ok());
+ ++num_ldb_files;
+ }
+ ASSERT_GT(num_ldb_files, 0);
+ ASSERT_EQ(num_files, GetSstFileCount(dbname_));
+
+ Reopen(options);
+ for (int k = 0; k < key_id; ++k) {
+ ASSERT_EQ(values[k], Get(Key(k)));
+ }
+ Destroy(options);
+}
+
+// Check that we don't crash when opening DB with
+// DBOptions::skip_checking_sst_file_sizes_on_db_open = true.
+TEST_F(DBSSTTest, SkipCheckingSSTFileSizesOnDBOpen) {
+ ASSERT_OK(Put("pika", "choo"));
+ ASSERT_OK(Flush());
+
+ // Just open the DB with the option set to true and check that we don't crash.
+ Options options;
+ options.skip_checking_sst_file_sizes_on_db_open = true;
+ Reopen(options);
+
+ ASSERT_EQ("choo", Get("pika"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBSSTTest, DontDeleteMovedFile) {
+ // This test triggers move compaction and verifies that the file is not
+ // deleted when it's part of move compaction
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.max_bytes_for_level_base = 1024 * 1024; // 1 MB
+ options.level0_file_num_compaction_trigger =
+ 2; // trigger compaction when we have 2 files
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ // Create two 1MB sst files
+ for (int i = 0; i < 2; ++i) {
+ // Create 1MB sst file
+ for (int j = 0; j < 100; ++j) {
+ ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ // this should execute both L0->L1 and L1->(move)->L2 compactions
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+ // If the moved file is actually deleted (the move-safeguard in
+ // ~Version::Version() is not there), we get this failure:
+ // Corruption: Can't access /000009.sst
+ Reopen(options);
+}
+
+// This reproduces a bug where we don't delete a file because when it was
+// supposed to be deleted, it was blocked by pending_outputs
+// Consider:
+// 1. current file_number is 13
+// 2. compaction (1) starts, blocks deletion of all files starting with 13
+// (pending outputs)
+// 3. file 13 is created by compaction (2)
+// 4. file 13 is consumed by compaction (3) and file 15 is created. Since file
+// 13 has no references, it is put into VersionSet::obsolete_files_
+// 5. FindObsoleteFiles() gets file 13 from VersionSet::obsolete_files_. File 13
+// is deleted from obsolete_files_ set.
+// 6. PurgeObsoleteFiles() tries to delete file 13, but this file is blocked by
+// pending outputs since compaction (1) is still running. It is not deleted and
+// it is not present in obsolete_files_ anymore. Therefore, we never delete it.
+TEST_F(DBSSTTest, DeleteObsoleteFilesPendingOutputs) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 2 * 1024 * 1024; // 2 MB
+ options.max_bytes_for_level_base = 1024 * 1024; // 1 MB
+ options.level0_file_num_compaction_trigger =
+ 2; // trigger compaction when we have 2 files
+ options.max_background_flushes = 2;
+ options.max_background_compactions = 2;
+
+ OnFileDeletionListener* listener = new OnFileDeletionListener();
+ options.listeners.emplace_back(listener);
+
+ Reopen(options);
+
+ Random rnd(301);
+ // Create two 1MB sst files
+ for (int i = 0; i < 2; ++i) {
+ // Create 1MB sst file
+ for (int j = 0; j < 100; ++j) {
+ ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ // this should execute both L0->L1 and L1->(move)->L2 compactions
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+ test::SleepingBackgroundTask blocking_thread;
+ port::Mutex mutex_;
+ bool already_blocked(false);
+
+ // block the flush
+ std::function<void()> block_first_time = [&]() {
+ bool blocking = false;
+ {
+ MutexLock l(&mutex_);
+ if (!already_blocked) {
+ blocking = true;
+ already_blocked = true;
+ }
+ }
+ if (blocking) {
+ blocking_thread.DoSleep();
+ }
+ };
+ env_->table_write_callback_ = &block_first_time;
+ // Insert 2.5MB of data, which should trigger a flush because we exceed
+ // write_buffer_size. The flush is blocked by block_first_time, and the
+ // pending-outputs mechanism protects all files created after this point.
+ for (int j = 0; j < 256; ++j) {
+ ASSERT_OK(Put(Key(j), RandomString(&rnd, 10 * 1024)));
+ }
+ blocking_thread.WaitUntilSleeping();
+
+ ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr));
+
+ ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(metadata.size(), 1U);
+ auto file_on_L2 = metadata[0].name;
+ listener->SetExpectedFileName(dbname_ + file_on_L2);
+
+ ASSERT_OK(dbfull()->TEST_CompactRange(3, nullptr, nullptr, nullptr,
+ true /* disallow trivial move */));
+ ASSERT_EQ("0,0,0,0,1", FilesPerLevel(0));
+
+ // finish the flush!
+ blocking_thread.WakeUp();
+ blocking_thread.WaitUntilDone();
+ dbfull()->TEST_WaitForFlushMemTable();
+ // File just flushed is too big for L0 and L1 so gets moved to L2.
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("0,0,1,0,1", FilesPerLevel(0));
+
+ metadata.clear();
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(metadata.size(), 2U);
+
+ // This file should have been deleted during the last compaction
+ ASSERT_EQ(Status::NotFound(), env_->FileExists(dbname_ + file_on_L2));
+ listener->VerifyMatchedCount(1);
+}
+
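+// SstFileManager should track every SST file the DB creates and deletes, and
+// its total-size accounting should match the files on disk, including across
+// reopens and after swapping in a fresh SstFileManager.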
+TEST_F(DBSSTTest, DBWithSstFileManager) {
+ std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ int files_added = 0;
+ int files_deleted = 0;
+ int files_moved = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::OnAddFile", [&](void* /*arg*/) { files_added++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::OnDeleteFile",
+ [&](void* /*arg*/) { files_deleted++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::OnMoveFile", [&](void* /*arg*/) { files_moved++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.sst_file_manager = sst_file_manager;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 25; i++) {
+ GenerateNewRandomFile(&rnd);
+ ASSERT_OK(Flush());
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ // Verify that we are tracking all sst files in dbname_
+ ASSERT_EQ(sfm->GetTrackedFiles(), GetAllSSTFiles());
+ }
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ auto files_in_db = GetAllSSTFiles();
+ // Verify that we are tracking all sst files in dbname_
+ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+ // Verify the total files size
+ uint64_t total_files_size = 0;
+ for (auto& file_to_size : files_in_db) {
+ total_files_size += file_to_size.second;
+ }
+ ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+ // We flushed at least 25 files
+ ASSERT_GE(files_added, 25);
+ // Compaction must have deleted some files
+ ASSERT_GT(files_deleted, 0);
+ // No files were moved
+ ASSERT_EQ(files_moved, 0);
+
+ Close();
+ Reopen(options);
+ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+ ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+
+ // Verify that we track all the files again after the DB is closed and opened
+ Close();
+ sst_file_manager.reset(NewSstFileManager(env_));
+ options.sst_file_manager = sst_file_manager;
+ sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ Reopen(options);
+ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+ ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
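+// Deletions routed through SstFileManager should be rate limited: the time
+// spent emptying trash must roughly match the penalty computed from the
+// deleted file sizes and the configured delete rate.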
+TEST_F(DBSSTTest, RateLimitedDelete) {
+ Destroy(last_options_);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DBSSTTest::RateLimitedDelete:1",
+ "DeleteScheduler::BackgroundEmptyTrash"},
+ });
+
+ std::vector<uint64_t> penalties;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::BackgroundEmptyTrash:Wait",
+ [&](void* arg) { penalties.push_back(*(static_cast<uint64_t*>(arg))); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) {
+ // Turn timed wait into a simulated sleep
+ uint64_t* abs_time_us = static_cast<uint64_t*>(arg);
+ int64_t cur_time = 0;
+ env_->GetCurrentTime(&cur_time);
+ if (*abs_time_us > static_cast<uint64_t>(cur_time)) {
+ env_->addon_time_.fetch_add(*abs_time_us -
+ static_cast<uint64_t>(cur_time));
+ }
+
+ // Randomly sleep shortly
+ env_->addon_time_.fetch_add(
+ static_cast<uint64_t>(Random::GetTLSInstance()->Uniform(10)));
+
+ // Set wait until time to before current to force not to sleep.
+ int64_t real_cur_time = 0;
+ Env::Default()->GetCurrentTime(&real_cur_time);
+ *abs_time_us = static_cast<uint64_t>(real_cur_time);
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ env_->no_slowdown_ = true;
+ env_->time_elapse_only_sleep_ = true;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ // Disable stats dumping and persisting, which also use RepeatableThread,
+ // whose members include an InstrumentedCondVar. Otherwise the callback for
+ // InstrumentedCondVar::TimedWaitInternal could be triggered by the stats
+ // dumping and persisting threads and skew the time_spent_deleting
+ // measurement.
+ options.stats_dump_period_sec = 0;
+ options.stats_persist_period_sec = 0;
+ options.env = env_;
+
+ int64_t rate_bytes_per_sec = 1024 * 10; // 10 KB/sec
+ Status s;
+ options.sst_file_manager.reset(
+ NewSstFileManager(env_, nullptr, "", 0, false, &s, 0));
+ ASSERT_OK(s);
+ options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec);
+ auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+ sfm->delete_scheduler()->SetMaxTrashDBRatio(1.1);
+
+ WriteOptions wo;
+ wo.disableWAL = true;
+ ASSERT_OK(TryReopen(options));
+ // Create 4 files in L0
+ for (char v = 'a'; v <= 'd'; v++) {
+ ASSERT_OK(Put("Key2", DummyString(1024, v), wo));
+ ASSERT_OK(Put("Key3", DummyString(1024, v), wo));
+ ASSERT_OK(Put("Key4", DummyString(1024, v), wo));
+ ASSERT_OK(Put("Key1", DummyString(1024, v), wo));
+ ASSERT_OK(Put("Key4", DummyString(1024, v), wo));
+ ASSERT_OK(Flush());
+ }
+ // We created 4 sst files in L0
+ ASSERT_EQ("4", FilesPerLevel(0));
+
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+
+ // Compaction will move the 4 files in L0 to trash and create 1 L1 file
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+
+ uint64_t delete_start_time = env_->NowMicros();
+ // Hold BackgroundEmptyTrash
+ TEST_SYNC_POINT("DBSSTTest::RateLimitedDelete:1");
+ sfm->WaitForEmptyTrash();
+ uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time;
+
+ uint64_t total_files_size = 0;
+ uint64_t expected_penalty = 0;
+ ASSERT_EQ(penalties.size(), metadata.size());
+ for (size_t i = 0; i < metadata.size(); i++) {
+ total_files_size += metadata[i].size;
+ expected_penalty = ((total_files_size * 1000000) / rate_bytes_per_sec);
+ ASSERT_EQ(expected_penalty, penalties[i]);
+ }
+ ASSERT_GT(time_spent_deleting, expected_penalty * 0.9);
+ ASSERT_LT(time_spent_deleting, expected_penalty * 1.1);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
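+// WAL files deleted after compaction should also go through the rate-limited
+// delete scheduler; the test expects 8 delete penalties in total.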
+TEST_F(DBSSTTest, RateLimitedWALDelete) {
+ Destroy(last_options_);
+
+ std::vector<uint64_t> penalties;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::BackgroundEmptyTrash:Wait",
+ [&](void* arg) { penalties.push_back(*(static_cast<uint64_t*>(arg))); });
+
+ env_->no_slowdown_ = true;
+ env_->time_elapse_only_sleep_ = true;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.compression = kNoCompression;
+ options.env = env_;
+
+ int64_t rate_bytes_per_sec = 1024 * 10; // 10 KB/sec
+ Status s;
+ options.sst_file_manager.reset(
+ NewSstFileManager(env_, nullptr, "", 0, false, &s, 0));
+ ASSERT_OK(s);
+ options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec);
+ auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+ sfm->delete_scheduler()->SetMaxTrashDBRatio(3.1);
+
+ ASSERT_OK(TryReopen(options));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Create 4 files in L0
+ for (char v = 'a'; v <= 'd'; v++) {
+ ASSERT_OK(Put("Key2", DummyString(1024, v)));
+ ASSERT_OK(Put("Key3", DummyString(1024, v)));
+ ASSERT_OK(Put("Key4", DummyString(1024, v)));
+ ASSERT_OK(Put("Key1", DummyString(1024, v)));
+ ASSERT_OK(Put("Key4", DummyString(1024, v)));
+ ASSERT_OK(Flush());
+ }
+ // We created 4 sst files in L0
+ ASSERT_EQ("4", FilesPerLevel(0));
+
+ // Compaction will move the 4 files in L0 to trash and create 1 L1 file
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+
+ sfm->WaitForEmptyTrash();
+ ASSERT_EQ(penalties.size(), 8);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+class DBWALTestWithParam
+ : public DBSSTTest,
+ public testing::WithParamInterface<std::tuple<std::string, bool>> {
+ public:
+ DBWALTestWithParam() {
+ wal_dir_ = std::get<0>(GetParam());
+ wal_dir_same_as_dbname_ = std::get<1>(GetParam());
+ }
+
+ std::string wal_dir_;
+ bool wal_dir_same_as_dbname_;
+};
+
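+// Leftover *.log.trash files should be cleaned up when the DB is reopened,
+// whether the WAL directory is the DB directory itself or a separate one.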
+TEST_P(DBWALTestWithParam, WALTrashCleanupOnOpen) {
+ class MyEnv : public EnvWrapper {
+ public:
+ MyEnv(Env* t) : EnvWrapper(t), fake_log_delete(false) {}
+
+ Status DeleteFile(const std::string& fname) {
+ if (fname.find(".log.trash") != std::string::npos && fake_log_delete) {
+ return Status::OK();
+ }
+
+ return target()->DeleteFile(fname);
+ }
+
+ void set_fake_log_delete(bool fake) { fake_log_delete = fake; }
+
+ private:
+ bool fake_log_delete;
+ };
+
+ std::unique_ptr<MyEnv> env(new MyEnv(Env::Default()));
+ Destroy(last_options_);
+
+ env->set_fake_log_delete(true);
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.compression = kNoCompression;
+ options.env = env.get();
+ options.wal_dir = dbname_ + wal_dir_;
+
+ int64_t rate_bytes_per_sec = 1024 * 10; // 10 KB/sec
+ Status s;
+ options.sst_file_manager.reset(
+ NewSstFileManager(env_, nullptr, "", 0, false, &s, 0));
+ ASSERT_OK(s);
+ options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec);
+ auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+ sfm->delete_scheduler()->SetMaxTrashDBRatio(3.1);
+
+ ASSERT_OK(TryReopen(options));
+
+ // Create 4 files in L0
+ for (char v = 'a'; v <= 'd'; v++) {
+ ASSERT_OK(Put("Key2", DummyString(1024, v)));
+ ASSERT_OK(Put("Key3", DummyString(1024, v)));
+ ASSERT_OK(Put("Key4", DummyString(1024, v)));
+ ASSERT_OK(Put("Key1", DummyString(1024, v)));
+ ASSERT_OK(Put("Key4", DummyString(1024, v)));
+ ASSERT_OK(Flush());
+ }
+ // We created 4 sst files in L0
+ ASSERT_EQ("4", FilesPerLevel(0));
+
+ Close();
+
+ options.sst_file_manager.reset();
+ std::vector<std::string> filenames;
+ int trash_log_count = 0;
+ if (!wal_dir_same_as_dbname_) {
+ // Forcibly create some trash log files
+ std::unique_ptr<WritableFile> result;
+ env->NewWritableFile(options.wal_dir + "/1000.log.trash", &result,
+ EnvOptions());
+ result.reset();
+ }
+ env->GetChildren(options.wal_dir, &filenames);
+ for (const std::string& fname : filenames) {
+ if (fname.find(".log.trash") != std::string::npos) {
+ trash_log_count++;
+ }
+ }
+ ASSERT_GE(trash_log_count, 1);
+
+ env->set_fake_log_delete(false);
+ ASSERT_OK(TryReopen(options));
+
+ filenames.clear();
+ trash_log_count = 0;
+ env->GetChildren(options.wal_dir, &filenames);
+ for (const std::string& fname : filenames) {
+ if (fname.find(".log.trash") != std::string::npos) {
+ trash_log_count++;
+ }
+ }
+ ASSERT_EQ(trash_log_count, 0);
+ Close();
+}
+
+INSTANTIATE_TEST_CASE_P(DBWALTestWithParam, DBWALTestWithParam,
+ ::testing::Values(std::make_tuple("", true),
+ std::make_tuple("_wal_dir", false)));
+
+TEST_F(DBSSTTest, OpenDBWithExistingTrash) {
+ Options options = CurrentOptions();
+
+ options.sst_file_manager.reset(
+ NewSstFileManager(env_, nullptr, "", 1024 * 1024 /* 1 MB/sec */));
+ auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+
+ Destroy(last_options_);
+
+ // Add some trash files to the db directory so the DB can clean them up
+ env_->CreateDirIfMissing(dbname_);
+ ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "001.sst.trash"));
+ ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "002.sst.trash"));
+ ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "003.sst.trash"));
+
+ // Reopen the DB and verify that it deletes existing trash files
+ ASSERT_OK(TryReopen(options));
+ sfm->WaitForEmptyTrash();
+ ASSERT_NOK(env_->FileExists(dbname_ + "/" + "001.sst.trash"));
+ ASSERT_NOK(env_->FileExists(dbname_ + "/" + "002.sst.trash"));
+ ASSERT_NOK(env_->FileExists(dbname_ + "/" + "003.sst.trash"));
+}
+
+
+// Create a DB with 2 db_paths, and generate multiple files in the 2
+// db_paths using CompactRangeOptions, make sure that files that were
+// deleted from first db_path were deleted using DeleteScheduler and
+// files in the second path were not.
+TEST_F(DBSSTTest, DeleteSchedulerMultipleDBPaths) {
+ std::atomic<int> bg_delete_file(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void* /*arg*/) { bg_delete_file++; });
+ // The deletion scheduler sometimes skips marking a file as trash, based on a
+ // heuristic. In that case the deletion goes through the SyncPoint below.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteFile", [&](void* /*arg*/) { bg_delete_file++; });
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.db_paths.emplace_back(dbname_, 1024 * 100);
+ options.db_paths.emplace_back(dbname_ + "_2", 1024 * 100);
+ options.env = env_;
+
+ int64_t rate_bytes_per_sec = 1024 * 1024; // 1 MB/sec
+ Status s;
+ options.sst_file_manager.reset(
+ NewSstFileManager(env_, nullptr, "", rate_bytes_per_sec, false, &s,
+ /* max_trash_db_ratio= */ 1.1));
+
+ ASSERT_OK(s);
+ auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+
+ DestroyAndReopen(options);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions wo;
+ wo.disableWAL = true;
+
+ // Create 4 files in L0
+ for (int i = 0; i < 4; i++) {
+ ASSERT_OK(Put("Key" + ToString(i), DummyString(1024, 'A'), wo));
+ ASSERT_OK(Flush());
+ }
+ // We created 4 sst files in L0
+ ASSERT_EQ("4", FilesPerLevel(0));
+ // Compaction will delete files from L0 in first db path and generate a new
+ // file in L1 in second db path
+ CompactRangeOptions compact_options;
+ compact_options.target_path_id = 1;
+ Slice begin("Key0");
+ Slice end("Key3");
+ ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+
+ // Create 4 files in L0
+ for (int i = 4; i < 8; i++) {
+ ASSERT_OK(Put("Key" + ToString(i), DummyString(1024, 'B'), wo));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ("4,1", FilesPerLevel(0));
+
+ // Compaction will delete files from L0 in first db path and generate a new
+ // file in L1 in second db path
+ begin = "Key4";
+ end = "Key7";
+ ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
+ ASSERT_EQ("0,2", FilesPerLevel(0));
+
+ sfm->WaitForEmptyTrash();
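+  // Each of the two compactions above deleted its 4 L0 inputs from the first
+  // db_path, so 8 files should have gone through the delete scheduler.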
+ ASSERT_EQ(bg_delete_file, 8);
+
+  // Compaction will delete both L1 files and regenerate a file in L1 in the
+  // second db path. The deleted files should still be cleaned up via the
+  // delete scheduler.
+ compact_options.bottommost_level_compaction =
+ BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+
+ sfm->WaitForEmptyTrash();
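+  // The bottommost compaction removed the two existing L1 files, raising the
+  // delete scheduler's count to 10.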
+ ASSERT_EQ(bg_delete_file, 10);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, DestroyDBWithRateLimitedDelete) {
+ int bg_delete_file = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void* /*arg*/) { bg_delete_file++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Status s;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.env = env_;
+ options.sst_file_manager.reset(
+ NewSstFileManager(env_, nullptr, "", 0, false, &s, 0));
+ ASSERT_OK(s);
+ DestroyAndReopen(options);
+
+ // Create 4 files in L0
+ for (int i = 0; i < 4; i++) {
+ ASSERT_OK(Put("Key" + ToString(i), DummyString(1024, 'A')));
+ ASSERT_OK(Flush());
+ }
+ // We created 4 sst files in L0
+ ASSERT_EQ("4", FilesPerLevel(0));
+
+ // Close DB and destroy it using DeleteScheduler
+ Close();
+
+ int num_sst_files = 0;
+ int num_wal_files = 0;
+ std::vector<std::string> db_files;
+ env_->GetChildren(dbname_, &db_files);
+  for (const std::string& f : db_files) {
+ if (f.substr(f.find_last_of(".") + 1) == "sst") {
+ num_sst_files++;
+ } else if (f.substr(f.find_last_of(".") + 1) == "log") {
+ num_wal_files++;
+ }
+ }
+ ASSERT_GT(num_sst_files, 0);
+ ASSERT_GT(num_wal_files, 0);
+
+ auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+
+ sfm->SetDeleteRateBytesPerSecond(1024 * 1024);
+ sfm->delete_scheduler()->SetMaxTrashDBRatio(1.1);
+ ASSERT_OK(DestroyDB(dbname_, options));
+ sfm->WaitForEmptyTrash();
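+  // DestroyDB() should route every SST and WAL file through the rate-limited
+  // delete scheduler.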
+ ASSERT_EQ(bg_delete_file, num_sst_files + num_wal_files);
+}
+
+TEST_F(DBSSTTest, DBWithMaxSpaceAllowed) {
+ std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ Options options = CurrentOptions();
+ options.sst_file_manager = sst_file_manager;
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+
+ // Generate a file containing 100 keys.
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 50)));
+ }
+ ASSERT_OK(Flush());
+
+ uint64_t first_file_size = 0;
+ auto files_in_db = GetAllSSTFiles(&first_file_size);
+ ASSERT_EQ(sfm->GetTotalSize(), first_file_size);
+
+  // Set the maximum allowed space usage to just above the current total size
+  // so that the next flush exceeds it
+ sfm->SetMaxAllowedSpaceUsage(first_file_size + 1);
+
+ ASSERT_OK(Put("key1", "val1"));
+ // This flush will cause bg_error_ and will fail
+ ASSERT_NOK(Flush());
+}
+
+TEST_F(DBSSTTest, CancellingCompactionsWorks) {
+ std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ Options options = CurrentOptions();
+ options.sst_file_manager = sst_file_manager;
+ options.level0_file_num_compaction_trigger = 2;
+ options.statistics = CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ int completed_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction():CancelledCompaction", [&](void* /*arg*/) {
+ sfm->SetMaxAllowedSpaceUsage(0);
+ ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun",
+ [&](void* /*arg*/) { completed_compactions++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+
+ // Generate a file containing 10 keys.
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 50)));
+ }
+ ASSERT_OK(Flush());
+ uint64_t total_file_size = 0;
+ auto files_in_db = GetAllSSTFiles(&total_file_size);
+ // Set the maximum allowed space usage to the current total size
+ sfm->SetMaxAllowedSpaceUsage(2 * total_file_size + 1);
+
+ // Generate another file to trigger compaction.
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 50)));
+ }
+ ASSERT_OK(Flush());
+ dbfull()->TEST_WaitForCompact(true);
+
+ // Because we set a callback in CancelledCompaction, we actually
+ // let the compaction run
+ ASSERT_GT(completed_compactions, 0);
+ ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+ // Make sure the stat is bumped
+  ASSERT_GT(dbfull()->immutable_db_options().statistics.get()->getTickerCount(
+                COMPACTION_CANCELLED),
+            0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, CancellingManualCompactionsWorks) {
+ std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ Options options = CurrentOptions();
+ options.sst_file_manager = sst_file_manager;
+ options.statistics = CreateDBStatistics();
+
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+
+ // Generate a file containing 10 keys.
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 50)));
+ }
+ ASSERT_OK(Flush());
+ uint64_t total_file_size = 0;
+ auto files_in_db = GetAllSSTFiles(&total_file_size);
+ // Set the maximum allowed space usage to the current total size
+ sfm->SetMaxAllowedSpaceUsage(2 * total_file_size + 1);
+
+ // Generate another file to trigger compaction.
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 50)));
+ }
+ ASSERT_OK(Flush());
+
+ // OK, now trigger a manual compaction
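+  // It is expected to be cancelled by the SstFileManager because reserving
+  // space for its output would exceed the configured maximum allowed space.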
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ // Wait for manual compaction to get scheduled and finish
+ dbfull()->TEST_WaitForCompact(true);
+
+ ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+ // Make sure the stat is bumped
+ ASSERT_EQ(dbfull()->immutable_db_options().statistics.get()->getTickerCount(
+ COMPACTION_CANCELLED),
+ 1);
+
+ // Now make sure CompactFiles also gets cancelled
+ auto l0_files = collector->GetFlushedFiles();
+ dbfull()->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), l0_files, 0);
+
+ // Wait for manual compaction to get scheduled and finish
+ dbfull()->TEST_WaitForCompact(true);
+
+ ASSERT_EQ(dbfull()->immutable_db_options().statistics.get()->getTickerCount(
+ COMPACTION_CANCELLED),
+ 2);
+ ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+
+ // Now let the flush through and make sure GetCompactionsReservedSize
+ // returns to normal
+ sfm->SetMaxAllowedSpaceUsage(0);
+ int completed_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactFilesImpl:End", [&](void* /*arg*/) { completed_compactions++; });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ dbfull()->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), l0_files, 0);
+ dbfull()->TEST_WaitForCompact(true);
+
+ ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+ ASSERT_GT(completed_compactions, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, DBWithMaxSpaceAllowedRandomized) {
+ // This test will set a maximum allowed space for the DB, then it will
+ // keep filling the DB until the limit is reached and bg_error_ is set.
+ // When bg_error_ is set we will verify that the DB size is greater
+ // than the limit.
+
+ std::vector<int> max_space_limits_mbs = {1, 10};
+ std::atomic<bool> bg_error_set(false);
+
+ std::atomic<int> reached_max_space_on_flush(0);
+ std::atomic<int> reached_max_space_on_compaction(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached",
+ [&](void* arg) {
+ Status* bg_error = static_cast<Status*>(arg);
+ bg_error_set = true;
+ reached_max_space_on_flush++;
+ // clear error to ensure compaction callback is called
+ *bg_error = Status::OK();
+ });
+
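+  // When a compaction is about to be cancelled for lack of room, pretend there
+  // is enough room so it proceeds and later hits the MaxAllowedSpaceReached
+  // sync point while writing its output.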
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction():CancelledCompaction", [&](void* arg) {
+ bool* enough_room = static_cast<bool*>(arg);
+ *enough_room = true;
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::FinishCompactionOutputFile:MaxAllowedSpaceReached",
+ [&](void* /*arg*/) {
+ bg_error_set = true;
+ reached_max_space_on_compaction++;
+ });
+
+ for (auto limit_mb : max_space_limits_mbs) {
+ bg_error_set = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ Options options = CurrentOptions();
+ options.sst_file_manager = sst_file_manager;
+ options.write_buffer_size = 1024 * 512; // 512 Kb
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ sfm->SetMaxAllowedSpaceUsage(limit_mb * 1024 * 1024);
+
+ // It is easy to detect if the test is stuck in a loop. No need for
+ // complex termination logic.
+ while (true) {
+ auto s = Put(RandomString(&rnd, 10), RandomString(&rnd, 50));
+ if (!s.ok()) {
+ break;
+ }
+ }
+ ASSERT_TRUE(bg_error_set);
+ uint64_t total_sst_files_size = 0;
+ GetAllSSTFiles(&total_sst_files_size);
+ ASSERT_GE(total_sst_files_size, limit_mb * 1024 * 1024);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+
+ ASSERT_GT(reached_max_space_on_flush, 0);
+ ASSERT_GT(reached_max_space_on_compaction, 0);
+}
+
+TEST_F(DBSSTTest, OpenDBWithInfiniteMaxOpenFiles) {
+ // Open DB with infinite max open files
+ // - First iteration use 1 thread to open files
+ // - Second iteration use 5 threads to open files
+ for (int iter = 0; iter < 2; iter++) {
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = 100000;
+ options.disable_auto_compactions = true;
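+    // max_open_files == -1 keeps every table file open, so reopening the DB
+    // preloads a table reader for each file.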
+ options.max_open_files = -1;
+ if (iter == 0) {
+ options.max_file_opening_threads = 1;
+ } else {
+ options.max_file_opening_threads = 5;
+ }
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+    // Create 12 Files in L0 (then move them to L2)
+ for (int i = 0; i < 12; i++) {
+ std::string k = "L2_" + Key(i);
+ ASSERT_OK(Put(k, k + std::string(1000, 'a')));
+ ASSERT_OK(Flush());
+ }
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ db_->CompactRange(compact_options, nullptr, nullptr);
+
+ // Create 12 Files in L0
+ for (int i = 0; i < 12; i++) {
+ std::string k = "L0_" + Key(i);
+ ASSERT_OK(Put(k, k + std::string(1000, 'a')));
+ ASSERT_OK(Flush());
+ }
+ Close();
+
+ // Reopening the DB will load all existing files
+ Reopen(options);
+ ASSERT_EQ("12,0,12", FilesPerLevel(0));
+ std::vector<std::vector<FileMetaData>> files;
+ dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
+
+ for (const auto& level : files) {
+ for (const auto& file : level) {
+ ASSERT_TRUE(file.table_reader_handle != nullptr);
+ }
+ }
+
+ for (int i = 0; i < 12; i++) {
+ ASSERT_EQ(Get("L0_" + Key(i)), "L0_" + Key(i) + std::string(1000, 'a'));
+ ASSERT_EQ(Get("L2_" + Key(i)), "L2_" + Key(i) + std::string(1000, 'a'));
+ }
+ }
+}
+
+TEST_F(DBSSTTest, GetTotalSstFilesSize) {
+  // We don't propagate the oldest-key-time table property on compaction and
+  // just write 0 as the default value. This affects the exact table size,
+  // since we encode table properties as varint64. Force the time to be 0 to
+  // work around it. The workaround should be removed once we propagate the
+  // property on compaction.
+ std::unique_ptr<MockTimeEnv> mock_env(new MockTimeEnv(Env::Default()));
+ mock_env->set_current_time(0);
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.compression = kNoCompression;
+ options.env = mock_env.get();
+ DestroyAndReopen(options);
+ // Generate 5 files in L0
+ for (int i = 0; i < 5; i++) {
+ for (int j = 0; j < 10; j++) {
+ std::string val = "val_file_" + ToString(i);
+ ASSERT_OK(Put(Key(j), val));
+ }
+ Flush();
+ }
+ ASSERT_EQ("5", FilesPerLevel(0));
+
+ std::vector<LiveFileMetaData> live_files_meta;
+ dbfull()->GetLiveFilesMetaData(&live_files_meta);
+ ASSERT_EQ(live_files_meta.size(), 5);
+ uint64_t single_file_size = live_files_meta[0].size;
+
+ uint64_t live_sst_files_size = 0;
+ uint64_t total_sst_files_size = 0;
+ for (const auto& file_meta : live_files_meta) {
+ live_sst_files_size += file_meta.size;
+ }
+
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 5
+ // Total SST files = 5
+ ASSERT_EQ(live_sst_files_size, 5 * single_file_size);
+ ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
+
+ // hold current version
+ std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
+
+ // Compact 5 files into 1 file in L0
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+
+ live_files_meta.clear();
+ dbfull()->GetLiveFilesMetaData(&live_files_meta);
+ ASSERT_EQ(live_files_meta.size(), 1);
+
+ live_sst_files_size = 0;
+ total_sst_files_size = 0;
+ for (const auto& file_meta : live_files_meta) {
+ live_sst_files_size += file_meta.size;
+ }
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 1 (compacted file)
+ // Total SST files = 6 (5 original files + compacted file)
+ ASSERT_EQ(live_sst_files_size, 1 * single_file_size);
+ ASSERT_EQ(total_sst_files_size, 6 * single_file_size);
+
+ // hold current version
+ std::unique_ptr<Iterator> iter2(dbfull()->NewIterator(ReadOptions()));
+
+ // Delete all keys and compact, this will delete all live files
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Delete(Key(i)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("", FilesPerLevel(0));
+
+ live_files_meta.clear();
+ dbfull()->GetLiveFilesMetaData(&live_files_meta);
+ ASSERT_EQ(live_files_meta.size(), 0);
+
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 0
+ // Total SST files = 6 (5 original files + compacted file)
+ ASSERT_EQ(total_sst_files_size, 6 * single_file_size);
+
+ iter1.reset();
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 0
+ // Total SST files = 1 (compacted file)
+ ASSERT_EQ(total_sst_files_size, 1 * single_file_size);
+
+ iter2.reset();
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 0
+ // Total SST files = 0
+ ASSERT_EQ(total_sst_files_size, 0);
+
+ // Close db before mock_env destruct.
+ Close();
+}
+
+TEST_F(DBSSTTest, GetTotalSstFilesSizeVersionsFilesShared) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.compression = kNoCompression;
+ DestroyAndReopen(options);
+ // Generate 5 files in L0
+ for (int i = 0; i < 5; i++) {
+ ASSERT_OK(Put(Key(i), "val"));
+ Flush();
+ }
+ ASSERT_EQ("5", FilesPerLevel(0));
+
+ std::vector<LiveFileMetaData> live_files_meta;
+ dbfull()->GetLiveFilesMetaData(&live_files_meta);
+ ASSERT_EQ(live_files_meta.size(), 5);
+ uint64_t single_file_size = live_files_meta[0].size;
+
+ uint64_t live_sst_files_size = 0;
+ uint64_t total_sst_files_size = 0;
+ for (const auto& file_meta : live_files_meta) {
+ live_sst_files_size += file_meta.size;
+ }
+
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+
+ // Live SST files = 5
+ // Total SST files = 5
+ ASSERT_EQ(live_sst_files_size, 5 * single_file_size);
+ ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
+
+ // hold current version
+ std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
+
+ // Compaction will do trivial move from L0 to L1
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,5", FilesPerLevel(0));
+
+ live_files_meta.clear();
+ dbfull()->GetLiveFilesMetaData(&live_files_meta);
+ ASSERT_EQ(live_files_meta.size(), 5);
+
+ live_sst_files_size = 0;
+ total_sst_files_size = 0;
+ for (const auto& file_meta : live_files_meta) {
+ live_sst_files_size += file_meta.size;
+ }
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 5
+  // Total SST files = 5 (used in 2 versions)
+ ASSERT_EQ(live_sst_files_size, 5 * single_file_size);
+ ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
+
+ // hold current version
+ std::unique_ptr<Iterator> iter2(dbfull()->NewIterator(ReadOptions()));
+
+ // Delete all keys and compact, this will delete all live files
+ for (int i = 0; i < 5; i++) {
+ ASSERT_OK(Delete(Key(i)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("", FilesPerLevel(0));
+
+ live_files_meta.clear();
+ dbfull()->GetLiveFilesMetaData(&live_files_meta);
+ ASSERT_EQ(live_files_meta.size(), 0);
+
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 0
+  // Total SST files = 5 (used in 2 versions)
+ ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
+
+ iter1.reset();
+ iter2.reset();
+
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 0
+ // Total SST files = 0
+ ASSERT_EQ(total_sst_files_size, 0);
+}
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_statistics_test.cc b/src/rocksdb/db/db_statistics_test.cc
new file mode 100644
index 000000000..8fbbb96d5
--- /dev/null
+++ b/src/rocksdb/db/db_statistics_test.cc
@@ -0,0 +1,149 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <string>
+
+#include "db/db_test_util.h"
+#include "monitoring/thread_status_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/statistics.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBStatisticsTest : public DBTestBase {
+ public:
+ DBStatisticsTest() : DBTestBase("/db_statistics_test") {}
+};
+
+TEST_F(DBStatisticsTest, CompressionStatsTest) {
+ CompressionType type;
+
+ if (Snappy_Supported()) {
+ type = kSnappyCompression;
+ fprintf(stderr, "using snappy\n");
+ } else if (Zlib_Supported()) {
+ type = kZlibCompression;
+ fprintf(stderr, "using zlib\n");
+ } else if (BZip2_Supported()) {
+ type = kBZip2Compression;
+ fprintf(stderr, "using bzip2\n");
+ } else if (LZ4_Supported()) {
+ type = kLZ4Compression;
+ fprintf(stderr, "using lz4\n");
+ } else if (XPRESS_Supported()) {
+ type = kXpressCompression;
+ fprintf(stderr, "using xpress\n");
+ } else if (ZSTD_Supported()) {
+ type = kZSTD;
+ fprintf(stderr, "using ZSTD\n");
+ } else {
+ fprintf(stderr, "skipping test, compression disabled\n");
+ return;
+ }
+
+ Options options = CurrentOptions();
+ options.compression = type;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
+ DestroyAndReopen(options);
+
+ int kNumKeysWritten = 100000;
+
+ // Check that compressions occur and are counted when compression is turned on
+ Random rnd(301);
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ // compressible string
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 128) + std::string(128, 'a')));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_GT(options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED), 0);
+
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ auto r = Get(Key(i));
+ }
+ ASSERT_GT(options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED), 0);
+
+ options.compression = kNoCompression;
+ DestroyAndReopen(options);
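+  // The Statistics object in `options` survives the reopen, so record the
+  // current counters as a baseline and verify they do not change below.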
+ uint64_t currentCompressions =
+ options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
+ uint64_t currentDecompressions =
+ options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED);
+
+ // Check that compressions do not occur when turned off
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ // compressible string
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 128) + std::string(128, 'a')));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED)
+ - currentCompressions, 0);
+
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ auto r = Get(Key(i));
+ }
+ ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED)
+ - currentDecompressions, 0);
+}
+
+TEST_F(DBStatisticsTest, MutexWaitStatsDisabledByDefault) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ CreateAndReopenWithCF({"pikachu"}, options);
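+  // DB_MUTEX_WAIT_MICROS is only recorded at StatsLevel::kAll, which is not
+  // the default, so the ticker is expected to stay at zero.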
+ const uint64_t kMutexWaitDelay = 100;
+ ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT,
+ kMutexWaitDelay);
+ ASSERT_OK(Put("hello", "rocksdb"));
+ ASSERT_EQ(TestGetTickerCount(options, DB_MUTEX_WAIT_MICROS), 0);
+ ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0);
+}
+
+TEST_F(DBStatisticsTest, MutexWaitStats) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.statistics->set_stats_level(StatsLevel::kAll);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ const uint64_t kMutexWaitDelay = 100;
+ ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT,
+ kMutexWaitDelay);
+ ASSERT_OK(Put("hello", "rocksdb"));
+ ASSERT_GE(TestGetTickerCount(options, DB_MUTEX_WAIT_MICROS), kMutexWaitDelay);
+ ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0);
+}
+
+TEST_F(DBStatisticsTest, ResetStats) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+ for (int i = 0; i < 2; ++i) {
+ // pick arbitrary ticker and histogram. On first iteration they're zero
+ // because db is unused. On second iteration they're zero due to Reset().
+ ASSERT_EQ(0, TestGetTickerCount(options, NUMBER_KEYS_WRITTEN));
+ HistogramData histogram_data;
+ options.statistics->histogramData(DB_WRITE, &histogram_data);
+ ASSERT_EQ(0.0, histogram_data.max);
+
+ if (i == 0) {
+ // The Put() makes some of the ticker/histogram stats nonzero until we
+ // Reset().
+ ASSERT_OK(Put("hello", "rocksdb"));
+ ASSERT_EQ(1, TestGetTickerCount(options, NUMBER_KEYS_WRITTEN));
+ options.statistics->histogramData(DB_WRITE, &histogram_data);
+ ASSERT_GT(histogram_data.max, 0.0);
+ options.statistics->Reset();
+ }
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_table_properties_test.cc b/src/rocksdb/db/db_table_properties_test.cc
new file mode 100644
index 000000000..e3499df70
--- /dev/null
+++ b/src/rocksdb/db/db_table_properties_test.cc
@@ -0,0 +1,336 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <unordered_set>
+#include <vector>
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/table_properties_collectors.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+#ifndef ROCKSDB_LITE
+
+namespace ROCKSDB_NAMESPACE {
+
+// A helper function that ensures the table properties returned in
+// `GetPropertiesOfAllTablesTest` are correct.
+// It assumes that the number of entries differs for each of the tables.
+namespace {
+
+void VerifyTableProperties(DB* db, uint64_t expected_entries_size) {
+ TablePropertiesCollection props;
+ ASSERT_OK(db->GetPropertiesOfAllTables(&props));
+
+ ASSERT_EQ(4U, props.size());
+ std::unordered_set<uint64_t> unique_entries;
+
+  // Indirectly verify the per-table entry counts: they must all be distinct
+  // and sum to the expected total.
+ uint64_t sum = 0;
+ for (const auto& item : props) {
+ unique_entries.insert(item.second->num_entries);
+ sum += item.second->num_entries;
+ }
+
+ ASSERT_EQ(props.size(), unique_entries.size());
+ ASSERT_EQ(expected_entries_size, sum);
+}
+} // namespace
+
+class DBTablePropertiesTest : public DBTestBase {
+ public:
+ DBTablePropertiesTest() : DBTestBase("/db_table_properties_test") {}
+ TablePropertiesCollection TestGetPropertiesOfTablesInRange(
+ std::vector<Range> ranges, std::size_t* num_properties = nullptr,
+ std::size_t* num_files = nullptr);
+};
+
+TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) {
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 8;
+ Reopen(options);
+ // Create 4 tables
+ for (int table = 0; table < 4; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ db_->Put(WriteOptions(), ToString(table * 100 + i), "val");
+ }
+ db_->Flush(FlushOptions());
+ }
+
+ // 1. Read table properties directly from file
+ Reopen(options);
+ VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+
+  // 2. Put two tables into the table cache.
+  Reopen(options);
+  // Fetch a key from the 1st and 2nd table, which internally places those
+  // tables into the table cache.
+ for (int i = 0; i < 2; ++i) {
+ Get(ToString(i * 100 + 0));
+ }
+
+ VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+
+  // 3. Put all tables into the table cache.
+  Reopen(options);
+  // Fetch a key from every table, which internally places all four tables
+  // into the table cache.
+ for (int i = 0; i < 4; ++i) {
+ Get(ToString(i * 100 + 0));
+ }
+ VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+}
+
+TablePropertiesCollection
+DBTablePropertiesTest::TestGetPropertiesOfTablesInRange(
+ std::vector<Range> ranges, std::size_t* num_properties,
+ std::size_t* num_files) {
+
+  // Since we dereference the vector's first element, it cannot be empty;
+  // otherwise we would pass an address to some random memory
+ EXPECT_GT(ranges.size(), 0U);
+ // run the query
+ TablePropertiesCollection props;
+ EXPECT_OK(db_->GetPropertiesOfTablesInRange(
+ db_->DefaultColumnFamily(), &ranges[0], ranges.size(), &props));
+
+  // Make sure that we've received properties only for those files which fall
+  // within the requested ranges
+ std::vector<LiveFileMetaData> vmd;
+ db_->GetLiveFilesMetaData(&vmd);
+ for (auto& md : vmd) {
+ std::string fn = md.db_path + md.name;
+ bool in_range = false;
+ for (auto& r : ranges) {
+      // smallestkey <= limit && start <= largestkey
+ if (r.limit.compare(md.smallestkey) >= 0 &&
+ r.start.compare(md.largestkey) <= 0) {
+ in_range = true;
+ EXPECT_GT(props.count(fn), 0);
+ }
+ }
+ if (!in_range) {
+ EXPECT_EQ(props.count(fn), 0);
+ }
+ }
+
+ if (num_properties) {
+ *num_properties = props.size();
+ }
+
+ if (num_files) {
+ *num_files = vmd.size();
+ }
+ return props;
+}
+
+TEST_F(DBTablePropertiesTest, GetPropertiesOfTablesInRange) {
+  // Fixed random seed
+ Random rnd(301);
+
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = 4096;
+ options.max_write_buffer_number = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 2;
+ options.target_file_size_base = 2048;
+ options.max_bytes_for_level_base = 40960;
+ options.max_bytes_for_level_multiplier = 4;
+ options.hard_pending_compaction_bytes_limit = 16 * 1024;
+ options.num_levels = 8;
+ options.env = env_;
+
+ DestroyAndReopen(options);
+
+ // build a decent LSM
+ for (int i = 0; i < 10000; i++) {
+ ASSERT_OK(Put(test::RandomKey(&rnd, 5), RandomString(&rnd, 102)));
+ }
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ if (NumTableFilesAtLevel(0) == 0) {
+ ASSERT_OK(Put(test::RandomKey(&rnd, 5), RandomString(&rnd, 102)));
+ Flush();
+ }
+
+ db_->PauseBackgroundWork();
+
+ // Ensure that we have at least L0, L1 and L2
+ ASSERT_GT(NumTableFilesAtLevel(0), 0);
+ ASSERT_GT(NumTableFilesAtLevel(1), 0);
+ ASSERT_GT(NumTableFilesAtLevel(2), 0);
+
+ // Query the largest range
+ std::size_t num_properties, num_files;
+ TestGetPropertiesOfTablesInRange(
+ {Range(test::RandomKey(&rnd, 5, test::RandomKeyType::SMALLEST),
+ test::RandomKey(&rnd, 5, test::RandomKeyType::LARGEST))},
+ &num_properties, &num_files);
+ ASSERT_EQ(num_properties, num_files);
+
+ // Query the empty range
+ TestGetPropertiesOfTablesInRange(
+ {Range(test::RandomKey(&rnd, 5, test::RandomKeyType::LARGEST),
+ test::RandomKey(&rnd, 5, test::RandomKeyType::SMALLEST))},
+ &num_properties, &num_files);
+ ASSERT_GT(num_files, 0);
+ ASSERT_EQ(num_properties, 0);
+
+  // Query the middle range
+ TestGetPropertiesOfTablesInRange(
+ {Range(test::RandomKey(&rnd, 5, test::RandomKeyType::MIDDLE),
+ test::RandomKey(&rnd, 5, test::RandomKeyType::LARGEST))},
+ &num_properties, &num_files);
+ ASSERT_GT(num_files, 0);
+ ASSERT_GT(num_files, num_properties);
+ ASSERT_GT(num_properties, 0);
+
+ // Query a bunch of random ranges
+ for (int j = 0; j < 100; j++) {
+ // create a bunch of ranges
+ std::vector<std::string> random_keys;
+    // Random::Uniform() can return zero, and passing empty ranges would make
+    // TestGetPropertiesOfTablesInRange() dereference random memory through the
+    // empty ranges[0]. Keep the count greater than zero, and even, because the
+    // loop below pairs the keys up into ranges.
+ auto n = 2 * (rnd.Uniform(50) + 1);
+
+ for (uint32_t i = 0; i < n; ++i) {
+ random_keys.push_back(test::RandomKey(&rnd, 5));
+ }
+
+ ASSERT_GT(random_keys.size(), 0U);
+ ASSERT_EQ((random_keys.size() % 2), 0U);
+
+ std::vector<Range> ranges;
+ auto it = random_keys.begin();
+ while (it != random_keys.end()) {
+ ranges.push_back(Range(*it, *(it + 1)));
+ it += 2;
+ }
+
+ TestGetPropertiesOfTablesInRange(std::move(ranges));
+ }
+}
+
+TEST_F(DBTablePropertiesTest, GetColumnFamilyNameProperty) {
+ std::string kExtraCfName = "pikachu";
+ CreateAndReopenWithCF({kExtraCfName}, CurrentOptions());
+
+ // Create one table per CF, then verify it was created with the column family
+ // name property.
+ for (uint32_t cf = 0; cf < 2; ++cf) {
+ Put(cf, "key", "val");
+ Flush(cf);
+
+ TablePropertiesCollection fname_to_props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[cf], &fname_to_props));
+ ASSERT_EQ(1U, fname_to_props.size());
+
+ std::string expected_cf_name;
+ if (cf > 0) {
+ expected_cf_name = kExtraCfName;
+ } else {
+ expected_cf_name = kDefaultColumnFamilyName;
+ }
+ ASSERT_EQ(expected_cf_name,
+ fname_to_props.begin()->second->column_family_name);
+ ASSERT_EQ(cf, static_cast<uint32_t>(
+ fname_to_props.begin()->second->column_family_id));
+ }
+}
+
+TEST_F(DBTablePropertiesTest, DeletionTriggeredCompactionMarking) {
+ int kNumKeys = 1000;
+ int kWindowSize = 100;
+ int kNumDelsTrigger = 90;
+ std::shared_ptr<TablePropertiesCollectorFactory> compact_on_del =
+ NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger);
+
+ Options opts = CurrentOptions();
+ opts.table_properties_collector_factories.emplace_back(compact_on_del);
+ Reopen(opts);
+
+ // add an L1 file to prevent tombstones from dropping due to obsolescence
+ // during flush
+ Put(Key(0), "val");
+ Flush();
+ MoveFilesToLevel(1);
+
+ for (int i = 0; i < kNumKeys; ++i) {
+ if (i >= kNumKeys - kWindowSize &&
+ i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+ Delete(Key(i));
+ } else {
+ Put(Key(i), "val");
+ }
+ }
+ Flush();
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(1), 0);
+
+ // Change the window size and deletion trigger and ensure new values take
+ // effect
+ kWindowSize = 50;
+ kNumDelsTrigger = 40;
+ static_cast<CompactOnDeletionCollectorFactory*>
+ (compact_on_del.get())->SetWindowSize(kWindowSize);
+ static_cast<CompactOnDeletionCollectorFactory*>
+ (compact_on_del.get())->SetDeletionTrigger(kNumDelsTrigger);
+ for (int i = 0; i < kNumKeys; ++i) {
+ if (i >= kNumKeys - kWindowSize &&
+ i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+ Delete(Key(i));
+ } else {
+ Put(Key(i), "val");
+ }
+ }
+ Flush();
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(1), 0);
+
+ // Change the window size to disable delete triggered compaction
+ kWindowSize = 0;
+ static_cast<CompactOnDeletionCollectorFactory*>
+ (compact_on_del.get())->SetWindowSize(kWindowSize);
+ static_cast<CompactOnDeletionCollectorFactory*>
+ (compact_on_del.get())->SetDeletionTrigger(kNumDelsTrigger);
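+  // With a window size of zero the loop below never deletes any keys, so no
+  // deletion-triggered compaction is marked and the flushed file stays in L0.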
+ for (int i = 0; i < kNumKeys; ++i) {
+ if (i >= kNumKeys - kWindowSize &&
+ i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+ Delete(Key(i));
+ } else {
+ Put(Key(i), "val");
+ }
+ }
+ Flush();
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_tailing_iter_test.cc b/src/rocksdb/db/db_tailing_iter_test.cc
new file mode 100644
index 000000000..39988638b
--- /dev/null
+++ b/src/rocksdb/db/db_tailing_iter_test.cc
@@ -0,0 +1,547 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// Introduction of SyncPoint effectively disabled building and running this
+// test in Release builds, which is a pity because it is a good test.
+#if !defined(ROCKSDB_LITE)
+
+#include "db/db_test_util.h"
+#include "db/forward_iterator.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBTestTailingIterator : public DBTestBase {
+ public:
+ DBTestTailingIterator() : DBTestBase("/db_tailing_iterator_test") {}
+};
+
+TEST_F(DBTestTailingIterator, TailingIteratorSingle) {
+ ReadOptions read_options;
+ read_options.tailing = true;
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ iter->SeekToFirst();
+ ASSERT_TRUE(!iter->Valid());
+
+ // add a record and check that iter can see it
+ ASSERT_OK(db_->Put(WriteOptions(), "mirko", "fodor"));
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "mirko");
+
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+}
+
+TEST_F(DBTestTailingIterator, TailingIteratorKeepAdding) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ReadOptions read_options;
+ read_options.tailing = true;
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+ std::string value(1024, 'a');
+
+ const int num_records = 10000;
+ for (int i = 0; i < num_records; ++i) {
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%016d", i);
+
+ Slice key(buf, 16);
+ ASSERT_OK(Put(1, key, value));
+
+ iter->Seek(key);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(key), 0);
+ }
+}
+
+TEST_F(DBTestTailingIterator, TailingIteratorSeekToNext) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ReadOptions read_options;
+ read_options.tailing = true;
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+ std::unique_ptr<Iterator> itern(db_->NewIterator(read_options, handles_[1]));
+ std::string value(1024, 'a');
+
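+  // Each iteration writes key i*5, seeks `iter` to a slightly smaller key and
+  // advances `itern`; both tailing iterators are expected to land on the key
+  // that was just written.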
+ const int num_records = 1000;
+ for (int i = 1; i < num_records; ++i) {
+ char buf1[32];
+ char buf2[32];
+ snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+
+ Slice key(buf1, 20);
+ ASSERT_OK(Put(1, key, value));
+
+ if (i % 100 == 99) {
+ ASSERT_OK(Flush(1));
+ }
+
+ snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+ Slice target(buf2, 20);
+ iter->Seek(target);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(key), 0);
+ if (i == 1) {
+ itern->SeekToFirst();
+ } else {
+ itern->Next();
+ }
+ ASSERT_TRUE(itern->Valid());
+ ASSERT_EQ(itern->key().compare(key), 0);
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ for (int i = 2 * num_records; i > 0; --i) {
+ char buf1[32];
+ char buf2[32];
+ snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+
+ Slice key(buf1, 20);
+ ASSERT_OK(Put(1, key, value));
+
+ if (i % 100 == 99) {
+ ASSERT_OK(Flush(1));
+ }
+
+ snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+ Slice target(buf2, 20);
+ iter->Seek(target);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(key), 0);
+ }
+}
+
+TEST_F(DBTestTailingIterator, TailingIteratorTrimSeekToNext) {
+ const uint64_t k150KB = 150 * 1024;
+ Options options;
+ options.write_buffer_size = k150KB;
+ options.max_write_buffer_number = 3;
+ options.min_write_buffer_number_to_merge = 2;
+ options.env = env_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ReadOptions read_options;
+ read_options.tailing = true;
+ int num_iters, deleted_iters;
+
+ char bufe[32];
+ snprintf(bufe, sizeof(bufe), "00b0%016d", 0);
+ Slice keyu(bufe, 20);
+ read_options.iterate_upper_bound = &keyu;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+ std::unique_ptr<Iterator> itern(db_->NewIterator(read_options, handles_[1]));
+ std::unique_ptr<Iterator> iterh(db_->NewIterator(read_options, handles_[1]));
+ std::string value(1024, 'a');
+ bool file_iters_deleted = false;
+ bool file_iters_renewed_null = false;
+ bool file_iters_renewed_copy = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ForwardIterator::SeekInternal:Return", [&](void* arg) {
+ ForwardIterator* fiter = reinterpret_cast<ForwardIterator*>(arg);
+ ASSERT_TRUE(!file_iters_deleted ||
+ fiter->TEST_CheckDeletedIters(&deleted_iters, &num_iters));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ForwardIterator::Next:Return", [&](void* arg) {
+ ForwardIterator* fiter = reinterpret_cast<ForwardIterator*>(arg);
+ ASSERT_TRUE(!file_iters_deleted ||
+ fiter->TEST_CheckDeletedIters(&deleted_iters, &num_iters));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ForwardIterator::RenewIterators:Null",
+ [&](void* /*arg*/) { file_iters_renewed_null = true; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ForwardIterator::RenewIterators:Copy",
+ [&](void* /*arg*/) { file_iters_renewed_copy = true; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ const int num_records = 1000;
+ for (int i = 1; i < num_records; ++i) {
+ char buf1[32];
+ char buf2[32];
+ char buf3[32];
+ char buf4[32];
+ snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+ snprintf(buf3, sizeof(buf3), "00b0%016d", i * 5);
+
+ Slice key(buf1, 20);
+ ASSERT_OK(Put(1, key, value));
+ Slice keyn(buf3, 20);
+ ASSERT_OK(Put(1, keyn, value));
+
+ if (i % 100 == 99) {
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+ if (i == 299) {
+ file_iters_deleted = true;
+ }
+ snprintf(buf4, sizeof(buf4), "00a0%016d", i * 5 / 2);
+ Slice target(buf4, 20);
+ iterh->Seek(target);
+      ASSERT_TRUE(iterh->Valid());
+ for (int j = (i + 1) * 5 / 2; j < i * 5; j += 5) {
+ iterh->Next();
+ ASSERT_TRUE(iterh->Valid());
+ }
+ if (i == 299) {
+ file_iters_deleted = false;
+ }
+ }
+
+ file_iters_deleted = true;
+ snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+ Slice target(buf2, 20);
+ iter->Seek(target);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(key), 0);
+ ASSERT_LE(num_iters, 1);
+ if (i == 1) {
+ itern->SeekToFirst();
+ } else {
+ itern->Next();
+ }
+ ASSERT_TRUE(itern->Valid());
+ ASSERT_EQ(itern->key().compare(key), 0);
+ ASSERT_LE(num_iters, 1);
+ file_iters_deleted = false;
+ }
+ ASSERT_TRUE(file_iters_renewed_null);
+ ASSERT_TRUE(file_iters_renewed_copy);
+ iter = nullptr;
+ itern = nullptr;
+ iterh = nullptr;
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ table_options.block_cache_compressed = nullptr;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ read_options.read_tier = kBlockCacheTier;
+ std::unique_ptr<Iterator> iteri(db_->NewIterator(read_options, handles_[1]));
+ char buf5[32];
+ snprintf(buf5, sizeof(buf5), "00a0%016d", (num_records / 2) * 5 - 2);
+ Slice target1(buf5, 20);
+ iteri->Seek(target1);
+ ASSERT_TRUE(iteri->status().IsIncomplete());
+ iteri = nullptr;
+
+ read_options.read_tier = kReadAllTier;
+ options.table_factory.reset(NewBlockBasedTableFactory());
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ iter.reset(db_->NewIterator(read_options, handles_[1]));
+ for (int i = 2 * num_records; i > 0; --i) {
+ char buf1[32];
+ char buf2[32];
+ snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+
+ Slice key(buf1, 20);
+ ASSERT_OK(Put(1, key, value));
+
+ if (i % 100 == 99) {
+ ASSERT_OK(Flush(1));
+ }
+
+ snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+ Slice target(buf2, 20);
+ iter->Seek(target);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(key), 0);
+ }
+}
+
+TEST_F(DBTestTailingIterator, TailingIteratorDeletes) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ReadOptions read_options;
+ read_options.tailing = true;
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+
+ // write a single record, read it using the iterator, then delete it
+ ASSERT_OK(Put(1, "0test", "test"));
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "0test");
+ ASSERT_OK(Delete(1, "0test"));
+
+ // write many more records
+ const int num_records = 10000;
+ std::string value(1024, 'A');
+
+ for (int i = 0; i < num_records; ++i) {
+ char buf[32];
+ snprintf(buf, sizeof(buf), "1%015d", i);
+
+ Slice key(buf, 16);
+ ASSERT_OK(Put(1, key, value));
+ }
+
+ // force a flush to make sure that no records are read from memtable
+ ASSERT_OK(Flush(1));
+
+ // skip "0test"
+ iter->Next();
+
+ // make sure we can read all new records using the existing iterator
+ int count = 0;
+ for (; iter->Valid(); iter->Next(), ++count) ;
+
+ ASSERT_EQ(count, num_records);
+}
+
+TEST_F(DBTestTailingIterator, TailingIteratorPrefixSeek) {
+ ReadOptions read_options;
+ read_options.tailing = true;
+
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(2));
+ options.memtable_factory.reset(NewHashSkipListRepFactory(16));
+ options.allow_concurrent_memtable_write = false;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+ ASSERT_OK(Put(1, "0101", "test"));
+
+ ASSERT_OK(Flush(1));
+
+ ASSERT_OK(Put(1, "0202", "test"));
+
+ // Seek(0102) shouldn't find any records since 0202 has a different prefix
+ iter->Seek("0102");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("0202");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "0202");
+
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+}
+
+TEST_F(DBTestTailingIterator, TailingIteratorIncomplete) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ReadOptions read_options;
+ read_options.tailing = true;
+ read_options.read_tier = kBlockCacheTier;
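+  // With kBlockCacheTier reads must be served from the memtable or the block
+  // cache; anything that would require I/O yields Status::Incomplete.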
+
+ std::string key("key");
+ std::string value("value");
+
+ ASSERT_OK(db_->Put(WriteOptions(), key, value));
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ iter->SeekToFirst();
+ // we either see the entry or it's not in cache
+ ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ iter->SeekToFirst();
+ // should still be true after compaction
+ ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete());
+}
+
+TEST_F(DBTestTailingIterator, TailingIteratorSeekToSame) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.write_buffer_size = 1000;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ReadOptions read_options;
+ read_options.tailing = true;
+
+ const int NROWS = 10000;
+ // Write rows with keys 00000, 00002, 00004 etc.
+ for (int i = 0; i < NROWS; ++i) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%05d", 2*i);
+ std::string key(buf);
+ std::string value("value");
+ ASSERT_OK(db_->Put(WriteOptions(), key, value));
+ }
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ // Seek to 00001. We expect to find 00002.
+ std::string start_key = "00001";
+ iter->Seek(start_key);
+ ASSERT_TRUE(iter->Valid());
+
+ std::string found = iter->key().ToString();
+ ASSERT_EQ("00002", found);
+
+ // Now seek to the same key. The iterator should remain in the same
+ // position.
+ iter->Seek(found);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(found, iter->key().ToString());
+}
+
+// Sets iterate_upper_bound and verifies that ForwardIterator doesn't call
+// Seek() on immutable iterators when target key is >= prev_key and all
+// iterators, including the memtable iterator, are over the upper bound.
+TEST_F(DBTestTailingIterator, TailingIteratorUpperBound) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+ const Slice upper_bound("20", 3);
+ ReadOptions read_options;
+ read_options.tailing = true;
+ read_options.iterate_upper_bound = &upper_bound;
+
+ ASSERT_OK(Put(1, "11", "11"));
+ ASSERT_OK(Put(1, "12", "12"));
+ ASSERT_OK(Put(1, "22", "22"));
+ ASSERT_OK(Flush(1)); // flush all those keys to an immutable SST file
+
+ // Add another key to the memtable.
+ ASSERT_OK(Put(1, "21", "21"));
+
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_options, handles_[1]));
+ it->Seek("12");
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("12", it->key().ToString());
+
+ it->Next();
+ // Not valid since "21" is over the upper bound.
+ ASSERT_FALSE(it->Valid());
+
+ // This keeps track of the number of times NeedToSeekImmutable() was true.
+ int immutable_seeks = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ForwardIterator::SeekInternal:Immutable",
+ [&](void* /*arg*/) { ++immutable_seeks; });
+
+ // Seek to 13. This should not require any immutable seeks.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ it->Seek("13");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ ASSERT_FALSE(it->Valid());
+ ASSERT_EQ(0, immutable_seeks);
+}
+
+TEST_F(DBTestTailingIterator, TailingIteratorGap) {
+ // level 1: [20, 25] [35, 40]
+ // level 2: [10 - 15] [45 - 50]
+ // level 3: [20, 30, 40]
+  // Previously there was a bug in the tailing iterator: if there was a gap in
+  // a lower level, a key within the range between the largest key of file n
+  // and the smallest key of file n+1 would be skipped if both files fit in
+  // that gap. In this example, 25 < key < 35.
+ // https://github.com/facebook/rocksdb/issues/1372
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+ ReadOptions read_options;
+ read_options.tailing = true;
+
+ ASSERT_OK(Put(1, "20", "20"));
+ ASSERT_OK(Put(1, "30", "30"));
+ ASSERT_OK(Put(1, "40", "40"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(3, 1);
+
+ ASSERT_OK(Put(1, "10", "10"));
+ ASSERT_OK(Put(1, "15", "15"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "45", "45"));
+ ASSERT_OK(Put(1, "50", "50"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(2, 1);
+
+ ASSERT_OK(Put(1, "20", "20"));
+ ASSERT_OK(Put(1, "25", "25"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "35", "35"));
+ ASSERT_OK(Put(1, "40", "40"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(1, 1);
+
+ ColumnFamilyMetaData meta;
+ db_->GetColumnFamilyMetaData(handles_[1], &meta);
+
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_options, handles_[1]));
+ it->Seek("30");
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("30", it->key().ToString());
+
+ it->Next();
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("35", it->key().ToString());
+
+ it->Next();
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("40", it->key().ToString());
+}
+
+TEST_F(DBTestTailingIterator, SeekWithUpperBoundBug) {
+ ReadOptions read_options;
+ read_options.tailing = true;
+ const Slice upper_bound("cc", 3);
+ read_options.iterate_upper_bound = &upper_bound;
+
+
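+  // The upper bound "cc" excludes the second L0 file (key "zz"); the seek to
+  // "aa" must still find it despite the out-of-bound file.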
+ // 1st L0 file
+ ASSERT_OK(db_->Put(WriteOptions(), "aa", "SEEN"));
+ ASSERT_OK(Flush());
+
+ // 2nd L0 file
+ ASSERT_OK(db_->Put(WriteOptions(), "zz", "NOT-SEEN"));
+ ASSERT_OK(Flush());
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+
+ iter->Seek("aa");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "aa");
+}
+
+TEST_F(DBTestTailingIterator, SeekToFirstWithUpperBoundBug) {
+ ReadOptions read_options;
+ read_options.tailing = true;
+ const Slice upper_bound("cc", 3);
+ read_options.iterate_upper_bound = &upper_bound;
+
+
+ // 1st L0 file
+ ASSERT_OK(db_->Put(WriteOptions(), "aa", "SEEN"));
+ ASSERT_OK(Flush());
+
+ // 2nd L0 file
+ ASSERT_OK(db_->Put(WriteOptions(), "zz", "NOT-SEEN"));
+ ASSERT_OK(Flush());
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "aa");
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "aa");
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !defined(ROCKSDB_LITE)
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+#else
+ (void) argc;
+ (void) argv;
+ return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_test.cc b/src/rocksdb/db/db_test.cc
new file mode 100644
index 000000000..60b4d60f4
--- /dev/null
+++ b/src/rocksdb/db/db_test.cc
@@ -0,0 +1,6605 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// Introduction of SyncPoint effectively disabled building and running this
+// test in Release builds, which is a pity because it is a good test.
+#include <fcntl.h>
+#include <algorithm>
+#include <set>
+#include <thread>
+#include <unordered_set>
+#include <utility>
+#ifndef OS_WIN
+#include <unistd.h>
+#endif
+#ifdef OS_SOLARIS
+#include <alloca.h>
+#endif
+
+#include "cache/lru_cache.h"
+#include "db/blob_index.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "memtable/hash_linklist_rep.h"
+#include "monitoring/thread_status_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/thread_status.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/mock_table.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/scoped_arena_iterator.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/compression.h"
+#include "util/mutexlock.h"
+#include "util/rate_limiter.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBTest : public DBTestBase {
+ public:
+ DBTest() : DBTestBase("/db_test") {}
+};
+
+class DBTestWithParam
+ : public DBTest,
+ public testing::WithParamInterface<std::tuple<uint32_t, bool>> {
+ public:
+ DBTestWithParam() {
+ max_subcompactions_ = std::get<0>(GetParam());
+ exclusive_manual_compaction_ = std::get<1>(GetParam());
+ }
+
+ // Required if inheriting from testing::WithParamInterface<>
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ uint32_t max_subcompactions_;
+ bool exclusive_manual_compaction_;
+};
+
+TEST_F(DBTest, MockEnvTest) {
+ std::unique_ptr<MockEnv> env{new MockEnv(Env::Default())};
+ Options options;
+ options.create_if_missing = true;
+ options.env = env.get();
+ DB* db;
+
+ const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
+ const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
+
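+  // "/dir/db" exists only inside the in-memory MockEnv; nothing is written to
+  // the real filesystem.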
+ ASSERT_OK(DB::Open(options, "/dir/db", &db));
+ for (size_t i = 0; i < 3; ++i) {
+ ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
+ }
+
+ for (size_t i = 0; i < 3; ++i) {
+ std::string res;
+ ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+ ASSERT_TRUE(res == vals[i]);
+ }
+
+ Iterator* iterator = db->NewIterator(ReadOptions());
+ iterator->SeekToFirst();
+ for (size_t i = 0; i < 3; ++i) {
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_TRUE(keys[i] == iterator->key());
+ ASSERT_TRUE(vals[i] == iterator->value());
+ iterator->Next();
+ }
+ ASSERT_TRUE(!iterator->Valid());
+ delete iterator;
+
+// TEST_FlushMemTable() is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+
+ for (size_t i = 0; i < 3; ++i) {
+ std::string res;
+ ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+ ASSERT_TRUE(res == vals[i]);
+ }
+#endif // ROCKSDB_LITE
+
+ delete db;
+}
+
+// NewMemEnv returns nullptr in ROCKSDB_LITE since class InMemoryEnv isn't
+// defined.
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, MemEnvTest) {
+ std::unique_ptr<Env> env{NewMemEnv(Env::Default())};
+ Options options;
+ options.create_if_missing = true;
+ options.env = env.get();
+ DB* db;
+
+ const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
+ const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
+
+ ASSERT_OK(DB::Open(options, "/dir/db", &db));
+ for (size_t i = 0; i < 3; ++i) {
+ ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
+ }
+
+ for (size_t i = 0; i < 3; ++i) {
+ std::string res;
+ ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+ ASSERT_TRUE(res == vals[i]);
+ }
+
+ Iterator* iterator = db->NewIterator(ReadOptions());
+ iterator->SeekToFirst();
+ for (size_t i = 0; i < 3; ++i) {
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_TRUE(keys[i] == iterator->key());
+ ASSERT_TRUE(vals[i] == iterator->value());
+ iterator->Next();
+ }
+ ASSERT_TRUE(!iterator->Valid());
+ delete iterator;
+
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+
+ for (size_t i = 0; i < 3; ++i) {
+ std::string res;
+ ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+ ASSERT_TRUE(res == vals[i]);
+ }
+
+ delete db;
+
+ options.create_if_missing = false;
+ ASSERT_OK(DB::Open(options, "/dir/db", &db));
+ for (size_t i = 0; i < 3; ++i) {
+ std::string res;
+ ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+ ASSERT_TRUE(res == vals[i]);
+ }
+ delete db;
+}
+#endif // ROCKSDB_LITE
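+
+// Illustrative sketch (not part of the upstream tests): the typical way an
+// application wires the in-memory Env exercised above into its own database.
+// The DB must be deleted before the Env it uses goes out of scope. Guarded by
+// #if 0 so it is never compiled; the path used here is made up.
+#if 0
+static void MemEnvUsageSketch() {
+  std::unique_ptr<Env> mem_env{NewMemEnv(Env::Default())};
+  Options opts;
+  opts.create_if_missing = true;
+  opts.env = mem_env.get();
+
+  DB* db = nullptr;
+  Status s = DB::Open(opts, "/in-memory/db", &db);
+  if (s.ok()) {
+    s = db->Put(WriteOptions(), "key", "value");
+    std::string value;
+    s = db->Get(ReadOptions(), "key", &value);
+  }
+  delete db;  // close the DB while mem_env is still alive
+}
+#endif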
+
+TEST_F(DBTest, WriteEmptyBatch) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ WriteOptions wo;
+ wo.sync = true;
+ wo.disableWAL = false;
+ WriteBatch empty_batch;
+ ASSERT_OK(dbfull()->Write(wo, &empty_batch));
+
+ // make sure we can re-open it.
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+ ASSERT_EQ("bar", Get(1, "foo"));
+}
+
+TEST_F(DBTest, SkipDelay) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ for (bool sync : {true, false}) {
+ for (bool disableWAL : {true, false}) {
+ if (sync && disableWAL) {
+ // sync and disableWAL are incompatible.
+ continue;
+ }
+ // Use a very small delayed-write rate so the resulting delay is large
+ // enough to still be in effect when we do the Put.
+ // TODO(myabandeh): this is time dependent and could potentially make
+ // the test flaky
+ auto token = dbfull()->TEST_write_controler().GetDelayToken(1);
+ std::atomic<int> sleep_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:Sleep",
+ [&](void* /*arg*/) { sleep_count.fetch_add(1); });
+ std::atomic<int> wait_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:Wait",
+ [&](void* /*arg*/) { wait_count.fetch_add(1); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions wo;
+ wo.sync = sync;
+ wo.disableWAL = disableWAL;
+ wo.no_slowdown = true;
+ dbfull()->Put(wo, "foo", "bar");
+ // We need the 2nd write to trigger the delay, because the delay is
+ // estimated from the previous write's size, which is 0 for the first write.
+ ASSERT_NOK(dbfull()->Put(wo, "foo2", "bar2"));
+ ASSERT_GE(sleep_count.load(), 0);
+ ASSERT_GE(wait_count.load(), 0);
+ token.reset();
+
+ token = dbfull()->TEST_write_controler().GetDelayToken(1000000000);
+ wo.no_slowdown = false;
+ ASSERT_OK(dbfull()->Put(wo, "foo3", "bar3"));
+ ASSERT_GE(sleep_count.load(), 1);
+ token.reset();
+ }
+ }
+}
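+
+// Illustrative sketch (not part of the upstream tests): how a caller can use
+// WriteOptions::no_slowdown, which the tests above exercise. When a write
+// would have to wait out a stall, RocksDB is expected to return a non-OK
+// status instead of blocking (IsIncomplete() in the versions this test
+// targets; treat that as an assumption). Guarded by #if 0, never compiled.
+#if 0
+static void NoSlowdownSketch(DB* db) {
+  WriteOptions wo;
+  wo.no_slowdown = true;
+  Status s = db->Put(wo, "key", "value");
+  if (!s.ok()) {
+    // The write was rejected rather than stalled. Either retry later or
+    // fall back to a blocking write.
+    wo.no_slowdown = false;
+    s = db->Put(wo, "key", "value");
+  }
+}
+#endif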
+
+TEST_F(DBTest, MixedSlowdownOptions) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ std::vector<port::Thread> threads;
+ std::atomic<int> thread_num(0);
+
+ std::function<void()> write_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = false;
+ ASSERT_OK(dbfull()->Put(wo, key, "bar"));
+ };
+ std::function<void()> write_no_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = true;
+ ASSERT_NOK(dbfull()->Put(wo, key, "bar"));
+ };
+ // Use a very small delayed-write rate so the resulting delay is large
+ // enough to still be in effect when we do the Put.
+ // TODO(myabandeh): this is time dependent and could potentially make
+ // the test flaky
+ auto token = dbfull()->TEST_write_controler().GetDelayToken(1);
+ std::atomic<int> sleep_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:BeginWriteStallDone", [&](void* /*arg*/) {
+ sleep_count.fetch_add(1);
+ if (threads.empty()) {
+ for (int i = 0; i < 2; ++i) {
+ threads.emplace_back(write_slowdown_func);
+ }
+ for (int i = 0; i < 2; ++i) {
+ threads.emplace_back(write_no_slowdown_func);
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions wo;
+ wo.sync = false;
+ wo.disableWAL = false;
+ wo.no_slowdown = false;
+ dbfull()->Put(wo, "foo", "bar");
+ // We need the 2nd write to trigger the delay, because the delay is
+ // estimated from the previous write's size, which is 0 for the first write.
+ ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2"));
+ token.reset();
+
+ for (auto& t : threads) {
+ t.join();
+ }
+ ASSERT_GE(sleep_count.load(), 1);
+
+ wo.no_slowdown = true;
+ ASSERT_OK(dbfull()->Put(wo, "foo3", "bar"));
+}
+
+TEST_F(DBTest, MixedSlowdownOptionsInQueue) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ std::vector<port::Thread> threads;
+ std::atomic<int> thread_num(0);
+
+ std::function<void()> write_no_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = true;
+ ASSERT_NOK(dbfull()->Put(wo, key, "bar"));
+ };
+ // Use a very small delayed-write rate so the resulting delay is large
+ // enough to still be in effect when we do the Put.
+ // TODO(myabandeh): this is time dependent and could potentially make
+ // the test flaky
+ auto token = dbfull()->TEST_write_controler().GetDelayToken(1);
+ std::atomic<int> sleep_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:Sleep", [&](void* /*arg*/) {
+ sleep_count.fetch_add(1);
+ if (threads.empty()) {
+ for (int i = 0; i < 2; ++i) {
+ threads.emplace_back(write_no_slowdown_func);
+ }
+ // Sleep for 3s to allow the threads to insert themselves into the
+ // write queue
+ env_->SleepForMicroseconds(3000000ULL);
+ }
+ });
+ std::atomic<int> wait_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:Wait",
+ [&](void* /*arg*/) { wait_count.fetch_add(1); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions wo;
+ wo.sync = false;
+ wo.disableWAL = false;
+ wo.no_slowdown = false;
+ dbfull()->Put(wo, "foo", "bar");
+ // We need the 2nd write to trigger the delay, because the delay is
+ // estimated from the previous write's size, which is 0 for the first write.
+ ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2"));
+ token.reset();
+
+ for (auto& t : threads) {
+ t.join();
+ }
+ ASSERT_EQ(sleep_count.load(), 1);
+ ASSERT_GE(wait_count.load(), 0);
+}
+
+TEST_F(DBTest, MixedSlowdownOptionsStop) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ std::vector<port::Thread> threads;
+ std::atomic<int> thread_num(0);
+
+ std::function<void()> write_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = false;
+ ASSERT_OK(dbfull()->Put(wo, key, "bar"));
+ };
+ std::function<void()> write_no_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = true;
+ ASSERT_NOK(dbfull()->Put(wo, key, "bar"));
+ };
+ std::function<void()> wakeup_writer = [&]() {
+ dbfull()->mutex_.Lock();
+ dbfull()->bg_cv_.SignalAll();
+ dbfull()->mutex_.Unlock();
+ };
+ // Use a stop token so that writers block until the token is released.
+ // TODO(myabandeh): this is time dependent and could potentially make
+ // the test flaky
+ auto token = dbfull()->TEST_write_controler().GetStopToken();
+ std::atomic<int> wait_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:Wait", [&](void* /*arg*/) {
+ wait_count.fetch_add(1);
+ if (threads.empty()) {
+ for (int i = 0; i < 2; ++i) {
+ threads.emplace_back(write_slowdown_func);
+ }
+ for (int i = 0; i < 2; ++i) {
+ threads.emplace_back(write_no_slowdown_func);
+ }
+ // Sleep for 3s to allow the threads to insert themselves into the
+ // write queue
+ env_->SleepForMicroseconds(3000000ULL);
+ }
+ token.reset();
+ threads.emplace_back(wakeup_writer);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions wo;
+ wo.sync = false;
+ wo.disableWAL = false;
+ wo.no_slowdown = false;
+ dbfull()->Put(wo, "foo", "bar");
+ // We need the 2nd write to trigger the delay, because the delay is
+ // estimated from the previous write's size, which is 0 for the first write.
+ ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2"));
+ token.reset();
+
+ for (auto& t : threads) {
+ t.join();
+ }
+ ASSERT_GE(wait_count.load(), 1);
+
+ wo.no_slowdown = true;
+ ASSERT_OK(dbfull()->Put(wo, "foo3", "bar"));
+}
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBTest, LevelLimitReopen) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const std::string value(1024 * 1024, ' ');
+ int i = 0;
+ while (NumTableFilesAtLevel(2, 1) == 0) {
+ ASSERT_OK(Put(1, Key(i++), value));
+ }
+
+ options.num_levels = 1;
+ options.max_bytes_for_level_multiplier_additional.resize(1, 1);
+ Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ(s.IsInvalidArgument(), true);
+ ASSERT_EQ(s.ToString(),
+ "Invalid argument: db has more levels than options.num_levels");
+
+ options.num_levels = 10;
+ options.max_bytes_for_level_multiplier_additional.resize(10, 1);
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
+#endif // ROCKSDB_LITE
+
+
+TEST_F(DBTest, PutSingleDeleteGet) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_OK(Put(1, "foo2", "v2"));
+ ASSERT_EQ("v2", Get(1, "foo2"));
+ ASSERT_OK(SingleDelete(1, "foo"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+ // Skip FIFO and universal compaction because they do not apply to the test
+ // case. Skip MergePut because single delete does not get removed when it
+ // encounters a merge.
+ } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+ kSkipMergePut));
+}
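+
+// Illustrative sketch (not part of the upstream tests): the SingleDelete
+// contract the tests above rely on. SingleDelete should only be used when the
+// key was Put exactly once since the previous Delete/SingleDelete; mixing it
+// with Merge or repeated Puts gives undefined results. Guarded by #if 0.
+#if 0
+static void SingleDeleteSketch(DB* db) {
+  // One Put followed by one SingleDelete: well defined, the key disappears.
+  Status s = db->Put(WriteOptions(), "foo", "v1");
+  s = db->SingleDelete(WriteOptions(), "foo");
+  std::string value;
+  s = db->Get(ReadOptions(), "foo", &value);  // expected: s.IsNotFound()
+}
+#endif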
+
+TEST_F(DBTest, ReadFromPersistedTier) {
+ do {
+ Random rnd(301);
+ Options options = CurrentOptions();
+ for (int disableWAL = 0; disableWAL <= 1; ++disableWAL) {
+ CreateAndReopenWithCF({"pikachu"}, options);
+ WriteOptions wopt;
+ wopt.disableWAL = (disableWAL == 1);
+ // 1st round: put but not flush
+ ASSERT_OK(db_->Put(wopt, handles_[1], "foo", "first"));
+ ASSERT_OK(db_->Put(wopt, handles_[1], "bar", "one"));
+ ASSERT_EQ("first", Get(1, "foo"));
+ ASSERT_EQ("one", Get(1, "bar"));
+
+ // Read directly from persisted data.
+ ReadOptions ropt;
+ ropt.read_tier = kPersistedTier;
+ std::string value;
+ if (wopt.disableWAL) {
+ // As the data has not yet been flushed, we expect NotFound.
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).IsNotFound());
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).IsNotFound());
+ } else {
+ ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value));
+ ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value));
+ }
+
+ // Multiget
+ std::vector<ColumnFamilyHandle*> multiget_cfs;
+ multiget_cfs.push_back(handles_[1]);
+ multiget_cfs.push_back(handles_[1]);
+ std::vector<Slice> multiget_keys;
+ multiget_keys.push_back("foo");
+ multiget_keys.push_back("bar");
+ std::vector<std::string> multiget_values;
+ auto statuses =
+ db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values);
+ if (wopt.disableWAL) {
+ ASSERT_TRUE(statuses[0].IsNotFound());
+ ASSERT_TRUE(statuses[1].IsNotFound());
+ } else {
+ ASSERT_OK(statuses[0]);
+ ASSERT_OK(statuses[1]);
+ }
+
+ // 2nd round: flush and put a new value in memtable.
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(db_->Put(wopt, handles_[1], "rocksdb", "hello"));
+
+ // once the data has been flushed, we are able to get the
+ // data when kPersistedTier is used.
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).ok());
+ ASSERT_EQ(value, "first");
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).ok());
+ ASSERT_EQ(value, "one");
+ if (wopt.disableWAL) {
+ ASSERT_TRUE(
+ db_->Get(ropt, handles_[1], "rocksdb", &value).IsNotFound());
+ } else {
+ ASSERT_OK(db_->Get(ropt, handles_[1], "rocksdb", &value));
+ ASSERT_EQ(value, "hello");
+ }
+
+ // Expect same result in multiget
+ multiget_cfs.push_back(handles_[1]);
+ multiget_keys.push_back("rocksdb");
+ statuses =
+ db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values);
+ ASSERT_TRUE(statuses[0].ok());
+ ASSERT_EQ("first", multiget_values[0]);
+ ASSERT_TRUE(statuses[1].ok());
+ ASSERT_EQ("one", multiget_values[1]);
+ if (wopt.disableWAL) {
+ ASSERT_TRUE(statuses[2].IsNotFound());
+ } else {
+ ASSERT_OK(statuses[2]);
+ }
+
+ // 3rd round: delete and flush
+ ASSERT_OK(db_->Delete(wopt, handles_[1], "foo"));
+ Flush(1);
+ ASSERT_OK(db_->Delete(wopt, handles_[1], "bar"));
+
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).IsNotFound());
+ if (wopt.disableWAL) {
+ // Still expect to find the value, as its delete has not yet been
+ // flushed.
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).ok());
+ ASSERT_EQ(value, "one");
+ } else {
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).IsNotFound());
+ }
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "rocksdb", &value).ok());
+ ASSERT_EQ(value, "hello");
+
+ statuses =
+ db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values);
+ ASSERT_TRUE(statuses[0].IsNotFound());
+ if (wopt.disableWAL) {
+ ASSERT_TRUE(statuses[1].ok());
+ ASSERT_EQ("one", multiget_values[1]);
+ } else {
+ ASSERT_TRUE(statuses[1].IsNotFound());
+ }
+ ASSERT_TRUE(statuses[2].ok());
+ ASSERT_EQ("hello", multiget_values[2]);
+ if (wopt.disableWAL == 0) {
+ DestroyAndReopen(options);
+ }
+ }
+ } while (ChangeOptions());
+}
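+
+// Illustrative sketch (not part of the upstream tests): reading only
+// persisted data with ReadOptions::read_tier = kPersistedTier, as exercised
+// above. Entries that live only in the memtable and were written with
+// disableWAL (so nothing is persisted yet) come back as NotFound; data
+// covered by the WAL or by SST files is visible. Guarded by #if 0.
+#if 0
+static void PersistedTierSketch(DB* db) {
+  ReadOptions ropt;
+  ropt.read_tier = kPersistedTier;
+  std::string value;
+  Status s = db->Get(ropt, "key", &value);
+  if (s.IsNotFound()) {
+    // The key is either absent or not yet persisted (memtable-only data
+    // written with disableWAL).
+  }
+}
+#endif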
+
+TEST_F(DBTest, SingleDeleteFlush) {
+ // Test to check whether flushing preserves a single delete hidden
+ // behind a put.
+ do {
+ Random rnd(301);
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Put values on the second level (so that they will not be in the same
+ // compaction as the other operations).
+ Put(1, "foo", "first");
+ Put(1, "bar", "one");
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(2, 1);
+
+ // (Single) delete hidden by a put
+ SingleDelete(1, "foo");
+ Put(1, "foo", "second");
+ Delete(1, "bar");
+ Put(1, "bar", "two");
+ ASSERT_OK(Flush(1));
+
+ SingleDelete(1, "foo");
+ Delete(1, "bar");
+ ASSERT_OK(Flush(1));
+
+ dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr);
+
+ ASSERT_EQ("NOT_FOUND", Get(1, "bar"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+ // Skip FIFO and universal compaction because they do not apply to the test
+ // case. Skip MergePut because single delete does not get removed when it
+ // encounters a merge.
+ } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+ kSkipMergePut));
+}
+
+TEST_F(DBTest, SingleDeletePutFlush) {
+ // Single deletes that encounter the matching put in a flush should get
+ // removed.
+ do {
+ Random rnd(301);
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Put(1, "foo", Slice());
+ Put(1, "a", Slice());
+ SingleDelete(1, "a");
+ ASSERT_OK(Flush(1));
+
+ ASSERT_EQ("[ ]", AllEntriesFor("a", 1));
+ // Skip FIFO and universal compaction because they do not apply to the test
+ // case. Skip MergePut because single delete does not get removed when it
+ // encounters a merge.
+ } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+ kSkipMergePut));
+}
+
+// Disabled because not all platforms can run it.
+// It requires more than 9GB of memory to run, with a single allocation
+// of more than 3GB.
+TEST_F(DBTest, DISABLED_SanitizeVeryVeryLargeValue) {
+ const size_t kValueSize = 4 * size_t{1024 * 1024 * 1024}; // 4GB value
+ std::string raw(kValueSize, 'v');
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ options.write_buffer_size = 100000; // Small write buffer
+ options.paranoid_checks = true;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("boo", "v1"));
+ ASSERT_TRUE(Put("foo", raw).IsInvalidArgument());
+ ASSERT_TRUE(Merge("foo", raw).IsInvalidArgument());
+
+ WriteBatch wb;
+ ASSERT_TRUE(wb.Put("foo", raw).IsInvalidArgument());
+ ASSERT_TRUE(wb.Merge("foo", raw).IsInvalidArgument());
+
+ Slice value_slice = raw;
+ Slice key_slice = "foo";
+ SliceParts sp_key(&key_slice, 1);
+ SliceParts sp_value(&value_slice, 1);
+
+ ASSERT_TRUE(wb.Put(sp_key, sp_value).IsInvalidArgument());
+ ASSERT_TRUE(wb.Merge(sp_key, sp_value).IsInvalidArgument());
+}
+
+// Disabled because not all platforms can run it.
+// It requires more than 9GB of memory to run, with a single allocation
+// of more than 3GB.
+TEST_F(DBTest, DISABLED_VeryLargeValue) {
+ const size_t kValueSize = 3221225472u; // 3GB value
+ const size_t kKeySize = 8388608u; // 8MB key
+ std::string raw(kValueSize, 'v');
+ std::string key1(kKeySize, 'c');
+ std::string key2(kKeySize, 'd');
+
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000; // Small write buffer
+ options.paranoid_checks = true;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("boo", "v1"));
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Put(key1, raw));
+ raw[0] = 'w';
+ ASSERT_OK(Put(key2, raw));
+ dbfull()->TEST_WaitForFlushMemTable();
+
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+#endif // !ROCKSDB_LITE
+
+ std::string value;
+ Status s = db_->Get(ReadOptions(), key1, &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(kValueSize, value.size());
+ ASSERT_EQ('v', value[0]);
+
+ s = db_->Get(ReadOptions(), key2, &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(kValueSize, value.size());
+ ASSERT_EQ('w', value[0]);
+
+ // Compact all files.
+ Flush();
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ // Check DB is not in read-only state.
+ ASSERT_OK(Put("boo", "v1"));
+
+ s = db_->Get(ReadOptions(), key1, &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(kValueSize, value.size());
+ ASSERT_EQ('v', value[0]);
+
+ s = db_->Get(ReadOptions(), key2, &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(kValueSize, value.size());
+ ASSERT_EQ('w', value[0]);
+}
+
+TEST_F(DBTest, GetFromImmutableLayer) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_EQ("v1", Get(1, "foo"));
+
+ // Block sync calls
+ env_->delay_sstable_sync_.store(true, std::memory_order_release);
+ Put(1, "k1", std::string(100000, 'x')); // Fill memtable
+ Put(1, "k2", std::string(100000, 'y')); // Trigger flush
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
+ // Release sync calls
+ env_->delay_sstable_sync_.store(false, std::memory_order_release);
+ } while (ChangeOptions());
+}
+
+
+TEST_F(DBTest, GetLevel0Ordering) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ // Check that we process level-0 files in the correct order. The code
+ // below generates two level-0 files where the earlier one comes
+ // before the later one in the level-0 file list since the earlier
+ // one has a smaller "smallest" key.
+ ASSERT_OK(Put(1, "bar", "b"));
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "foo", "v2"));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ("v2", Get(1, "foo"));
+ } while (ChangeOptions());
+}
+
+TEST_F(DBTest, WrongLevel0Config) {
+ Options options = CurrentOptions();
+ Close();
+ ASSERT_OK(DestroyDB(dbname_, options));
+ options.level0_stop_writes_trigger = 1;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_file_num_compaction_trigger = 3;
+ ASSERT_OK(DB::Open(options, dbname_, &db_));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, GetOrderedByLevels) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ Compact(1, "a", "z");
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_OK(Put(1, "foo", "v2"));
+ ASSERT_EQ("v2", Get(1, "foo"));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ("v2", Get(1, "foo"));
+ } while (ChangeOptions());
+}
+
+TEST_F(DBTest, GetPicksCorrectFile) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ // Arrange to have multiple files in a non-level-0 level.
+ ASSERT_OK(Put(1, "a", "va"));
+ Compact(1, "a", "b");
+ ASSERT_OK(Put(1, "x", "vx"));
+ Compact(1, "x", "y");
+ ASSERT_OK(Put(1, "f", "vf"));
+ Compact(1, "f", "g");
+ ASSERT_EQ("va", Get(1, "a"));
+ ASSERT_EQ("vf", Get(1, "f"));
+ ASSERT_EQ("vx", Get(1, "x"));
+ } while (ChangeOptions());
+}
+
+TEST_F(DBTest, GetEncountersEmptyLevel) {
+ do {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ // Arrange for the following to happen:
+ // * sstable A in level 0
+ // * nothing in level 1
+ // * sstable B in level 2
+ // Then do enough Get() calls to arrange for an automatic compaction
+ // of sstable A. A bug would cause the compaction to be marked as
+ // occurring at level 1 (instead of the correct level 0).
+
+ // Step 1: First place sstables in levels 0 and 2
+ Put(1, "a", "begin");
+ Put(1, "z", "end");
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+ dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+ Put(1, "a", "begin");
+ Put(1, "z", "end");
+ ASSERT_OK(Flush(1));
+ ASSERT_GT(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_GT(NumTableFilesAtLevel(2, 1), 0);
+
+ // Step 2: clear level 1 if necessary.
+ dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2, 1), 1);
+
+ // Step 3: read a bunch of times
+ for (int i = 0; i < 1000; i++) {
+ ASSERT_EQ("NOT_FOUND", Get(1, "missing"));
+ }
+
+ // Step 4: Wait for compaction to finish
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1); // XXX
+ } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction));
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, FlushMultipleMemtable) {
+ do {
+ Options options = CurrentOptions();
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 3;
+ options.max_write_buffer_size_to_maintain = -1;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v1", Get(1, "bar"));
+ ASSERT_OK(Flush(1));
+ } while (ChangeCompactOptions());
+}
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, FlushSchedule) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.level0_stop_writes_trigger = 1 << 10;
+ options.level0_slowdown_writes_trigger = 1 << 10;
+ options.min_write_buffer_number_to_merge = 1;
+ options.max_write_buffer_size_to_maintain =
+ static_cast<int64_t>(options.write_buffer_size);
+ options.max_write_buffer_number = 2;
+ options.write_buffer_size = 120 * 1024;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ std::vector<port::Thread> threads;
+
+ std::atomic<int> thread_num(0);
+ // Each column family will have 5 threads, each thread generating 2 memtables.
+ // Each column family should end up with 10 table files.
+ std::function<void()> fill_memtable_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ Random rnd(a);
+ WriteOptions wo;
+ // this should fill up 2 memtables
+ for (int k = 0; k < 5000; ++k) {
+ ASSERT_OK(db_->Put(wo, handles_[a & 1], RandomString(&rnd, 13), ""));
+ }
+ };
+
+ for (int i = 0; i < 10; ++i) {
+ threads.emplace_back(fill_memtable_func);
+ }
+
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ auto default_tables = GetNumberOfSstFilesForColumnFamily(db_, "default");
+ auto pikachu_tables = GetNumberOfSstFilesForColumnFamily(db_, "pikachu");
+ ASSERT_LE(default_tables, static_cast<uint64_t>(10));
+ ASSERT_GT(default_tables, static_cast<uint64_t>(0));
+ ASSERT_LE(pikachu_tables, static_cast<uint64_t>(10));
+ ASSERT_GT(pikachu_tables, static_cast<uint64_t>(0));
+}
+#endif // ROCKSDB_LITE
+
+namespace {
+class KeepFilter : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ return false;
+ }
+
+ const char* Name() const override { return "KeepFilter"; }
+};
+
+class KeepFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit KeepFilterFactory(bool check_context = false)
+ : check_context_(check_context) {}
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ if (check_context_) {
+ EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction);
+ EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction);
+ }
+ return std::unique_ptr<CompactionFilter>(new KeepFilter());
+ }
+
+ const char* Name() const override { return "KeepFilterFactory"; }
+ bool check_context_;
+ std::atomic_bool expect_full_compaction_;
+ std::atomic_bool expect_manual_compaction_;
+};
+
+class DelayFilter : public CompactionFilter {
+ public:
+ explicit DelayFilter(DBTestBase* d) : db_test(d) {}
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ db_test->env_->addon_time_.fetch_add(1000);
+ return true;
+ }
+
+ const char* Name() const override { return "DelayFilter"; }
+
+ private:
+ DBTestBase* db_test;
+};
+
+class DelayFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {}
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& /*context*/) override {
+ return std::unique_ptr<CompactionFilter>(new DelayFilter(db_test));
+ }
+
+ const char* Name() const override { return "DelayFilterFactory"; }
+
+ private:
+ DBTestBase* db_test;
+};
+} // namespace
+
+#ifndef ROCKSDB_LITE
+
+static std::string CompressibleString(Random* rnd, int len) {
+ std::string r;
+ test::CompressibleString(rnd, 0.8, len, &r);
+ return r;
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, FailMoreDbPaths) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 10000000);
+ options.db_paths.emplace_back(dbname_ + "_2", 1000000);
+ options.db_paths.emplace_back(dbname_ + "_3", 1000000);
+ options.db_paths.emplace_back(dbname_ + "_4", 1000000);
+ options.db_paths.emplace_back(dbname_ + "_5", 1000000);
+ ASSERT_TRUE(TryReopen(options).IsNotSupported());
+}
+
+void CheckColumnFamilyMeta(
+ const ColumnFamilyMetaData& cf_meta,
+ const std::vector<std::vector<FileMetaData>>& files_by_level,
+ uint64_t start_time, uint64_t end_time) {
+ ASSERT_EQ(cf_meta.name, kDefaultColumnFamilyName);
+ ASSERT_EQ(cf_meta.levels.size(), files_by_level.size());
+
+ uint64_t cf_size = 0;
+ size_t file_count = 0;
+
+ for (size_t i = 0; i < cf_meta.levels.size(); ++i) {
+ const auto& level_meta_from_cf = cf_meta.levels[i];
+ const auto& level_meta_from_files = files_by_level[i];
+
+ ASSERT_EQ(level_meta_from_cf.level, i);
+ ASSERT_EQ(level_meta_from_cf.files.size(), level_meta_from_files.size());
+
+ file_count += level_meta_from_cf.files.size();
+
+ uint64_t level_size = 0;
+ for (size_t j = 0; j < level_meta_from_cf.files.size(); ++j) {
+ const auto& file_meta_from_cf = level_meta_from_cf.files[j];
+ const auto& file_meta_from_files = level_meta_from_files[j];
+
+ level_size += file_meta_from_cf.size;
+
+ ASSERT_EQ(file_meta_from_cf.file_number,
+ file_meta_from_files.fd.GetNumber());
+ ASSERT_EQ(file_meta_from_cf.file_number,
+ TableFileNameToNumber(file_meta_from_cf.name));
+ ASSERT_EQ(file_meta_from_cf.size, file_meta_from_files.fd.file_size);
+ ASSERT_EQ(file_meta_from_cf.smallest_seqno,
+ file_meta_from_files.fd.smallest_seqno);
+ ASSERT_EQ(file_meta_from_cf.largest_seqno,
+ file_meta_from_files.fd.largest_seqno);
+ ASSERT_EQ(file_meta_from_cf.smallestkey,
+ file_meta_from_files.smallest.user_key().ToString());
+ ASSERT_EQ(file_meta_from_cf.largestkey,
+ file_meta_from_files.largest.user_key().ToString());
+ ASSERT_EQ(file_meta_from_cf.oldest_blob_file_number,
+ file_meta_from_files.oldest_blob_file_number);
+ ASSERT_EQ(file_meta_from_cf.oldest_ancester_time,
+ file_meta_from_files.oldest_ancester_time);
+ ASSERT_EQ(file_meta_from_cf.file_creation_time,
+ file_meta_from_files.file_creation_time);
+ ASSERT_GE(file_meta_from_cf.file_creation_time, start_time);
+ ASSERT_LE(file_meta_from_cf.file_creation_time, end_time);
+ ASSERT_GE(file_meta_from_cf.oldest_ancester_time, start_time);
+ ASSERT_LE(file_meta_from_cf.oldest_ancester_time, end_time);
+ }
+
+ ASSERT_EQ(level_meta_from_cf.size, level_size);
+ cf_size += level_size;
+ }
+
+ ASSERT_EQ(cf_meta.file_count, file_count);
+ ASSERT_EQ(cf_meta.size, cf_size);
+}
+
+void CheckLiveFilesMeta(
+ const std::vector<LiveFileMetaData>& live_file_meta,
+ const std::vector<std::vector<FileMetaData>>& files_by_level) {
+ size_t total_file_count = 0;
+ for (const auto& f : files_by_level) {
+ total_file_count += f.size();
+ }
+
+ ASSERT_EQ(live_file_meta.size(), total_file_count);
+
+ int level = 0;
+ int i = 0;
+
+ for (const auto& meta : live_file_meta) {
+ if (level != meta.level) {
+ level = meta.level;
+ i = 0;
+ }
+
+ ASSERT_LT(i, files_by_level[level].size());
+
+ const auto& expected_meta = files_by_level[level][i];
+
+ ASSERT_EQ(meta.column_family_name, kDefaultColumnFamilyName);
+ ASSERT_EQ(meta.file_number, expected_meta.fd.GetNumber());
+ ASSERT_EQ(meta.file_number, TableFileNameToNumber(meta.name));
+ ASSERT_EQ(meta.size, expected_meta.fd.file_size);
+ ASSERT_EQ(meta.smallest_seqno, expected_meta.fd.smallest_seqno);
+ ASSERT_EQ(meta.largest_seqno, expected_meta.fd.largest_seqno);
+ ASSERT_EQ(meta.smallestkey, expected_meta.smallest.user_key().ToString());
+ ASSERT_EQ(meta.largestkey, expected_meta.largest.user_key().ToString());
+ ASSERT_EQ(meta.oldest_blob_file_number,
+ expected_meta.oldest_blob_file_number);
+
+ ++i;
+ }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, MetaDataTest) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+
+ int64_t temp_time = 0;
+ options.env->GetCurrentTime(&temp_time);
+ uint64_t start_time = static_cast<uint64_t>(temp_time);
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ int key_index = 0;
+ for (int i = 0; i < 100; ++i) {
+ // Add a single blob reference to each file
+ std::string blob_index;
+ BlobIndex::EncodeBlob(&blob_index, /* blob_file_number */ i + 1000,
+ /* offset */ 1234, /* size */ 5678, kNoCompression);
+
+ WriteBatch batch;
+ ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, Key(key_index),
+ blob_index));
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+
+ ++key_index;
+
+ // Fill up the rest of the file with random values.
+ GenerateNewFile(&rnd, &key_index, /* nowait */ true);
+
+ Flush();
+ }
+
+ std::vector<std::vector<FileMetaData>> files_by_level;
+ dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files_by_level);
+
+ options.env->GetCurrentTime(&temp_time);
+ uint64_t end_time = static_cast<uint64_t>(temp_time);
+
+ ColumnFamilyMetaData cf_meta;
+ db_->GetColumnFamilyMetaData(&cf_meta);
+ CheckColumnFamilyMeta(cf_meta, files_by_level, start_time, end_time);
+
+ std::vector<LiveFileMetaData> live_file_meta;
+ db_->GetLiveFilesMetaData(&live_file_meta);
+ CheckLiveFilesMeta(live_file_meta, files_by_level);
+}
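+
+// Illustrative sketch (not part of the upstream tests): how the metadata APIs
+// verified by CheckColumnFamilyMeta/CheckLiveFilesMeta above are consumed by
+// an application. Guarded by #if 0.
+#if 0
+static void MetaDataSketch(DB* db) {
+  ColumnFamilyMetaData cf_meta;
+  db->GetColumnFamilyMetaData(&cf_meta);  // default column family
+  for (const auto& level : cf_meta.levels) {
+    for (const auto& file : level.files) {
+      // file.name, file.size, file.smallestkey, file.largestkey, ...
+    }
+  }
+
+  std::vector<LiveFileMetaData> live_files;
+  db->GetLiveFilesMetaData(&live_files);  // all column families, all levels
+}
+#endif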
+
+namespace {
+void MinLevelHelper(DBTest* self, Options& options) {
+ Random rnd(301);
+
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+ num++) {
+ std::vector<std::string> values;
+ // Write 120KB (12 values, each 10K)
+ for (int i = 0; i < 12; i++) {
+ values.push_back(DBTestBase::RandomString(&rnd, 10000));
+ ASSERT_OK(self->Put(DBTestBase::Key(i), values[i]));
+ }
+ self->dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(self->NumTableFilesAtLevel(0), num + 1);
+ }
+
+ // Generate one more file in level-0, which should trigger a level-0 compaction
+ std::vector<std::string> values;
+ for (int i = 0; i < 12; i++) {
+ values.push_back(DBTestBase::RandomString(&rnd, 10000));
+ ASSERT_OK(self->Put(DBTestBase::Key(i), values[i]));
+ }
+ self->dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ(self->NumTableFilesAtLevel(0), 0);
+ ASSERT_EQ(self->NumTableFilesAtLevel(1), 1);
+}
+
+// Returns false if the calling test should be skipped
+bool MinLevelToCompress(CompressionType& type, Options& options, int wbits,
+ int lev, int strategy) {
+ fprintf(stderr,
+         "Test with compression options: window_bits = %d, level = %d, "
+         "strategy = %d\n",
+         wbits, lev, strategy);
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.arena_block_size = 4096;
+ options.num_levels = 3;
+ options.level0_file_num_compaction_trigger = 3;
+ options.create_if_missing = true;
+
+ if (Snappy_Supported()) {
+ type = kSnappyCompression;
+ fprintf(stderr, "using snappy\n");
+ } else if (Zlib_Supported()) {
+ type = kZlibCompression;
+ fprintf(stderr, "using zlib\n");
+ } else if (BZip2_Supported()) {
+ type = kBZip2Compression;
+ fprintf(stderr, "using bzip2\n");
+ } else if (LZ4_Supported()) {
+ type = kLZ4Compression;
+ fprintf(stderr, "using lz4\n");
+ } else if (XPRESS_Supported()) {
+ type = kXpressCompression;
+ fprintf(stderr, "using xpress\n");
+ } else if (ZSTD_Supported()) {
+ type = kZSTD;
+ fprintf(stderr, "using ZSTD\n");
+ } else {
+ fprintf(stderr, "skipping test, compression disabled\n");
+ return false;
+ }
+ options.compression_per_level.resize(options.num_levels);
+
+ // do not compress L0
+ for (int i = 0; i < 1; i++) {
+ options.compression_per_level[i] = kNoCompression;
+ }
+ for (int i = 1; i < options.num_levels; i++) {
+ options.compression_per_level[i] = type;
+ }
+ return true;
+}
+} // namespace
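+
+// Illustrative sketch (not part of the upstream tests): the per-level
+// compression setup that MinLevelToCompress() builds above. Levels below the
+// chosen minimum stay uncompressed; the remaining levels use the detected
+// codec. Guarded by #if 0.
+#if 0
+static void CompressionPerLevelSketch(Options* options) {
+  options->num_levels = 3;
+  options->compression_per_level.resize(options->num_levels);
+  options->compression_per_level[0] = kNoCompression;     // L0 uncompressed
+  options->compression_per_level[1] = kSnappyCompression;
+  options->compression_per_level[2] = kSnappyCompression;
+}
+#endif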
+
+TEST_F(DBTest, MinLevelToCompress1) {
+ Options options = CurrentOptions();
+ CompressionType type = kSnappyCompression;
+ if (!MinLevelToCompress(type, options, -14, -1, 0)) {
+ return;
+ }
+ Reopen(options);
+ MinLevelHelper(this, options);
+
+ // do not compress L0 and L1
+ for (int i = 0; i < 2; i++) {
+ options.compression_per_level[i] = kNoCompression;
+ }
+ for (int i = 2; i < options.num_levels; i++) {
+ options.compression_per_level[i] = type;
+ }
+ DestroyAndReopen(options);
+ MinLevelHelper(this, options);
+}
+
+TEST_F(DBTest, MinLevelToCompress2) {
+ Options options = CurrentOptions();
+ CompressionType type = kSnappyCompression;
+ if (!MinLevelToCompress(type, options, 15, -1, 0)) {
+ return;
+ }
+ Reopen(options);
+ MinLevelHelper(this, options);
+
+ // do not compress L0 and L1
+ for (int i = 0; i < 2; i++) {
+ options.compression_per_level[i] = kNoCompression;
+ }
+ for (int i = 2; i < options.num_levels; i++) {
+ options.compression_per_level[i] = type;
+ }
+ DestroyAndReopen(options);
+ MinLevelHelper(this, options);
+}
+
+// This test may fail because of a legitimate case where multiple L0 files
+// are trivially moved to L1.
+TEST_F(DBTest, DISABLED_RepeatedWritesToSameKey) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000; // Small write buffer
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // We must have at most one file per level except for level-0,
+ // which may have up to kL0_StopWritesTrigger files.
+ const int kMaxFiles =
+ options.num_levels + options.level0_stop_writes_trigger;
+
+ Random rnd(301);
+ std::string value =
+ RandomString(&rnd, static_cast<int>(2 * options.write_buffer_size));
+ for (int i = 0; i < 5 * kMaxFiles; i++) {
+ ASSERT_OK(Put(1, "key", value));
+ ASSERT_LE(TotalTableFiles(1), kMaxFiles);
+ }
+ } while (ChangeCompactOptions());
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, SparseMerge) {
+ do {
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ FillLevels("A", "Z", 1);
+
+ // Suppose there is:
+ // small amount of data with prefix A
+ // large amount of data with prefix B
+ // small amount of data with prefix C
+ // and that recent updates have made small changes to all three prefixes.
+ // Check that we do not do a compaction that merges all of B in one shot.
+ const std::string value(1000, 'x');
+ Put(1, "A", "va");
+ // Write approximately 100MB of "B" values
+ for (int i = 0; i < 100000; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%010d", i);
+ Put(1, key, value);
+ }
+ Put(1, "C", "vc");
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+
+ // Make sparse update
+ Put(1, "A", "va2");
+ Put(1, "B100", "bvalue2");
+ Put(1, "C", "vc2");
+ ASSERT_OK(Flush(1));
+
+ // Compactions should not cause us to create a situation where
+ // a file overlaps too much data at the next level.
+ ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]),
+ 20 * 1048576);
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+ ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]),
+ 20 * 1048576);
+ dbfull()->TEST_CompactRange(1, nullptr, nullptr);
+ ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]),
+ 20 * 1048576);
+ } while (ChangeCompactOptions());
+}
+
+#ifndef ROCKSDB_LITE
+static bool Between(uint64_t val, uint64_t low, uint64_t high) {
+ bool result = (val >= low) && (val <= high);
+ if (!result) {
+ fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
+ (unsigned long long)(val), (unsigned long long)(low),
+ (unsigned long long)(high));
+ }
+ return result;
+}
+
+TEST_F(DBTest, ApproximateSizesMemTable) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000; // Large write buffer
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ auto default_cf = db_->DefaultColumnFamily();
+
+ const int N = 128;
+ Random rnd(301);
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
+ }
+
+ uint64_t size;
+ std::string start = Key(50);
+ std::string end = Key(60);
+ Range r(start, end);
+ SizeApproximationOptions size_approx_options;
+ size_approx_options.include_memtabtles = true;
+ size_approx_options.include_files = true;
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
+ ASSERT_GT(size, 6000);
+ ASSERT_LT(size, 204800);
+ // Zero if not including mem table
+ db_->GetApproximateSizes(&r, 1, &size);
+ ASSERT_EQ(size, 0);
+
+ start = Key(500);
+ end = Key(600);
+ r = Range(start, end);
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
+ ASSERT_EQ(size, 0);
+
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(1000 + i), RandomString(&rnd, 1024)));
+ }
+
+ start = Key(500);
+ end = Key(600);
+ r = Range(start, end);
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
+ ASSERT_EQ(size, 0);
+
+ start = Key(100);
+ end = Key(1020);
+ r = Range(start, end);
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
+ ASSERT_GT(size, 6000);
+
+ options.max_write_buffer_number = 8;
+ options.min_write_buffer_number_to_merge = 5;
+ options.write_buffer_size = 1024 * N; // Not very large
+ DestroyAndReopen(options);
+ default_cf = db_->DefaultColumnFamily();
+
+ int keys[N * 3];
+ for (int i = 0; i < N; i++) {
+ keys[i * 3] = i * 5;
+ keys[i * 3 + 1] = i * 5 + 1;
+ keys[i * 3 + 2] = i * 5 + 2;
+ }
+ std::random_shuffle(std::begin(keys), std::end(keys));
+
+ for (int i = 0; i < N * 3; i++) {
+ ASSERT_OK(Put(Key(keys[i] + 1000), RandomString(&rnd, 1024)));
+ }
+
+ start = Key(100);
+ end = Key(300);
+ r = Range(start, end);
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
+ ASSERT_EQ(size, 0);
+
+ start = Key(1050);
+ end = Key(1080);
+ r = Range(start, end);
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
+ ASSERT_GT(size, 6000);
+
+ start = Key(2100);
+ end = Key(2300);
+ r = Range(start, end);
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
+ ASSERT_EQ(size, 0);
+
+ start = Key(1050);
+ end = Key(1080);
+ r = Range(start, end);
+ uint64_t size_with_mt, size_without_mt;
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1,
+ &size_with_mt);
+ ASSERT_GT(size_with_mt, 6000);
+ db_->GetApproximateSizes(&r, 1, &size_without_mt);
+ ASSERT_EQ(size_without_mt, 0);
+
+ Flush();
+
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(i + 1000), RandomString(&rnd, 1024)));
+ }
+
+ start = Key(1050);
+ end = Key(1080);
+ r = Range(start, end);
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1,
+ &size_with_mt);
+ db_->GetApproximateSizes(&r, 1, &size_without_mt);
+ ASSERT_GT(size_with_mt, size_without_mt);
+ ASSERT_GT(size_without_mt, 6000);
+
+ // Check that include_memtabtles flag works as expected
+ size_approx_options.include_memtabtles = false;
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
+ ASSERT_EQ(size, size_without_mt);
+
+ // Check that files_size_error_margin works as expected when the heuristic
+ // conditions are not met
+ start = Key(1);
+ end = Key(1000 + N - 2);
+ r = Range(start, end);
+ size_approx_options.files_size_error_margin = -1.0; // disabled
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
+ uint64_t size2;
+ size_approx_options.files_size_error_margin = 0.5; // enabled, but not used
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2);
+ ASSERT_EQ(size, size2);
+}
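+
+// Illustrative sketch (not part of the upstream tests): asking for an
+// approximate size that includes memtable data, as exercised above. Note that
+// the field really is spelled include_memtabtles in this version's
+// SizeApproximationOptions; the keys used here are made up. Guarded by #if 0.
+#if 0
+static void ApproximateSizesSketch(DB* db) {
+  SizeApproximationOptions opts;
+  opts.include_memtabtles = true;  // also count unflushed data
+  opts.include_files = true;
+
+  std::string start = "key050";
+  std::string end = "key060";
+  Range r(start, end);
+
+  uint64_t size = 0;
+  db->GetApproximateSizes(opts, db->DefaultColumnFamily(), &r, 1, &size);
+}
+#endif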
+
+TEST_F(DBTest, ApproximateSizesFilesWithErrorMargin) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 1024 * 1024;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options.target_file_size_base = 1024 * 1024;
+ DestroyAndReopen(options);
+ const auto default_cf = db_->DefaultColumnFamily();
+
+ const int N = 64000;
+ Random rnd(301);
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
+ }
+ // Flush everything to files
+ Flush();
+ // Compact the entire key space into the next level
+ db_->CompactRange(CompactRangeOptions(), default_cf, nullptr, nullptr);
+
+ // Write more keys
+ for (int i = N; i < (N + N / 4); i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
+ }
+ // Flush everything to files again
+ Flush();
+
+ // Wait for compaction to finish
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ const std::string start = Key(0);
+ const std::string end = Key(2 * N);
+ const Range r(start, end);
+
+ SizeApproximationOptions size_approx_options;
+ size_approx_options.include_memtabtles = false;
+ size_approx_options.include_files = true;
+ size_approx_options.files_size_error_margin = -1.0; // disabled
+
+ // Get the precise size without any approximation heuristic
+ uint64_t size;
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
+ ASSERT_NE(size, 0);
+
+ // Get the size with an approximation heuristic
+ uint64_t size2;
+ const double error_margin = 0.2;
+ size_approx_options.files_size_error_margin = error_margin;
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2);
+ ASSERT_LT(size2, size * (1 + error_margin));
+ ASSERT_GT(size2, size * (1 - error_margin));
+}
+
+TEST_F(DBTest, GetApproximateMemTableStats) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ const int N = 128;
+ Random rnd(301);
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
+ }
+
+ uint64_t count;
+ uint64_t size;
+
+ std::string start = Key(50);
+ std::string end = Key(60);
+ Range r(start, end);
+ db_->GetApproximateMemTableStats(r, &count, &size);
+ ASSERT_GT(count, 0);
+ ASSERT_LE(count, N);
+ ASSERT_GT(size, 6000);
+ ASSERT_LT(size, 204800);
+
+ start = Key(500);
+ end = Key(600);
+ r = Range(start, end);
+ db_->GetApproximateMemTableStats(r, &count, &size);
+ ASSERT_EQ(count, 0);
+ ASSERT_EQ(size, 0);
+
+ Flush();
+
+ start = Key(50);
+ end = Key(60);
+ r = Range(start, end);
+ db_->GetApproximateMemTableStats(r, &count, &size);
+ ASSERT_EQ(count, 0);
+ ASSERT_EQ(size, 0);
+
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(1000 + i), RandomString(&rnd, 1024)));
+ }
+
+ start = Key(100);
+ end = Key(1020);
+ r = Range(start, end);
+ db_->GetApproximateMemTableStats(r, &count, &size);
+ ASSERT_GT(count, 20);
+ ASSERT_GT(size, 6000);
+}
+
+TEST_F(DBTest, ApproximateSizes) {
+ do {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000; // Large write buffer
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0));
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0));
+
+ // Write 8MB (80 values, each 100K)
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ const int N = 80;
+ static const int S1 = 100000;
+ static const int S2 = 105000; // Allow some expansion from metadata
+ Random rnd(301);
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(1, Key(i), RandomString(&rnd, S1)));
+ }
+
+ // 0 because GetApproximateSizes() does not account for memtable space
+ ASSERT_TRUE(Between(Size("", Key(50), 1), 0, 0));
+
+ // Check sizes across recovery by reopening a few times
+ for (int run = 0; run < 3; run++) {
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ for (int compact_start = 0; compact_start < N; compact_start += 10) {
+ for (int i = 0; i < N; i += 10) {
+ ASSERT_TRUE(Between(Size("", Key(i), 1), S1 * i, S2 * i));
+ ASSERT_TRUE(Between(Size("", Key(i) + ".suffix", 1), S1 * (i + 1),
+ S2 * (i + 1)));
+ ASSERT_TRUE(Between(Size(Key(i), Key(i + 10), 1), S1 * 10, S2 * 10));
+ }
+ ASSERT_TRUE(Between(Size("", Key(50), 1), S1 * 50, S2 * 50));
+ ASSERT_TRUE(
+ Between(Size("", Key(50) + ".suffix", 1), S1 * 50, S2 * 50));
+
+ std::string cstart_str = Key(compact_start);
+ std::string cend_str = Key(compact_start + 9);
+ Slice cstart = cstart_str;
+ Slice cend = cend_str;
+ dbfull()->TEST_CompactRange(0, &cstart, &cend, handles_[1]);
+ }
+
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
+ }
+ // ApproximateOffsetOf() is not yet implemented in plain table format.
+ } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction |
+ kSkipPlainTable | kSkipHashIndex));
+}
+
+TEST_F(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
+ do {
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+ std::string big1 = RandomString(&rnd, 100000);
+ ASSERT_OK(Put(1, Key(0), RandomString(&rnd, 10000)));
+ ASSERT_OK(Put(1, Key(1), RandomString(&rnd, 10000)));
+ ASSERT_OK(Put(1, Key(2), big1));
+ ASSERT_OK(Put(1, Key(3), RandomString(&rnd, 10000)));
+ ASSERT_OK(Put(1, Key(4), big1));
+ ASSERT_OK(Put(1, Key(5), RandomString(&rnd, 10000)));
+ ASSERT_OK(Put(1, Key(6), RandomString(&rnd, 300000)));
+ ASSERT_OK(Put(1, Key(7), RandomString(&rnd, 10000)));
+
+ // Check sizes across recovery by reopening a few times
+ for (int run = 0; run < 3; run++) {
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ ASSERT_TRUE(Between(Size("", Key(0), 1), 0, 0));
+ ASSERT_TRUE(Between(Size("", Key(1), 1), 10000, 11000));
+ ASSERT_TRUE(Between(Size("", Key(2), 1), 20000, 21000));
+ ASSERT_TRUE(Between(Size("", Key(3), 1), 120000, 121000));
+ ASSERT_TRUE(Between(Size("", Key(4), 1), 130000, 131000));
+ ASSERT_TRUE(Between(Size("", Key(5), 1), 230000, 231000));
+ ASSERT_TRUE(Between(Size("", Key(6), 1), 240000, 241000));
+ ASSERT_TRUE(Between(Size("", Key(7), 1), 540000, 541000));
+ ASSERT_TRUE(Between(Size("", Key(8), 1), 550000, 560000));
+
+ ASSERT_TRUE(Between(Size(Key(3), Key(5), 1), 110000, 111000));
+
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+ }
+ // ApproximateOffsetOf() is not yet implemented in plain table format.
+ } while (ChangeOptions(kSkipPlainTable));
+}
+#endif // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, Snapshot) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
+ Put(0, "foo", "0v1");
+ Put(1, "foo", "1v1");
+
+ const Snapshot* s1 = db_->GetSnapshot();
+ ASSERT_EQ(1U, GetNumSnapshots());
+ uint64_t time_snap1 = GetTimeOldestSnapshots();
+ ASSERT_GT(time_snap1, 0U);
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+ Put(0, "foo", "0v2");
+ Put(1, "foo", "1v2");
+
+ env_->addon_time_.fetch_add(1);
+
+ const Snapshot* s2 = db_->GetSnapshot();
+ ASSERT_EQ(2U, GetNumSnapshots());
+ ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+ Put(0, "foo", "0v3");
+ Put(1, "foo", "1v3");
+
+ {
+ ManagedSnapshot s3(db_);
+ ASSERT_EQ(3U, GetNumSnapshots());
+ ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+
+ Put(0, "foo", "0v4");
+ Put(1, "foo", "1v4");
+ ASSERT_EQ("0v1", Get(0, "foo", s1));
+ ASSERT_EQ("1v1", Get(1, "foo", s1));
+ ASSERT_EQ("0v2", Get(0, "foo", s2));
+ ASSERT_EQ("1v2", Get(1, "foo", s2));
+ ASSERT_EQ("0v3", Get(0, "foo", s3.snapshot()));
+ ASSERT_EQ("1v3", Get(1, "foo", s3.snapshot()));
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+ }
+
+ ASSERT_EQ(2U, GetNumSnapshots());
+ ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+ ASSERT_EQ("0v1", Get(0, "foo", s1));
+ ASSERT_EQ("1v1", Get(1, "foo", s1));
+ ASSERT_EQ("0v2", Get(0, "foo", s2));
+ ASSERT_EQ("1v2", Get(1, "foo", s2));
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+
+ db_->ReleaseSnapshot(s1);
+ ASSERT_EQ("0v2", Get(0, "foo", s2));
+ ASSERT_EQ("1v2", Get(1, "foo", s2));
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+ ASSERT_EQ(1U, GetNumSnapshots());
+ ASSERT_LT(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s2->GetSequenceNumber());
+
+ db_->ReleaseSnapshot(s2);
+ ASSERT_EQ(0U, GetNumSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), 0);
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+ } while (ChangeOptions());
+}
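+
+// Illustrative sketch (not part of the upstream tests): point-in-time reads
+// with snapshots, as exercised above. A snapshot pins the sequence number at
+// GetSnapshot() time; ManagedSnapshot is the RAII alternative that releases
+// it automatically. Guarded by #if 0.
+#if 0
+static void SnapshotSketch(DB* db) {
+  const Snapshot* snap = db->GetSnapshot();
+  db->Put(WriteOptions(), "foo", "v2");  // not visible through `snap`
+
+  ReadOptions ro;
+  ro.snapshot = snap;
+  std::string value;
+  db->Get(ro, "foo", &value);  // sees the pre-snapshot value, if any
+
+  db->ReleaseSnapshot(snap);  // every GetSnapshot() needs a ReleaseSnapshot()
+
+  {
+    ManagedSnapshot managed(db);
+    ro.snapshot = managed.snapshot();
+    db->Get(ro, "foo", &value);
+  }  // snapshot released here
+}
+#endif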
+
+TEST_F(DBTest, HiddenValuesAreRemoved) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ do {
+ Options options = CurrentOptions(options_override);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
+ FillLevels("a", "z", 1);
+
+ std::string big = RandomString(&rnd, 50000);
+ Put(1, "foo", big);
+ Put(1, "pastfoo", "v");
+ const Snapshot* snapshot = db_->GetSnapshot();
+ Put(1, "foo", "tiny");
+ Put(1, "pastfoo2", "v2"); // Advance sequence number one more
+
+ ASSERT_OK(Flush(1));
+ ASSERT_GT(NumTableFilesAtLevel(0, 1), 0);
+
+ ASSERT_EQ(big, Get(1, "foo", snapshot));
+ ASSERT_TRUE(Between(Size("", "pastfoo", 1), 50000, 60000));
+ db_->ReleaseSnapshot(snapshot);
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny, " + big + " ]");
+ Slice x("x");
+ dbfull()->TEST_CompactRange(0, nullptr, &x, handles_[1]);
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]");
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_GE(NumTableFilesAtLevel(1, 1), 1);
+ dbfull()->TEST_CompactRange(1, nullptr, &x, handles_[1]);
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]");
+
+ ASSERT_TRUE(Between(Size("", "pastfoo", 1), 0, 1000));
+ // ApproximateOffsetOf() is not yet implemented in plain table format,
+ // which is used by Size().
+ } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction |
+ kSkipPlainTable));
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, UnremovableSingleDelete) {
+ // If we compact:
+ //
+ // Put(A, v1) Snapshot SingleDelete(A) Put(A, v2)
+ //
+ // We do not want to end up with:
+ //
+ // Put(A, v1) Snapshot Put(A, v2)
+ //
+ // Because a subsequent SingleDelete(A) would delete the Put(A, v2)
+ // but not Put(A, v1), so Get(A) would return v1.
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ do {
+ Options options = CurrentOptions(options_override);
+ options.disable_auto_compactions = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Put(1, "foo", "first");
+ const Snapshot* snapshot = db_->GetSnapshot();
+ SingleDelete(1, "foo");
+ Put(1, "foo", "second");
+ ASSERT_OK(Flush(1));
+
+ ASSERT_EQ("first", Get(1, "foo", snapshot));
+ ASSERT_EQ("second", Get(1, "foo"));
+
+ dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr);
+ ASSERT_EQ("[ second, SDEL, first ]", AllEntriesFor("foo", 1));
+
+ SingleDelete(1, "foo");
+
+ ASSERT_EQ("first", Get(1, "foo", snapshot));
+ ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+
+ dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr);
+
+ ASSERT_EQ("first", Get(1, "foo", snapshot));
+ ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+ db_->ReleaseSnapshot(snapshot);
+ // Skip FIFO and universal compaction because they do not apply to the test
+ // case. Skip MergePut because single delete does not get removed when it
+ // encounters a merge.
+ } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+ kSkipMergePut));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, DeletionMarkers1) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Put(1, "foo", "v1");
+ ASSERT_OK(Flush(1));
+ const int last = 2;
+ MoveFilesToLevel(last, 1);
+ // foo => v1 is now in last level
+ ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+
+ // Place a table at level last-1 to prevent merging with preceding mutation
+ Put(1, "a", "begin");
+ Put(1, "z", "end");
+ Flush(1);
+ MoveFilesToLevel(last - 1, 1);
+ ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+ ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1);
+
+ Delete(1, "foo");
+ Put(1, "foo", "v2");
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]");
+ ASSERT_OK(Flush(1)); // Moves to level last-2
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
+ Slice z("z");
+ dbfull()->TEST_CompactRange(last - 2, nullptr, &z, handles_[1]);
+ // DEL eliminated, but v1 remains because we aren't compacting that level
+ // (DEL can be eliminated because v2 hides v1).
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
+ dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]);
+ // Merging last-1 with last, so we are at the base level for "foo", so
+ // DEL is removed (as is v1).
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]");
+}
+
+TEST_F(DBTest, DeletionMarkers2) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Put(1, "foo", "v1");
+ ASSERT_OK(Flush(1));
+ const int last = 2;
+ MoveFilesToLevel(last, 1);
+ // foo => v1 is now in last level
+ ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+
+ // Place a table at level last-1 to prevent merging with preceding mutation
+ Put(1, "a", "begin");
+ Put(1, "z", "end");
+ Flush(1);
+ MoveFilesToLevel(last - 1, 1);
+ ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+ ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1);
+
+ Delete(1, "foo");
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
+ ASSERT_OK(Flush(1)); // Moves to level last-2
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
+ dbfull()->TEST_CompactRange(last - 2, nullptr, nullptr, handles_[1]);
+ // DEL kept: "last" file overlaps
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
+ dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]);
+ // Merging last-1 with last, so we are at the base level for "foo", so
+ // DEL is removed (as is v1).
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+}
+
+TEST_F(DBTest, OverlapInLevel0) {
+ do {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Fill levels 1 and 2 to disable the pushing of new memtables to levels >
+ // 0.
+ ASSERT_OK(Put(1, "100", "v100"));
+ ASSERT_OK(Put(1, "999", "v999"));
+ Flush(1);
+ MoveFilesToLevel(2, 1);
+ ASSERT_OK(Delete(1, "100"));
+ ASSERT_OK(Delete(1, "999"));
+ Flush(1);
+ MoveFilesToLevel(1, 1);
+ ASSERT_EQ("0,1,1", FilesPerLevel(1));
+
+ // Make files spanning the following ranges in level-0:
+ // files[0] 200 .. 900
+ // files[1] 300 .. 500
+ // Note that files are sorted by smallest key.
+ ASSERT_OK(Put(1, "300", "v300"));
+ ASSERT_OK(Put(1, "500", "v500"));
+ Flush(1);
+ ASSERT_OK(Put(1, "200", "v200"));
+ ASSERT_OK(Put(1, "600", "v600"));
+ ASSERT_OK(Put(1, "900", "v900"));
+ Flush(1);
+ ASSERT_EQ("2,1,1", FilesPerLevel(1));
+
+ // Compact away the placeholder files we created initially
+ dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+ dbfull()->TEST_CompactRange(2, nullptr, nullptr, handles_[1]);
+ ASSERT_EQ("2", FilesPerLevel(1));
+
+    // Do a memtable flush. Before the bug fix, the resulting compaction would
+    // not detect the overlap with level-0 files and would incorrectly place
+    // the deletion in a deeper level.
+ ASSERT_OK(Delete(1, "600"));
+ Flush(1);
+ ASSERT_EQ("3", FilesPerLevel(1));
+ ASSERT_EQ("NOT_FOUND", Get(1, "600"));
+ } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction));
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, ComparatorCheck) {
+ class NewComparator : public Comparator {
+ public:
+ const char* Name() const override { return "rocksdb.NewComparator"; }
+ int Compare(const Slice& a, const Slice& b) const override {
+ return BytewiseComparator()->Compare(a, b);
+ }
+ void FindShortestSeparator(std::string* s, const Slice& l) const override {
+ BytewiseComparator()->FindShortestSeparator(s, l);
+ }
+ void FindShortSuccessor(std::string* key) const override {
+ BytewiseComparator()->FindShortSuccessor(key);
+ }
+ };
+ Options new_options, options;
+ NewComparator cmp;
+ do {
+ options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ new_options = CurrentOptions();
+ new_options.comparator = &cmp;
+ // only the non-default column family has non-matching comparator
+ Status s = TryReopenWithColumnFamilies(
+ {"default", "pikachu"}, std::vector<Options>({options, new_options}));
+ ASSERT_TRUE(!s.ok());
+ ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos)
+ << s.ToString();
+ } while (ChangeCompactOptions());
+}
+
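+// CustomComparator orders keys of the form "[<number>]" numerically, so the
+// decimal and hex spellings of the same value (e.g. "[20]" and "[0x14]")
+// compare equal and act as a single key.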
+TEST_F(DBTest, CustomComparator) {
+ class NumberComparator : public Comparator {
+ public:
+ const char* Name() const override { return "test.NumberComparator"; }
+ int Compare(const Slice& a, const Slice& b) const override {
+ return ToNumber(a) - ToNumber(b);
+ }
+ void FindShortestSeparator(std::string* s, const Slice& l) const override {
+ ToNumber(*s); // Check format
+ ToNumber(l); // Check format
+ }
+ void FindShortSuccessor(std::string* key) const override {
+ ToNumber(*key); // Check format
+ }
+
+ private:
+ static int ToNumber(const Slice& x) {
+ // Check that there are no extra characters.
+ EXPECT_TRUE(x.size() >= 2 && x[0] == '[' && x[x.size() - 1] == ']')
+ << EscapeString(x);
+ int val;
+ char ignored;
+ EXPECT_TRUE(sscanf(x.ToString().c_str(), "[%i]%c", &val, &ignored) == 1)
+ << EscapeString(x);
+ return val;
+ }
+ };
+ Options new_options;
+ NumberComparator cmp;
+ do {
+ new_options = CurrentOptions();
+ new_options.create_if_missing = true;
+ new_options.comparator = &cmp;
+ new_options.write_buffer_size = 4096; // Compact more often
+ new_options.arena_block_size = 4096;
+ new_options = CurrentOptions(new_options);
+ DestroyAndReopen(new_options);
+ CreateAndReopenWithCF({"pikachu"}, new_options);
+ ASSERT_OK(Put(1, "[10]", "ten"));
+ ASSERT_OK(Put(1, "[0x14]", "twenty"));
+ for (int i = 0; i < 2; i++) {
+ ASSERT_EQ("ten", Get(1, "[10]"));
+ ASSERT_EQ("ten", Get(1, "[0xa]"));
+ ASSERT_EQ("twenty", Get(1, "[20]"));
+ ASSERT_EQ("twenty", Get(1, "[0x14]"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "[15]"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "[0xf]"));
+ Compact(1, "[0]", "[9999]");
+ }
+
+ for (int run = 0; run < 2; run++) {
+ for (int i = 0; i < 1000; i++) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "[%d]", i * 10);
+ ASSERT_OK(Put(1, buf, buf));
+ }
+ Compact(1, "[0]", "[1000000]");
+ }
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTest, DBOpen_Options) {
+ Options options = CurrentOptions();
+ std::string dbname = test::PerThreadDBPath("db_options_test");
+ ASSERT_OK(DestroyDB(dbname, options));
+
+ // Does not exist, and create_if_missing == false: error
+ DB* db = nullptr;
+ options.create_if_missing = false;
+ Status s = DB::Open(options, dbname, &db);
+ ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr);
+ ASSERT_TRUE(db == nullptr);
+
+ // Does not exist, and create_if_missing == true: OK
+ options.create_if_missing = true;
+ s = DB::Open(options, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+
+ delete db;
+ db = nullptr;
+
+ // Does exist, and error_if_exists == true: error
+ options.create_if_missing = false;
+ options.error_if_exists = true;
+ s = DB::Open(options, dbname, &db);
+ ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr);
+ ASSERT_TRUE(db == nullptr);
+
+ // Does exist, and error_if_exists == false: OK
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ s = DB::Open(options, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+
+ delete db;
+ db = nullptr;
+}
+
+TEST_F(DBTest, DBOpen_Change_NumLevels) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ ASSERT_TRUE(db_ != nullptr);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "a", "123"));
+ ASSERT_OK(Put(1, "b", "234"));
+ Flush(1);
+ MoveFilesToLevel(3, 1);
+ Close();
+
+ options.create_if_missing = false;
+ options.num_levels = 2;
+ Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr);
+ ASSERT_TRUE(db_ == nullptr);
+}
+
+TEST_F(DBTest, DestroyDBMetaDatabase) {
+ std::string dbname = test::PerThreadDBPath("db_meta");
+ ASSERT_OK(env_->CreateDirIfMissing(dbname));
+ std::string metadbname = MetaDatabaseName(dbname, 0);
+ ASSERT_OK(env_->CreateDirIfMissing(metadbname));
+ std::string metametadbname = MetaDatabaseName(metadbname, 0);
+ ASSERT_OK(env_->CreateDirIfMissing(metametadbname));
+
+ // Destroy previous versions if they exist. Using the long way.
+ Options options = CurrentOptions();
+ ASSERT_OK(DestroyDB(metametadbname, options));
+ ASSERT_OK(DestroyDB(metadbname, options));
+ ASSERT_OK(DestroyDB(dbname, options));
+
+ // Setup databases
+ DB* db = nullptr;
+ ASSERT_OK(DB::Open(options, dbname, &db));
+ delete db;
+ db = nullptr;
+ ASSERT_OK(DB::Open(options, metadbname, &db));
+ delete db;
+ db = nullptr;
+ ASSERT_OK(DB::Open(options, metametadbname, &db));
+ delete db;
+ db = nullptr;
+
+ // Delete databases
+ ASSERT_OK(DestroyDB(dbname, options));
+
+ // Check if deletion worked.
+ options.create_if_missing = false;
+ ASSERT_TRUE(!(DB::Open(options, dbname, &db)).ok());
+ ASSERT_TRUE(!(DB::Open(options, metadbname, &db)).ok());
+ ASSERT_TRUE(!(DB::Open(options, metametadbname, &db)).ok());
+}
+
+#ifndef ROCKSDB_LITE
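+// SnapshotFiles simulates a backup: with file deletions disabled, it copies
+// the files reported by GetLiveFiles() into a separate directory (copying
+// only the first manifest_size bytes of the MANIFEST) and then opens that
+// copy as an independent DB to verify its contents.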
+TEST_F(DBTest, SnapshotFiles) {
+ do {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000; // Large write buffer
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+
+ // Write 8MB (80 values, each 100K)
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ std::vector<std::string> values;
+ for (int i = 0; i < 80; i++) {
+ values.push_back(RandomString(&rnd, 100000));
+ ASSERT_OK(Put((i < 40), Key(i), values[i]));
+ }
+
+ // assert that nothing makes it to disk yet.
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+
+ // get a file snapshot
+ uint64_t manifest_number = 0;
+ uint64_t manifest_size = 0;
+ std::vector<std::string> files;
+ dbfull()->DisableFileDeletions();
+ dbfull()->GetLiveFiles(files, &manifest_size);
+
+ // CURRENT, MANIFEST, OPTIONS, *.sst files (one for each CF)
+ ASSERT_EQ(files.size(), 5U);
+
+ uint64_t number = 0;
+ FileType type;
+
+ // copy these files to a new snapshot directory
+ std::string snapdir = dbname_ + ".snapdir/";
+ ASSERT_OK(env_->CreateDirIfMissing(snapdir));
+
+ for (size_t i = 0; i < files.size(); i++) {
+ // our clients require that GetLiveFiles returns
+ // files with "/" as first character!
+ ASSERT_EQ(files[i][0], '/');
+ std::string src = dbname_ + files[i];
+ std::string dest = snapdir + files[i];
+
+ uint64_t size;
+ ASSERT_OK(env_->GetFileSize(src, &size));
+
+ // record the number and the size of the
+ // latest manifest file
+ if (ParseFileName(files[i].substr(1), &number, &type)) {
+ if (type == kDescriptorFile) {
+ if (number > manifest_number) {
+ manifest_number = number;
+ ASSERT_GE(size, manifest_size);
+ size = manifest_size; // copy only valid MANIFEST data
+ }
+ }
+ }
+ CopyFile(src, dest, size);
+ }
+
+ // release file snapshot
+ dbfull()->DisableFileDeletions();
+    // Overwrite one key; this key should not appear in the snapshot.
+ std::vector<std::string> extras;
+ for (unsigned int i = 0; i < 1; i++) {
+ extras.push_back(RandomString(&rnd, 100000));
+ ASSERT_OK(Put(0, Key(i), extras[i]));
+ }
+
+ // verify that data in the snapshot are correct
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.emplace_back("default", ColumnFamilyOptions());
+ column_families.emplace_back("pikachu", ColumnFamilyOptions());
+ std::vector<ColumnFamilyHandle*> cf_handles;
+ DB* snapdb;
+ DBOptions opts;
+ opts.env = env_;
+ opts.create_if_missing = false;
+ Status stat =
+ DB::Open(opts, snapdir, column_families, &cf_handles, &snapdb);
+ ASSERT_OK(stat);
+
+ ReadOptions roptions;
+ std::string val;
+ for (unsigned int i = 0; i < 80; i++) {
+ stat = snapdb->Get(roptions, cf_handles[i < 40], Key(i), &val);
+ ASSERT_EQ(values[i].compare(val), 0);
+ }
+ for (auto cfh : cf_handles) {
+ delete cfh;
+ }
+ delete snapdb;
+
+ // look at the new live files after we added an 'extra' key
+ // and after we took the first snapshot.
+ uint64_t new_manifest_number = 0;
+ uint64_t new_manifest_size = 0;
+ std::vector<std::string> newfiles;
+ dbfull()->DisableFileDeletions();
+ dbfull()->GetLiveFiles(newfiles, &new_manifest_size);
+
+    // Find the new manifest file. Assert that this manifest file is
+    // the same one as in the previous snapshot. But its size should be
+    // larger because we added an extra key after taking the
+    // previous snapshot.
+ for (size_t i = 0; i < newfiles.size(); i++) {
+ std::string src = dbname_ + "/" + newfiles[i];
+      // record the number and the size of the
+      // latest manifest file
+ if (ParseFileName(newfiles[i].substr(1), &number, &type)) {
+ if (type == kDescriptorFile) {
+ if (number > new_manifest_number) {
+ uint64_t size;
+ new_manifest_number = number;
+ ASSERT_OK(env_->GetFileSize(src, &size));
+ ASSERT_GE(size, new_manifest_size);
+ }
+ }
+ }
+ }
+ ASSERT_EQ(manifest_number, new_manifest_number);
+ ASSERT_GT(new_manifest_size, manifest_size);
+
+ // release file snapshot
+ dbfull()->DisableFileDeletions();
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTest, ReadonlyDBGetLiveManifestSize) {
+ do {
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 2;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+
+ uint64_t manifest_size = 0;
+ std::vector<std::string> files;
+ dbfull()->GetLiveFiles(files, &manifest_size);
+
+ for (const std::string& f : files) {
+ uint64_t number = 0;
+ FileType type;
+ if (ParseFileName(f.substr(1), &number, &type)) {
+ if (type == kDescriptorFile) {
+ uint64_t size_on_disk;
+ env_->GetFileSize(dbname_ + "/" + f, &size_on_disk);
+ ASSERT_EQ(manifest_size, size_on_disk);
+ break;
+ }
+ }
+ }
+ Close();
+ } while (ChangeCompactOptions());
+}
+#endif
+
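+// PurgeInfoLogs verifies that at most keep_log_file_num info log (LOG*) files
+// are retained across reopens, both with the default log location and with a
+// separate db_log_dir.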
+TEST_F(DBTest, PurgeInfoLogs) {
+ Options options = CurrentOptions();
+ options.keep_log_file_num = 5;
+ options.create_if_missing = true;
+ for (int mode = 0; mode <= 1; mode++) {
+ if (mode == 1) {
+ options.db_log_dir = dbname_ + "_logs";
+ env_->CreateDirIfMissing(options.db_log_dir);
+ } else {
+ options.db_log_dir = "";
+ }
+ for (int i = 0; i < 8; i++) {
+ Reopen(options);
+ }
+
+ std::vector<std::string> files;
+ env_->GetChildren(options.db_log_dir.empty() ? dbname_ : options.db_log_dir,
+ &files);
+ int info_log_count = 0;
+ for (std::string file : files) {
+ if (file.find("LOG") != std::string::npos) {
+ info_log_count++;
+ }
+ }
+ ASSERT_EQ(5, info_log_count);
+
+ Destroy(options);
+    // For mode 0, DestroyDB() should have deleted all the info logs under the
+    // DB dir. For mode 1, no info log file should have been put under the DB
+    // dir in the first place.
+ std::vector<std::string> db_files;
+ env_->GetChildren(dbname_, &db_files);
+ for (std::string file : db_files) {
+ ASSERT_TRUE(file.find("LOG") == std::string::npos);
+ }
+
+ if (mode == 1) {
+ // Cleaning up
+ env_->GetChildren(options.db_log_dir, &files);
+ for (std::string file : files) {
+ env_->DeleteFile(options.db_log_dir + "/" + file);
+ }
+ env_->DeleteDir(options.db_log_dir);
+ }
+ }
+}
+
+#ifndef ROCKSDB_LITE
+// Multi-threaded test:
+namespace {
+
+static const int kColumnFamilies = 10;
+static const int kNumThreads = 10;
+static const int kTestSeconds = 10;
+static const int kNumKeys = 1000;
+
+struct MTState {
+ DBTest* test;
+ std::atomic<bool> stop;
+ std::atomic<int> counter[kNumThreads];
+ std::atomic<bool> thread_done[kNumThreads];
+};
+
+struct MTThread {
+ MTState* state;
+ int id;
+ bool multiget_batched;
+};
+
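+// Each thread repeatedly either writes a value of the form
+// <key.id.counter.cf.unique_id> to every column family in one batch, or reads
+// the key back from all column families and checks that the embedded
+// unique_id is identical across them (i.e. the multi-CF write was atomic).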
+static void MTThreadBody(void* arg) {
+ MTThread* t = reinterpret_cast<MTThread*>(arg);
+ int id = t->id;
+ DB* db = t->state->test->db_;
+ int counter = 0;
+ fprintf(stderr, "... starting thread %d\n", id);
+ Random rnd(1000 + id);
+ char valbuf[1500];
+ while (t->state->stop.load(std::memory_order_acquire) == false) {
+ t->state->counter[id].store(counter, std::memory_order_release);
+
+ int key = rnd.Uniform(kNumKeys);
+ char keybuf[20];
+ snprintf(keybuf, sizeof(keybuf), "%016d", key);
+
+ if (rnd.OneIn(2)) {
+      // Write values of the form <key, my id, counter, cf, unique_id>
+      // into each of the CFs.
+      // We add some padding to force compactions.
+ int unique_id = rnd.Uniform(1000000);
+
+ // Half of the time directly use WriteBatch. Half of the time use
+ // WriteBatchWithIndex.
+ if (rnd.OneIn(2)) {
+ WriteBatch batch;
+ for (int cf = 0; cf < kColumnFamilies; ++cf) {
+ snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id,
+ static_cast<int>(counter), cf, unique_id);
+ batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf));
+ }
+ ASSERT_OK(db->Write(WriteOptions(), &batch));
+ } else {
+ WriteBatchWithIndex batch(db->GetOptions().comparator);
+ for (int cf = 0; cf < kColumnFamilies; ++cf) {
+ snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id,
+ static_cast<int>(counter), cf, unique_id);
+ batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf));
+ }
+ ASSERT_OK(db->Write(WriteOptions(), batch.GetWriteBatch()));
+ }
+ } else {
+ // Read a value and verify that it matches the pattern written above
+ // and that writes to all column families were atomic (unique_id is the
+ // same)
+ std::vector<Slice> keys(kColumnFamilies, Slice(keybuf));
+ std::vector<std::string> values;
+ std::vector<Status> statuses;
+ if (!t->multiget_batched) {
+ statuses = db->MultiGet(ReadOptions(), t->state->test->handles_, keys,
+ &values);
+ } else {
+ std::vector<PinnableSlice> pin_values(keys.size());
+ statuses.resize(keys.size());
+ const Snapshot* snapshot = db->GetSnapshot();
+ ReadOptions ro;
+ ro.snapshot = snapshot;
+ for (int cf = 0; cf < kColumnFamilies; ++cf) {
+ db->MultiGet(ro, t->state->test->handles_[cf], 1, &keys[cf],
+ &pin_values[cf], &statuses[cf]);
+ }
+ db->ReleaseSnapshot(snapshot);
+ values.resize(keys.size());
+ for (int cf = 0; cf < kColumnFamilies; ++cf) {
+ if (statuses[cf].ok()) {
+ values[cf].assign(pin_values[cf].data(), pin_values[cf].size());
+ }
+ }
+ }
+ Status s = statuses[0];
+ // all statuses have to be the same
+ for (size_t i = 1; i < statuses.size(); ++i) {
+ // they are either both ok or both not-found
+ ASSERT_TRUE((s.ok() && statuses[i].ok()) ||
+ (s.IsNotFound() && statuses[i].IsNotFound()));
+ }
+ if (s.IsNotFound()) {
+ // Key has not yet been written
+ } else {
+ // Check that the writer thread counter is >= the counter in the value
+ ASSERT_OK(s);
+ int unique_id = -1;
+ for (int i = 0; i < kColumnFamilies; ++i) {
+ int k, w, c, cf, u;
+ ASSERT_EQ(5, sscanf(values[i].c_str(), "%d.%d.%d.%d.%d", &k, &w, &c,
+ &cf, &u))
+ << values[i];
+ ASSERT_EQ(k, key);
+ ASSERT_GE(w, 0);
+ ASSERT_LT(w, kNumThreads);
+ ASSERT_LE(c, t->state->counter[w].load(std::memory_order_acquire));
+ ASSERT_EQ(cf, i);
+ if (i == 0) {
+ unique_id = u;
+ } else {
+ // this checks that updates across column families happened
+ // atomically -- all unique ids are the same
+ ASSERT_EQ(u, unique_id);
+ }
+ }
+ }
+ }
+ counter++;
+ }
+ t->state->thread_done[id].store(true, std::memory_order_release);
+ fprintf(stderr, "... stopping thread %d after %d ops\n", id, int(counter));
+}
+
+} // namespace
+
+class MultiThreadedDBTest
+ : public DBTest,
+ public ::testing::WithParamInterface<std::tuple<int, bool>> {
+ public:
+ void SetUp() override {
+ std::tie(option_config_, multiget_batched_) = GetParam();
+ }
+
+ static std::vector<int> GenerateOptionConfigs() {
+ std::vector<int> optionConfigs;
+ for (int optionConfig = kDefault; optionConfig < kEnd; ++optionConfig) {
+ optionConfigs.push_back(optionConfig);
+ }
+ return optionConfigs;
+ }
+
+ bool multiget_batched_;
+};
+
+TEST_P(MultiThreadedDBTest, MultiThreaded) {
+ if (option_config_ == kPipelinedWrite) return;
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ Options options = CurrentOptions(options_override);
+ std::vector<std::string> cfs;
+ for (int i = 1; i < kColumnFamilies; ++i) {
+ cfs.push_back(ToString(i));
+ }
+ Reopen(options);
+ CreateAndReopenWithCF(cfs, options);
+ // Initialize state
+ MTState mt;
+ mt.test = this;
+ mt.stop.store(false, std::memory_order_release);
+ for (int id = 0; id < kNumThreads; id++) {
+ mt.counter[id].store(0, std::memory_order_release);
+ mt.thread_done[id].store(false, std::memory_order_release);
+ }
+
+ // Start threads
+ MTThread thread[kNumThreads];
+ for (int id = 0; id < kNumThreads; id++) {
+ thread[id].state = &mt;
+ thread[id].id = id;
+ thread[id].multiget_batched = multiget_batched_;
+ env_->StartThread(MTThreadBody, &thread[id]);
+ }
+
+ // Let them run for a while
+ env_->SleepForMicroseconds(kTestSeconds * 1000000);
+
+ // Stop the threads and wait for them to finish
+ mt.stop.store(true, std::memory_order_release);
+ for (int id = 0; id < kNumThreads; id++) {
+ while (mt.thread_done[id].load(std::memory_order_acquire) == false) {
+ env_->SleepForMicroseconds(100000);
+ }
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(
+ MultiThreaded, MultiThreadedDBTest,
+ ::testing::Combine(
+ ::testing::ValuesIn(MultiThreadedDBTest::GenerateOptionConfigs()),
+ ::testing::Bool()));
+#endif // ROCKSDB_LITE
+
+// Group commit test:
+#if !defined(TRAVIS) && !defined(OS_WIN)
+// Disable this test temporarily on Travis and appveyor as it fails
+// intermittently. Github issue: #4151
+namespace {
+
+static const int kGCNumThreads = 4;
+static const int kGCNumKeys = 1000;
+
+struct GCThread {
+ DB* db;
+ int id;
+ std::atomic<bool> done;
+};
+
+static void GCThreadBody(void* arg) {
+ GCThread* t = reinterpret_cast<GCThread*>(arg);
+ int id = t->id;
+ DB* db = t->db;
+ WriteOptions wo;
+
+ for (int i = 0; i < kGCNumKeys; ++i) {
+ std::string kv(ToString(i + id * kGCNumKeys));
+ ASSERT_OK(db->Put(wo, kv, kv));
+ }
+ t->done = true;
+}
+
+} // namespace
+
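+// GroupCommitTest starts several writer threads and uses sync points to
+// encourage them to queue behind a single batch-group leader; a non-zero
+// WRITE_DONE_BY_OTHER ticker confirms that some writes were committed as part
+// of another thread's write group.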
+TEST_F(DBTest, GroupCommitTest) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ Reopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"WriteThread::JoinBatchGroup:BeganWaiting",
+ "DBImpl::WriteImpl:BeforeLeaderEnters"},
+ {"WriteThread::AwaitState:BlockingWaiting",
+ "WriteThread::EnterAsBatchGroupLeader:End"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Start threads
+ GCThread thread[kGCNumThreads];
+ for (int id = 0; id < kGCNumThreads; id++) {
+ thread[id].id = id;
+ thread[id].db = db_;
+ thread[id].done = false;
+ env_->StartThread(GCThreadBody, &thread[id]);
+ }
+ env_->WaitForJoin();
+
+ ASSERT_GT(TestGetTickerCount(options, WRITE_DONE_BY_OTHER), 0);
+
+ std::vector<std::string> expected_db;
+ for (int i = 0; i < kGCNumThreads * kGCNumKeys; ++i) {
+ expected_db.push_back(ToString(i));
+ }
+ std::sort(expected_db.begin(), expected_db.end());
+
+ Iterator* itr = db_->NewIterator(ReadOptions());
+ itr->SeekToFirst();
+ for (auto x : expected_db) {
+ ASSERT_TRUE(itr->Valid());
+ ASSERT_EQ(itr->key().ToString(), x);
+ ASSERT_EQ(itr->value().ToString(), x);
+ itr->Next();
+ }
+ ASSERT_TRUE(!itr->Valid());
+ delete itr;
+
+ HistogramData hist_data;
+ options.statistics->histogramData(DB_WRITE, &hist_data);
+ ASSERT_GT(hist_data.average, 0.0);
+ } while (ChangeOptions(kSkipNoSeekToLast));
+}
+#endif  // !defined(TRAVIS) && !defined(OS_WIN)
+
+namespace {
+typedef std::map<std::string, std::string> KVMap;
+}
+
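+// ModelDB is a minimal in-memory reference implementation of the DB interface
+// backed by a std::map. The randomized test below applies the same operations
+// to a ModelDB and to the real DB and compares their iterators.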
+class ModelDB : public DB {
+ public:
+ class ModelSnapshot : public Snapshot {
+ public:
+ KVMap map_;
+
+ SequenceNumber GetSequenceNumber() const override {
+ // no need to call this
+ assert(false);
+ return 0;
+ }
+ };
+
+ explicit ModelDB(const Options& options) : options_(options) {}
+ using DB::Put;
+ Status Put(const WriteOptions& o, ColumnFamilyHandle* cf, const Slice& k,
+ const Slice& v) override {
+ WriteBatch batch;
+ batch.Put(cf, k, v);
+ return Write(o, &batch);
+ }
+ using DB::Close;
+ Status Close() override { return Status::OK(); }
+ using DB::Delete;
+ Status Delete(const WriteOptions& o, ColumnFamilyHandle* cf,
+ const Slice& key) override {
+ WriteBatch batch;
+ batch.Delete(cf, key);
+ return Write(o, &batch);
+ }
+ using DB::SingleDelete;
+ Status SingleDelete(const WriteOptions& o, ColumnFamilyHandle* cf,
+ const Slice& key) override {
+ WriteBatch batch;
+ batch.SingleDelete(cf, key);
+ return Write(o, &batch);
+ }
+ using DB::Merge;
+ Status Merge(const WriteOptions& o, ColumnFamilyHandle* cf, const Slice& k,
+ const Slice& v) override {
+ WriteBatch batch;
+ batch.Merge(cf, k, v);
+ return Write(o, &batch);
+ }
+ using DB::Get;
+ Status Get(const ReadOptions& /*options*/, ColumnFamilyHandle* /*cf*/,
+ const Slice& key, PinnableSlice* /*value*/) override {
+ return Status::NotSupported(key);
+ }
+
+ using DB::GetMergeOperands;
+ virtual Status GetMergeOperands(
+ const ReadOptions& /*options*/, ColumnFamilyHandle* /*column_family*/,
+ const Slice& key, PinnableSlice* /*slice*/,
+ GetMergeOperandsOptions* /*merge_operands_options*/,
+ int* /*number_of_operands*/) override {
+ return Status::NotSupported(key);
+ }
+
+ using DB::MultiGet;
+ std::vector<Status> MultiGet(
+ const ReadOptions& /*options*/,
+ const std::vector<ColumnFamilyHandle*>& /*column_family*/,
+ const std::vector<Slice>& keys,
+ std::vector<std::string>* /*values*/) override {
+ std::vector<Status> s(keys.size(),
+ Status::NotSupported("Not implemented."));
+ return s;
+ }
+
+#ifndef ROCKSDB_LITE
+ using DB::IngestExternalFile;
+ Status IngestExternalFile(
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*external_files*/,
+ const IngestExternalFileOptions& /*options*/) override {
+ return Status::NotSupported("Not implemented.");
+ }
+
+ using DB::IngestExternalFiles;
+ Status IngestExternalFiles(
+ const std::vector<IngestExternalFileArg>& /*args*/) override {
+ return Status::NotSupported("Not implemented");
+ }
+
+ using DB::CreateColumnFamilyWithImport;
+ virtual Status CreateColumnFamilyWithImport(
+ const ColumnFamilyOptions& /*options*/,
+ const std::string& /*column_family_name*/,
+ const ImportColumnFamilyOptions& /*import_options*/,
+ const ExportImportFilesMetaData& /*metadata*/,
+ ColumnFamilyHandle** /*handle*/) override {
+ return Status::NotSupported("Not implemented.");
+ }
+
+ using DB::VerifyChecksum;
+ Status VerifyChecksum(const ReadOptions&) override {
+ return Status::NotSupported("Not implemented.");
+ }
+
+ using DB::GetPropertiesOfAllTables;
+ Status GetPropertiesOfAllTables(
+ ColumnFamilyHandle* /*column_family*/,
+ TablePropertiesCollection* /*props*/) override {
+ return Status();
+ }
+
+ Status GetPropertiesOfTablesInRange(
+ ColumnFamilyHandle* /*column_family*/, const Range* /*range*/,
+ std::size_t /*n*/, TablePropertiesCollection* /*props*/) override {
+ return Status();
+ }
+#endif // ROCKSDB_LITE
+
+ using DB::KeyMayExist;
+ bool KeyMayExist(const ReadOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
+ std::string* /*value*/,
+ bool* value_found = nullptr) override {
+ if (value_found != nullptr) {
+ *value_found = false;
+ }
+ return true; // Not Supported directly
+ }
+ using DB::NewIterator;
+ Iterator* NewIterator(const ReadOptions& options,
+ ColumnFamilyHandle* /*column_family*/) override {
+ if (options.snapshot == nullptr) {
+ KVMap* saved = new KVMap;
+ *saved = map_;
+ return new ModelIter(saved, true);
+ } else {
+ const KVMap* snapshot_state =
+ &(reinterpret_cast<const ModelSnapshot*>(options.snapshot)->map_);
+ return new ModelIter(snapshot_state, false);
+ }
+ }
+ Status NewIterators(const ReadOptions& /*options*/,
+ const std::vector<ColumnFamilyHandle*>& /*column_family*/,
+ std::vector<Iterator*>* /*iterators*/) override {
+ return Status::NotSupported("Not supported yet");
+ }
+ const Snapshot* GetSnapshot() override {
+ ModelSnapshot* snapshot = new ModelSnapshot;
+ snapshot->map_ = map_;
+ return snapshot;
+ }
+
+ void ReleaseSnapshot(const Snapshot* snapshot) override {
+ delete reinterpret_cast<const ModelSnapshot*>(snapshot);
+ }
+
+ Status Write(const WriteOptions& /*options*/, WriteBatch* batch) override {
+ class Handler : public WriteBatch::Handler {
+ public:
+ KVMap* map_;
+ void Put(const Slice& key, const Slice& value) override {
+ (*map_)[key.ToString()] = value.ToString();
+ }
+ void Merge(const Slice& /*key*/, const Slice& /*value*/) override {
+ // ignore merge for now
+ // (*map_)[key.ToString()] = value.ToString();
+ }
+ void Delete(const Slice& key) override { map_->erase(key.ToString()); }
+ };
+ Handler handler;
+ handler.map_ = &map_;
+ return batch->Iterate(&handler);
+ }
+
+ using DB::GetProperty;
+ bool GetProperty(ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*property*/, std::string* /*value*/) override {
+ return false;
+ }
+ using DB::GetIntProperty;
+ bool GetIntProperty(ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*property*/, uint64_t* /*value*/) override {
+ return false;
+ }
+ using DB::GetMapProperty;
+ bool GetMapProperty(ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*property*/,
+ std::map<std::string, std::string>* /*value*/) override {
+ return false;
+ }
+ using DB::GetAggregatedIntProperty;
+ bool GetAggregatedIntProperty(const Slice& /*property*/,
+ uint64_t* /*value*/) override {
+ return false;
+ }
+ using DB::GetApproximateSizes;
+ Status GetApproximateSizes(const SizeApproximationOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Range* /*range*/, int n,
+ uint64_t* sizes) override {
+ for (int i = 0; i < n; i++) {
+ sizes[i] = 0;
+ }
+ return Status::OK();
+ }
+ using DB::GetApproximateMemTableStats;
+ void GetApproximateMemTableStats(ColumnFamilyHandle* /*column_family*/,
+ const Range& /*range*/,
+ uint64_t* const count,
+ uint64_t* const size) override {
+ *count = 0;
+ *size = 0;
+ }
+ using DB::CompactRange;
+ Status CompactRange(const CompactRangeOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice* /*start*/, const Slice* /*end*/) override {
+ return Status::NotSupported("Not supported operation.");
+ }
+
+ Status SetDBOptions(
+ const std::unordered_map<std::string, std::string>& /*new_options*/)
+ override {
+ return Status::NotSupported("Not supported operation.");
+ }
+
+ using DB::CompactFiles;
+ Status CompactFiles(
+ const CompactionOptions& /*compact_options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*input_file_names*/,
+ const int /*output_level*/, const int /*output_path_id*/ = -1,
+ std::vector<std::string>* const /*output_file_names*/ = nullptr,
+ CompactionJobInfo* /*compaction_job_info*/ = nullptr) override {
+ return Status::NotSupported("Not supported operation.");
+ }
+
+ Status PauseBackgroundWork() override {
+ return Status::NotSupported("Not supported operation.");
+ }
+
+ Status ContinueBackgroundWork() override {
+ return Status::NotSupported("Not supported operation.");
+ }
+
+ Status EnableAutoCompaction(
+ const std::vector<ColumnFamilyHandle*>& /*column_family_handles*/)
+ override {
+ return Status::NotSupported("Not supported operation.");
+ }
+
+ void EnableManualCompaction() override { return; }
+
+ void DisableManualCompaction() override { return; }
+
+ using DB::NumberLevels;
+ int NumberLevels(ColumnFamilyHandle* /*column_family*/) override { return 1; }
+
+ using DB::MaxMemCompactionLevel;
+ int MaxMemCompactionLevel(ColumnFamilyHandle* /*column_family*/) override {
+ return 1;
+ }
+
+ using DB::Level0StopWriteTrigger;
+ int Level0StopWriteTrigger(ColumnFamilyHandle* /*column_family*/) override {
+ return -1;
+ }
+
+ const std::string& GetName() const override { return name_; }
+
+ Env* GetEnv() const override { return nullptr; }
+
+ using DB::GetOptions;
+ Options GetOptions(ColumnFamilyHandle* /*column_family*/) const override {
+ return options_;
+ }
+
+ using DB::GetDBOptions;
+ DBOptions GetDBOptions() const override { return options_; }
+
+ using DB::Flush;
+ Status Flush(const ROCKSDB_NAMESPACE::FlushOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/) override {
+ Status ret;
+ return ret;
+ }
+ Status Flush(
+ const ROCKSDB_NAMESPACE::FlushOptions& /*options*/,
+ const std::vector<ColumnFamilyHandle*>& /*column_families*/) override {
+ return Status::OK();
+ }
+
+ Status SyncWAL() override { return Status::OK(); }
+
+#ifndef ROCKSDB_LITE
+ Status DisableFileDeletions() override { return Status::OK(); }
+
+ Status EnableFileDeletions(bool /*force*/) override { return Status::OK(); }
+ Status GetLiveFiles(std::vector<std::string>&, uint64_t* /*size*/,
+ bool /*flush_memtable*/ = true) override {
+ return Status::OK();
+ }
+
+ Status GetSortedWalFiles(VectorLogPtr& /*files*/) override {
+ return Status::OK();
+ }
+
+ Status GetCurrentWalFile(
+ std::unique_ptr<LogFile>* /*current_log_file*/) override {
+ return Status::OK();
+ }
+
+ virtual Status GetCreationTimeOfOldestFile(
+ uint64_t* /*creation_time*/) override {
+ return Status::NotSupported();
+ }
+
+ Status DeleteFile(std::string /*name*/) override { return Status::OK(); }
+
+ Status GetUpdatesSince(
+ ROCKSDB_NAMESPACE::SequenceNumber,
+ std::unique_ptr<ROCKSDB_NAMESPACE::TransactionLogIterator>*,
+ const TransactionLogIterator::ReadOptions& /*read_options*/ =
+ TransactionLogIterator::ReadOptions()) override {
+ return Status::NotSupported("Not supported in Model DB");
+ }
+
+ void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/,
+ ColumnFamilyMetaData* /*metadata*/) override {}
+#endif // ROCKSDB_LITE
+
+ Status GetDbIdentity(std::string& /*identity*/) const override {
+ return Status::OK();
+ }
+
+ SequenceNumber GetLatestSequenceNumber() const override { return 0; }
+
+ bool SetPreserveDeletesSequenceNumber(SequenceNumber /*seqnum*/) override {
+ return true;
+ }
+
+ ColumnFamilyHandle* DefaultColumnFamily() const override { return nullptr; }
+
+ private:
+ class ModelIter : public Iterator {
+ public:
+ ModelIter(const KVMap* map, bool owned)
+ : map_(map), owned_(owned), iter_(map_->end()) {}
+ ~ModelIter() override {
+ if (owned_) delete map_;
+ }
+ bool Valid() const override { return iter_ != map_->end(); }
+ void SeekToFirst() override { iter_ = map_->begin(); }
+ void SeekToLast() override {
+ if (map_->empty()) {
+ iter_ = map_->end();
+ } else {
+ iter_ = map_->find(map_->rbegin()->first);
+ }
+ }
+ void Seek(const Slice& k) override {
+ iter_ = map_->lower_bound(k.ToString());
+ }
+ void SeekForPrev(const Slice& k) override {
+ iter_ = map_->upper_bound(k.ToString());
+ Prev();
+ }
+ void Next() override { ++iter_; }
+ void Prev() override {
+ if (iter_ == map_->begin()) {
+ iter_ = map_->end();
+ return;
+ }
+ --iter_;
+ }
+
+ Slice key() const override { return iter_->first; }
+ Slice value() const override { return iter_->second; }
+ Status status() const override { return Status::OK(); }
+
+ private:
+ const KVMap* const map_;
+ const bool owned_; // Do we own map_
+ KVMap::const_iterator iter_;
+ };
+ const Options options_;
+ KVMap map_;
+ std::string name_ = "";
+};
+
+#ifndef ROCKSDB_VALGRIND_RUN
+static std::string RandomKey(Random* rnd, int minimum = 0) {
+ int len;
+ do {
+ len = (rnd->OneIn(3)
+ ? 1 // Short sometimes to encourage collisions
+ : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10)));
+ } while (len < minimum);
+ return test::RandomKey(rnd, len);
+}
+
+static bool CompareIterators(int step, DB* model, DB* db,
+ const Snapshot* model_snap,
+ const Snapshot* db_snap) {
+ ReadOptions options;
+ options.snapshot = model_snap;
+ Iterator* miter = model->NewIterator(options);
+ options.snapshot = db_snap;
+ Iterator* dbiter = db->NewIterator(options);
+ bool ok = true;
+ int count = 0;
+ for (miter->SeekToFirst(), dbiter->SeekToFirst();
+ ok && miter->Valid() && dbiter->Valid(); miter->Next(), dbiter->Next()) {
+ count++;
+ if (miter->key().compare(dbiter->key()) != 0) {
+ fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n", step,
+ EscapeString(miter->key()).c_str(),
+ EscapeString(dbiter->key()).c_str());
+ ok = false;
+ break;
+ }
+
+ if (miter->value().compare(dbiter->value()) != 0) {
+ fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n",
+ step, EscapeString(miter->key()).c_str(),
+ EscapeString(miter->value()).c_str(),
+              EscapeString(dbiter->value()).c_str());
+ ok = false;
+ }
+ }
+
+ if (ok) {
+ if (miter->Valid() != dbiter->Valid()) {
+ fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n",
+ step, miter->Valid(), dbiter->Valid());
+ ok = false;
+ }
+ }
+ delete miter;
+ delete dbiter;
+ return ok;
+}
+
+class DBTestRandomized : public DBTest,
+ public ::testing::WithParamInterface<int> {
+ public:
+ void SetUp() override { option_config_ = GetParam(); }
+
+ static std::vector<int> GenerateOptionConfigs() {
+ std::vector<int> option_configs;
+ // skip cuckoo hash as it does not support snapshot.
+ for (int option_config = kDefault; option_config < kEnd; ++option_config) {
+ if (!ShouldSkipOptions(option_config,
+ kSkipDeletesFilterFirst | kSkipNoSeekToLast)) {
+ option_configs.push_back(option_config);
+ }
+ }
+ option_configs.push_back(kBlockBasedTableWithIndexRestartInterval);
+ return option_configs;
+ }
+};
+
+INSTANTIATE_TEST_CASE_P(
+ DBTestRandomized, DBTestRandomized,
+ ::testing::ValuesIn(DBTestRandomized::GenerateOptionConfigs()));
+
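+// Apply a random mix of Puts, Deletes and multi-operation WriteBatches to
+// both the model and the real DB, periodically comparing full iterator scans
+// (with and without snapshots) and again after a Reopen().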
+TEST_P(DBTestRandomized, Randomized) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ Options options = CurrentOptions(options_override);
+ DestroyAndReopen(options);
+
+ Random rnd(test::RandomSeed() + GetParam());
+ ModelDB model(options);
+ const int N = 10000;
+ const Snapshot* model_snap = nullptr;
+ const Snapshot* db_snap = nullptr;
+ std::string k, v;
+ for (int step = 0; step < N; step++) {
+ // TODO(sanjay): Test Get() works
+ int p = rnd.Uniform(100);
+ int minimum = 0;
+ if (option_config_ == kHashSkipList || option_config_ == kHashLinkList ||
+ option_config_ == kPlainTableFirstBytePrefix ||
+ option_config_ == kBlockBasedTableWithWholeKeyHashIndex ||
+ option_config_ == kBlockBasedTableWithPrefixHashIndex) {
+ minimum = 1;
+ }
+ if (p < 45) { // Put
+ k = RandomKey(&rnd, minimum);
+ v = RandomString(&rnd,
+ rnd.OneIn(20) ? 100 + rnd.Uniform(100) : rnd.Uniform(8));
+ ASSERT_OK(model.Put(WriteOptions(), k, v));
+ ASSERT_OK(db_->Put(WriteOptions(), k, v));
+ } else if (p < 90) { // Delete
+ k = RandomKey(&rnd, minimum);
+ ASSERT_OK(model.Delete(WriteOptions(), k));
+ ASSERT_OK(db_->Delete(WriteOptions(), k));
+ } else { // Multi-element batch
+ WriteBatch b;
+ const int num = rnd.Uniform(8);
+ for (int i = 0; i < num; i++) {
+ if (i == 0 || !rnd.OneIn(10)) {
+ k = RandomKey(&rnd, minimum);
+ } else {
+ // Periodically re-use the same key from the previous iter, so
+ // we have multiple entries in the write batch for the same key
+ }
+ if (rnd.OneIn(2)) {
+ v = RandomString(&rnd, rnd.Uniform(10));
+ b.Put(k, v);
+ } else {
+ b.Delete(k);
+ }
+ }
+ ASSERT_OK(model.Write(WriteOptions(), &b));
+ ASSERT_OK(db_->Write(WriteOptions(), &b));
+ }
+
+ if ((step % 100) == 0) {
+ // For DB instances that use the hash index + block-based table, the
+      // iterator will be invalid when seeking a non-existent key, rather
+      // than returning a key that is close to it.
+ if (option_config_ != kBlockBasedTableWithWholeKeyHashIndex &&
+ option_config_ != kBlockBasedTableWithPrefixHashIndex) {
+ ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
+ ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
+ }
+
+ // Save a snapshot from each DB this time that we'll use next
+ // time we compare things, to make sure the current state is
+ // preserved with the snapshot
+ if (model_snap != nullptr) model.ReleaseSnapshot(model_snap);
+ if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap);
+
+ Reopen(options);
+ ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
+
+ model_snap = model.GetSnapshot();
+ db_snap = db_->GetSnapshot();
+ }
+ }
+ if (model_snap != nullptr) model.ReleaseSnapshot(model_snap);
+ if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap);
+}
+#endif // ROCKSDB_VALGRIND_RUN
+
+TEST_F(DBTest, BlockBasedTablePrefixIndexTest) {
+ // create a DB with block prefix index
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+ Reopen(options);
+ ASSERT_OK(Put("k1", "v1"));
+ Flush();
+ ASSERT_OK(Put("k2", "v2"));
+
+ // Reopen it without prefix extractor, make sure everything still works.
+ // RocksDB should just fall back to the binary index.
+ table_options.index_type = BlockBasedTableOptions::kBinarySearch;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset();
+
+ Reopen(options);
+ ASSERT_EQ("v1", Get("k1"));
+ ASSERT_EQ("v2", Get("k2"));
+}
+
+TEST_F(DBTest, BlockBasedTablePrefixIndexTotalOrderSeek) {
+ // create a DB with block prefix index
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+ options.max_open_files = 10;
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+  // RocksDB sanitizes max_open_files to at least 20. Modify it back.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = static_cast<int*>(arg);
+ *max_open_files = 11;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Reopen(options);
+ ASSERT_OK(Put("k1", "v1"));
+ Flush();
+
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 1;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  // Force all open tables to be evicted.
+  dbfull()->TEST_table_cache()->SetCapacity(0);
+  // Then let the table cache keep one entry.
+ dbfull()->TEST_table_cache()->SetCapacity(1);
+
+ ReadOptions read_options;
+ read_options.total_order_seek = true;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ iter->Seek("k1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("k1", iter->key().ToString());
+ }
+
+ // After total order seek, prefix index should still be used.
+ read_options.total_order_seek = false;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ iter->Seek("k1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("k1", iter->key().ToString());
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
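+// ChecksumTest writes tables with different checksum types (kCRC32c and
+// kxxHash) and verifies that all of them remain readable after reopening with
+// either setting, since the checksum type used is recorded in each table.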
+TEST_F(DBTest, ChecksumTest) {
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+
+ table_options.checksum = kCRC32c;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_OK(Put("a", "b"));
+ ASSERT_OK(Put("c", "d"));
+ ASSERT_OK(Flush()); // table with crc checksum
+
+ table_options.checksum = kxxHash;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_OK(Put("e", "f"));
+ ASSERT_OK(Put("g", "h"));
+ ASSERT_OK(Flush()); // table with xxhash checksum
+
+ table_options.checksum = kCRC32c;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_EQ("b", Get("a"));
+ ASSERT_EQ("d", Get("c"));
+ ASSERT_EQ("f", Get("e"));
+ ASSERT_EQ("h", Get("g"));
+
+ table_options.checksum = kCRC32c;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_EQ("b", Get("a"));
+ ASSERT_EQ("d", Get("c"));
+ ASSERT_EQ("f", Get("e"));
+ ASSERT_EQ("h", Get("g"));
+}
+
+#ifndef ROCKSDB_LITE
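+// The FIFO compaction tests below rely on the oldest level-0 files being
+// dropped once their total size exceeds
+// compaction_options_fifo.max_table_files_size, optionally combined with
+// intra-L0 compaction (allow_compaction) and TTL-based expiry.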
+TEST_P(DBTestWithParam, FIFOCompactionTest) {
+ for (int iter = 0; iter < 2; ++iter) {
+ // first iteration -- auto compaction
+ // second iteration -- manual compaction
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.arena_block_size = 4096;
+ options.compaction_options_fifo.max_table_files_size = 500 << 10; // 500KB
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options.max_subcompactions = max_subcompactions_;
+ if (iter == 1) {
+ options.disable_auto_compactions = true;
+ }
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 6; ++i) {
+ for (int j = 0; j < 110; ++j) {
+ ASSERT_OK(Put(ToString(i * 100 + j), RandomString(&rnd, 980)));
+ }
+ // flush should happen here
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ if (iter == 0) {
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ } else {
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ }
+ // only 5 files should survive
+ ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+ for (int i = 0; i < 50; ++i) {
+ // these keys should be deleted in previous compaction
+ ASSERT_EQ("NOT_FOUND", Get(ToString(i)));
+ }
+ }
+}
+
+TEST_F(DBTest, FIFOCompactionTestWithCompaction) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.write_buffer_size = 20 << 10; // 20K
+ options.arena_block_size = 4096;
+  options.compaction_options_fifo.max_table_files_size = 1500 << 10; // 1.5MB
+ options.compaction_options_fifo.allow_compaction = true;
+ options.level0_file_num_compaction_trigger = 6;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 60; i++) {
+ // Generate and flush a file about 20KB.
+ for (int j = 0; j < 20; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ // It should be compacted to 10 files.
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ for (int i = 0; i < 60; i++) {
+ // Generate and flush a file about 20KB.
+ for (int j = 0; j < 20; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j + 2000), RandomString(&rnd, 980)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+
+ // It should be compacted to no more than 20 files.
+ ASSERT_GT(NumTableFilesAtLevel(0), 10);
+ ASSERT_LT(NumTableFilesAtLevel(0), 18);
+ // Size limit is still guaranteed.
+ ASSERT_LE(SizeAtLevel(0),
+ options.compaction_options_fifo.max_table_files_size);
+}
+
+TEST_F(DBTest, FIFOCompactionStyleWithCompactionAndDelete) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.write_buffer_size = 20 << 10; // 20K
+ options.arena_block_size = 4096;
+  options.compaction_options_fifo.max_table_files_size = 1500 << 10; // 1.5MB
+ options.compaction_options_fifo.allow_compaction = true;
+ options.level0_file_num_compaction_trigger = 3;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 3; i++) {
+ // Each file contains a different key which will be dropped later.
+ ASSERT_OK(Put("a" + ToString(i), RandomString(&rnd, 500)));
+ ASSERT_OK(Put("key" + ToString(i), ""));
+ ASSERT_OK(Put("z" + ToString(i), RandomString(&rnd, 500)));
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+ for (int i = 0; i < 3; i++) {
+ ASSERT_EQ("", Get("key" + ToString(i)));
+ }
+ for (int i = 0; i < 3; i++) {
+ // Each file contains a different key which will be dropped later.
+ ASSERT_OK(Put("a" + ToString(i), RandomString(&rnd, 500)));
+ ASSERT_OK(Delete("key" + ToString(i)));
+ ASSERT_OK(Put("z" + ToString(i), RandomString(&rnd, 500)));
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+ for (int i = 0; i < 3; i++) {
+ ASSERT_EQ("NOT_FOUND", Get("key" + ToString(i)));
+ }
+}
+
+// Check that FIFO-with-TTL is supported with max_open_files != -1.
+TEST_F(DBTest, FIFOCompactionWithTTLAndMaxOpenFilesTest) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.create_if_missing = true;
+ options.ttl = 600; // seconds
+
+ // TTL is now supported with max_open_files != -1.
+ options.max_open_files = 100;
+ options = CurrentOptions(options);
+ ASSERT_OK(TryReopen(options));
+
+ options.max_open_files = -1;
+ ASSERT_OK(TryReopen(options));
+}
+
+// Check that FIFO-with-TTL is supported only with BlockBasedTableFactory.
+TEST_F(DBTest, FIFOCompactionWithTTLAndVariousTableFormatsTest) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.create_if_missing = true;
+ options.ttl = 600; // seconds
+
+ options = CurrentOptions(options);
+ options.table_factory.reset(NewBlockBasedTableFactory());
+ ASSERT_OK(TryReopen(options));
+
+ Destroy(options);
+ options.table_factory.reset(NewPlainTableFactory());
+ ASSERT_TRUE(TryReopen(options).IsNotSupported());
+
+ Destroy(options);
+ options.table_factory.reset(NewAdaptiveTableFactory());
+ ASSERT_TRUE(TryReopen(options).IsNotSupported());
+}
+
+TEST_F(DBTest, FIFOCompactionWithTTLTest) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.write_buffer_size = 10 << 10; // 10KB
+ options.arena_block_size = 4096;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ env_->time_elapse_only_sleep_ = false;
+ options.env = env_;
+
+ // Test to make sure that all files with expired ttl are deleted on next
+ // manual compaction.
+ {
+ env_->addon_time_.store(0);
+ options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB
+ options.compaction_options_fifo.allow_compaction = false;
+    options.ttl = 1 * 60 * 60; // 1 hour
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // Sleep for 2 hours -- which is much greater than TTL.
+ // Note: Couldn't use SleepForMicroseconds because it takes an int instead
+ // of uint64_t. Hence used addon_time_ directly.
+ // env_->SleepForMicroseconds(2 * 60 * 60 * 1000 * 1000);
+ env_->addon_time_.fetch_add(2 * 60 * 60);
+
+    // Since no flushes or compactions have run, the db should still be in
+ // the same state even after considerable time has passed.
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ }
+
+ // Test to make sure that all files with expired ttl are deleted on next
+ // automatic compaction.
+ {
+ options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB
+ options.compaction_options_fifo.allow_compaction = false;
+ options.ttl = 1 * 60 * 60; // 1 hour
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // Sleep for 2 hours -- which is much greater than TTL.
+ env_->addon_time_.fetch_add(2 * 60 * 60);
+ // Just to make sure that we are in the same state even after sleeping.
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // Create 1 more file to trigger TTL compaction. The old files are dropped.
+ for (int i = 0; i < 1; i++) {
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ }
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    // Only the newly created file remains.
+ ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+ ASSERT_LE(SizeAtLevel(0),
+ options.compaction_options_fifo.max_table_files_size);
+ }
+
+ // Test that shows the fall back to size-based FIFO compaction if TTL-based
+ // deletion doesn't move the total size to be less than max_table_files_size.
+ {
+ options.write_buffer_size = 10 << 10; // 10KB
+ options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB
+ options.compaction_options_fifo.allow_compaction = false;
+ options.ttl = 1 * 60 * 60; // 1 hour
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 3; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 3);
+
+ // Sleep for 2 hours -- which is much greater than TTL.
+ env_->addon_time_.fetch_add(2 * 60 * 60);
+ // Just to make sure that we are in the same state even after sleeping.
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 3);
+
+ for (int i = 0; i < 5; i++) {
+ for (int j = 0; j < 140; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ // Size limit is still guaranteed.
+ ASSERT_LE(SizeAtLevel(0),
+ options.compaction_options_fifo.max_table_files_size);
+ }
+
+ // Test with TTL + Intra-L0 compactions.
+ {
+ options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB
+ options.compaction_options_fifo.allow_compaction = true;
+ options.ttl = 1 * 60 * 60; // 1 hour
+ options.level0_file_num_compaction_trigger = 6;
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ // With Intra-L0 compaction, out of 10 files, 6 files will be compacted to 1
+ // (due to level0_file_num_compaction_trigger = 6).
+ // So total files = 1 + remaining 4 = 5.
+ ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+
+ // Sleep for 2 hours -- which is much greater than TTL.
+ env_->addon_time_.fetch_add(2 * 60 * 60);
+ // Just to make sure that we are in the same state even after sleeping.
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+
+ // Create 10 more files. The old 5 files are dropped as their ttl expired.
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+ ASSERT_LE(SizeAtLevel(0),
+ options.compaction_options_fifo.max_table_files_size);
+ }
+
+ // Test with large TTL + Intra-L0 compactions.
+ // Files dropped based on size, as ttl doesn't kick in.
+ {
+ options.write_buffer_size = 20 << 10; // 20K
+ options.compaction_options_fifo.max_table_files_size = 1500 << 10; // 1.5MB
+ options.compaction_options_fifo.allow_compaction = true;
+ options.ttl = 1 * 60 * 60; // 1 hour
+ options.level0_file_num_compaction_trigger = 6;
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 60; i++) {
+ // Generate and flush a file about 20KB.
+ for (int j = 0; j < 20; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ // It should be compacted to 10 files.
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ for (int i = 0; i < 60; i++) {
+ // Generate and flush a file about 20KB.
+ for (int j = 0; j < 20; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j + 2000), RandomString(&rnd, 980)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+
+ // It should be compacted to no more than 20 files.
+ ASSERT_GT(NumTableFilesAtLevel(0), 10);
+ ASSERT_LT(NumTableFilesAtLevel(0), 18);
+ // Size limit is still guaranteed.
+ ASSERT_LE(SizeAtLevel(0),
+ options.compaction_options_fifo.max_table_files_size);
+ }
+}
+#endif // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+/*
+ * This test is not reliable enough as it heavily depends on disk behavior.
+ * Disable as it is flaky.
+ */
+TEST_F(DBTest, DISABLED_RateLimitingTest) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 1 << 20; // 1MB
+ options.level0_file_num_compaction_trigger = 2;
+ options.target_file_size_base = 1 << 20; // 1MB
+ options.max_bytes_for_level_base = 4 << 20; // 4MB
+ options.max_bytes_for_level_multiplier = 4;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options.env = env_;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.IncreaseParallelism(4);
+ DestroyAndReopen(options);
+
+ WriteOptions wo;
+ wo.disableWAL = true;
+
+ // # no rate limiting
+ Random rnd(301);
+ uint64_t start = env_->NowMicros();
+ // Write ~96M data
+ for (int64_t i = 0; i < (96 << 10); ++i) {
+ ASSERT_OK(
+ Put(RandomString(&rnd, 32), RandomString(&rnd, (1 << 10) + 1), wo));
+ }
+ uint64_t elapsed = env_->NowMicros() - start;
+ double raw_rate = env_->bytes_written_ * 1000000.0 / elapsed;
+ uint64_t rate_limiter_drains =
+ TestGetTickerCount(options, NUMBER_RATE_LIMITER_DRAINS);
+ ASSERT_EQ(0, rate_limiter_drains);
+ Close();
+
+  // # rate limiting with 0.7 x raw_rate
+ options.rate_limiter.reset(
+ NewGenericRateLimiter(static_cast<int64_t>(0.7 * raw_rate)));
+ env_->bytes_written_ = 0;
+ DestroyAndReopen(options);
+
+ start = env_->NowMicros();
+ // Write ~96M data
+ for (int64_t i = 0; i < (96 << 10); ++i) {
+ ASSERT_OK(
+ Put(RandomString(&rnd, 32), RandomString(&rnd, (1 << 10) + 1), wo));
+ }
+ rate_limiter_drains =
+ TestGetTickerCount(options, NUMBER_RATE_LIMITER_DRAINS) -
+ rate_limiter_drains;
+ elapsed = env_->NowMicros() - start;
+ Close();
+ ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_);
+ // Most intervals should've been drained (interval time is 100ms, elapsed is
+ // micros)
+ ASSERT_GT(rate_limiter_drains, 0);
+ ASSERT_LE(rate_limiter_drains, elapsed / 100000 + 1);
+ double ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate;
+ fprintf(stderr, "write rate ratio = %.2lf, expected 0.7\n", ratio);
+ ASSERT_TRUE(ratio < 0.8);
+
+ // # rate limiting with half of the raw_rate
+ options.rate_limiter.reset(
+ NewGenericRateLimiter(static_cast<int64_t>(raw_rate / 2)));
+ env_->bytes_written_ = 0;
+ DestroyAndReopen(options);
+
+ start = env_->NowMicros();
+ // Write ~96M data
+ for (int64_t i = 0; i < (96 << 10); ++i) {
+ ASSERT_OK(
+ Put(RandomString(&rnd, 32), RandomString(&rnd, (1 << 10) + 1), wo));
+ }
+ elapsed = env_->NowMicros() - start;
+ rate_limiter_drains =
+ TestGetTickerCount(options, NUMBER_RATE_LIMITER_DRAINS) -
+ rate_limiter_drains;
+ Close();
+ ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_);
+ // Most intervals should've been drained (interval time is 100ms, elapsed is
+ // micros)
+ ASSERT_GT(rate_limiter_drains, elapsed / 100000 / 2);
+ ASSERT_LE(rate_limiter_drains, elapsed / 100000 + 1);
+ ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate;
+ fprintf(stderr, "write rate ratio = %.2lf, expected 0.5\n", ratio);
+ ASSERT_LT(ratio, 0.6);
+}
+
+TEST_F(DBTest, TableOptionsSanitizeTest) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ ASSERT_EQ(db_->GetOptions().allow_mmap_reads, false);
+
+ options.table_factory.reset(new PlainTableFactory());
+ options.prefix_extractor.reset(NewNoopTransform());
+ Destroy(options);
+ ASSERT_TRUE(!TryReopen(options).IsNotSupported());
+
+  // Test the prefix_extractor check when a hash index is used for a
+  // block-based table
+ BlockBasedTableOptions to;
+ to.index_type = BlockBasedTableOptions::kHashSearch;
+ options = CurrentOptions();
+ options.create_if_missing = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(to));
+ ASSERT_TRUE(TryReopen(options).IsInvalidArgument());
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ ASSERT_OK(TryReopen(options));
+}
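+
+// Illustrative sketch added for clarity (not part of the upstream test
+// suite; disabled by default): the valid counterpart of the sanitization
+// checks above -- a hash-search index requires a prefix extractor. The
+// 3-byte prefix and the keys are arbitrary.
+TEST_F(DBTest, DISABLED_HashIndexWithPrefixExtractorSketch) {
+  BlockBasedTableOptions table_options;
+  table_options.index_type = BlockBasedTableOptions::kHashSearch;
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("abc1", "v"));
+  ASSERT_EQ("v", Get("abc1"));
+}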
+
+TEST_F(DBTest, ConcurrentMemtableNotSupported) {
+ Options options = CurrentOptions();
+ options.allow_concurrent_memtable_write = true;
+ options.soft_pending_compaction_bytes_limit = 0;
+ options.hard_pending_compaction_bytes_limit = 100;
+ options.create_if_missing = true;
+
+ DestroyDB(dbname_, options);
+ options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true, 4));
+ ASSERT_NOK(TryReopen(options));
+
+ options.memtable_factory.reset(new SkipListFactory);
+ ASSERT_OK(TryReopen(options));
+
+ ColumnFamilyOptions cf_options(options);
+ cf_options.memtable_factory.reset(
+ NewHashLinkListRepFactory(4, 0, 3, true, 4));
+ ColumnFamilyHandle* handle;
+ ASSERT_NOK(db_->CreateColumnFamily(cf_options, "name", &handle));
+}
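+
+// Illustrative sketch added for clarity (not part of the upstream test
+// suite; disabled by default): the supported configuration implied by the
+// test above -- concurrent memtable writes are only allowed with the
+// default skip-list memtable representation.
+TEST_F(DBTest, DISABLED_ConcurrentMemtableSupportedSketch) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.allow_concurrent_memtable_write = true;
+  options.memtable_factory.reset(new SkipListFactory);
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("k", "v"));
+  ASSERT_EQ("v", Get("k"));
+}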
+
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, SanitizeNumThreads) {
+ for (int attempt = 0; attempt < 2; attempt++) {
+ const size_t kTotalTasks = 8;
+ test::SleepingBackgroundTask sleeping_tasks[kTotalTasks];
+
+ Options options = CurrentOptions();
+ if (attempt == 0) {
+ options.max_background_compactions = 3;
+ options.max_background_flushes = 2;
+ }
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ for (size_t i = 0; i < kTotalTasks; i++) {
+      // Insert 4 tasks into the low-priority queue and 4 tasks into the
+      // high-priority queue.
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_tasks[i],
+ (i < 4) ? Env::Priority::LOW : Env::Priority::HIGH);
+ }
+
+    // Wait up to 10s for them to be scheduled.
+ for (int i = 0; i < 10000; i++) {
+ if (options.env->GetThreadPoolQueueLen(Env::Priority::LOW) <= 1 &&
+ options.env->GetThreadPoolQueueLen(Env::Priority::HIGH) <= 2) {
+ break;
+ }
+ env_->SleepForMicroseconds(1000);
+ }
+
+    // Pool size 3, 4 tasks total. Queue size should be 1.
+    ASSERT_EQ(1U, options.env->GetThreadPoolQueueLen(Env::Priority::LOW));
+    // Pool size 2, 4 tasks total. Queue size should be 2.
+    ASSERT_EQ(2U, options.env->GetThreadPoolQueueLen(Env::Priority::HIGH));
+
+ for (size_t i = 0; i < kTotalTasks; i++) {
+ sleeping_tasks[i].WakeUp();
+ sleeping_tasks[i].WaitUntilDone();
+ }
+
+ ASSERT_OK(Put("abc", "def"));
+ ASSERT_EQ("def", Get("abc"));
+ Flush();
+ ASSERT_EQ("def", Get("abc"));
+ }
+}
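+
+// Illustrative sketch added for clarity (not part of the upstream test
+// suite; disabled by default): the thread-pool sizing the sanitization test
+// above depends on. IncreaseParallelism() is a convenience that sizes the
+// background thread pools; the count of 4 is arbitrary.
+TEST_F(DBTest, DISABLED_BackgroundThreadPoolSketch) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.env = env_;
+  options.IncreaseParallelism(4);
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("abc", "def"));
+  ASSERT_EQ("def", Get("abc"));
+}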
+
+TEST_F(DBTest, WriteSingleThreadEntry) {
+ std::vector<port::Thread> threads;
+ dbfull()->TEST_LockMutex();
+ auto w = dbfull()->TEST_BeginWrite();
+ threads.emplace_back([&] { Put("a", "b"); });
+ env_->SleepForMicroseconds(10000);
+ threads.emplace_back([&] { Flush(); });
+ env_->SleepForMicroseconds(10000);
+ dbfull()->TEST_UnlockMutex();
+ dbfull()->TEST_LockMutex();
+ dbfull()->TEST_EndWrite(w);
+ dbfull()->TEST_UnlockMutex();
+
+ for (auto& t : threads) {
+ t.join();
+ }
+}
+
+TEST_F(DBTest, ConcurrentFlushWAL) {
+ const size_t cnt = 100;
+ Options options;
+ WriteOptions wopt;
+ ReadOptions ropt;
+ for (bool two_write_queues : {false, true}) {
+ for (bool manual_wal_flush : {false, true}) {
+ options.two_write_queues = two_write_queues;
+ options.manual_wal_flush = manual_wal_flush;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ std::vector<port::Thread> threads;
+ threads.emplace_back([&] {
+ for (size_t i = 0; i < cnt; i++) {
+ auto istr = ToString(i);
+ db_->Put(wopt, db_->DefaultColumnFamily(), "a" + istr, "b" + istr);
+ }
+ });
+ if (two_write_queues) {
+ threads.emplace_back([&] {
+ for (size_t i = cnt; i < 2 * cnt; i++) {
+ auto istr = ToString(i);
+ WriteBatch batch;
+ batch.Put("a" + istr, "b" + istr);
+ dbfull()->WriteImpl(wopt, &batch, nullptr, nullptr, 0, true);
+ }
+ });
+ }
+ threads.emplace_back([&] {
+ for (size_t i = 0; i < cnt * 100; i++) { // FlushWAL is faster than Put
+ db_->FlushWAL(false);
+ }
+ });
+ for (auto& t : threads) {
+ t.join();
+ }
+ options.create_if_missing = false;
+ // Recover from the wal and make sure that it is not corrupted
+ Reopen(options);
+ for (size_t i = 0; i < cnt; i++) {
+ PinnableSlice pval;
+ auto istr = ToString(i);
+ ASSERT_OK(
+ db_->Get(ropt, db_->DefaultColumnFamily(), "a" + istr, &pval));
+ ASSERT_TRUE(pval == ("b" + istr));
+ }
+ }
+ }
+}
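+
+// Illustrative sketch added for clarity (not part of the upstream test
+// suite; disabled by default): basic manual WAL flushing, which the test
+// above exercises concurrently. With manual_wal_flush the WAL is only
+// persisted when FlushWAL() is called explicitly.
+TEST_F(DBTest, DISABLED_ManualWALFlushSketch) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.manual_wal_flush = true;
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("k", "v"));
+  // Persist the buffered WAL entries; pass true to also sync the file.
+  ASSERT_OK(db_->FlushWAL(false));
+}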
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, DynamicMemtableOptions) {
+ const uint64_t k64KB = 1 << 16;
+ const uint64_t k128KB = 1 << 17;
+ const uint64_t k5KB = 5 * 1024;
+ Options options;
+ options.env = env_;
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.max_background_compactions = 1;
+ options.write_buffer_size = k64KB;
+ options.arena_block_size = 16 * 1024;
+ options.max_write_buffer_number = 2;
+ // Don't trigger compact/slowdown/stop
+ options.level0_file_num_compaction_trigger = 1024;
+ options.level0_slowdown_writes_trigger = 1024;
+ options.level0_stop_writes_trigger = 1024;
+ DestroyAndReopen(options);
+
+ auto gen_l0_kb = [this](int size) {
+ const int kNumPutsBeforeWaitForFlush = 64;
+ Random rnd(301);
+ for (int i = 0; i < size; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
+
+ // The following condition prevents a race condition between flush jobs
+ // acquiring work and this thread filling up multiple memtables. Without
+ // this, the flush might produce less files than expected because
+ // multiple memtables are flushed into a single L0 file. This race
+ // condition affects assertion (A).
+ if (i % kNumPutsBeforeWaitForFlush == kNumPutsBeforeWaitForFlush - 1) {
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ };
+
+ // Test write_buffer_size
+ gen_l0_kb(64);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+ ASSERT_LT(SizeAtLevel(0), k64KB + k5KB);
+ ASSERT_GT(SizeAtLevel(0), k64KB - k5KB * 2);
+
+ // Clean up L0
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+ // Increase buffer size
+ ASSERT_OK(dbfull()->SetOptions({
+ {"write_buffer_size", "131072"},
+ }));
+
+ // The existing memtable inflated 64KB->128KB when we invoked SetOptions().
+ // Write 192KB, we should have a 128KB L0 file and a memtable with 64KB data.
+ gen_l0_kb(192);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 1); // (A)
+ ASSERT_LT(SizeAtLevel(0), k128KB + 2 * k5KB);
+ ASSERT_GT(SizeAtLevel(0), k128KB - 4 * k5KB);
+
+ // Decrease buffer size below current usage
+ ASSERT_OK(dbfull()->SetOptions({
+ {"write_buffer_size", "65536"},
+ }));
+ // The existing memtable became eligible for flush when we reduced its
+ // capacity to 64KB. Two keys need to be added to trigger flush: first causes
+ // memtable to be marked full, second schedules the flush. Then we should have
+ // a 128KB L0 file, a 64KB L0 file, and a memtable with just one key.
+ gen_l0_kb(2);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+ ASSERT_LT(SizeAtLevel(0), k128KB + k64KB + 2 * k5KB);
+ ASSERT_GT(SizeAtLevel(0), k128KB + k64KB - 4 * k5KB);
+
+ // Test max_write_buffer_number
+  // Block the compaction thread, which also blocks flushes: with
+  // max_background_flushes == 0, flushes are executed by the compaction
+  // thread.
+ env_->SetBackgroundThreads(1, Env::LOW);
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+  // Start from scratch and disable compaction/flush. Flush can only happen
+  // during compaction, but the trigger is pretty high.
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+ env_->SetBackgroundThreads(0, Env::HIGH);
+
+ // Put until writes are stopped, bounded by 256 puts. We should see stop at
+ // ~128KB
+ int count = 0;
+ Random rnd(301);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:Wait",
+ [&](void* /*arg*/) { sleeping_task_low.WakeUp(); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ while (!sleeping_task_low.WokenUp() && count < 256) {
+ ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions()));
+ count++;
+ }
+ ASSERT_GT(static_cast<double>(count), 128 * 0.8);
+ ASSERT_LT(static_cast<double>(count), 128 * 1.2);
+
+ sleeping_task_low.WaitUntilDone();
+
+ // Increase
+ ASSERT_OK(dbfull()->SetOptions({
+ {"max_write_buffer_number", "8"},
+ }));
+ // Clean up memtable and L0
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ sleeping_task_low.Reset();
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ count = 0;
+ while (!sleeping_task_low.WokenUp() && count < 1024) {
+ ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions()));
+ count++;
+ }
+// Windows fails this test. Will tune in the future and figure out an
+// appropriate number.
+#ifndef OS_WIN
+ ASSERT_GT(static_cast<double>(count), 512 * 0.8);
+ ASSERT_LT(static_cast<double>(count), 512 * 1.2);
+#endif
+ sleeping_task_low.WaitUntilDone();
+
+ // Decrease
+ ASSERT_OK(dbfull()->SetOptions({
+ {"max_write_buffer_number", "4"},
+ }));
+ // Clean up memtable and L0
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ sleeping_task_low.Reset();
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ count = 0;
+ while (!sleeping_task_low.WokenUp() && count < 1024) {
+ ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions()));
+ count++;
+ }
+// Windows fails this test. Will tune in the future and figure out an
+// appropriate number.
+#ifndef OS_WIN
+ ASSERT_GT(static_cast<double>(count), 256 * 0.8);
+ ASSERT_LT(static_cast<double>(count), 266 * 1.2);
+#endif
+ sleeping_task_low.WaitUntilDone();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
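+
+// Illustrative sketch added for clarity (not part of the upstream test
+// suite; disabled by default): the SetOptions() pattern the test above
+// relies on, shown in isolation. SetOptions() takes a map of option names
+// to string values; the sizes here are arbitrary.
+TEST_F(DBTest, DISABLED_SetWriteBufferSizeSketch) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.write_buffer_size = 64 << 10;  // start with a 64KB memtable
+  DestroyAndReopen(options);
+  ASSERT_OK(dbfull()->SetOptions({{"write_buffer_size", "131072"}}));
+  ASSERT_EQ(131072u, dbfull()->GetOptions().write_buffer_size);
+}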
+#endif // ROCKSDB_LITE
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+namespace {
+void VerifyOperationCount(Env* env, ThreadStatus::OperationType op_type,
+ int expected_count) {
+ int op_count = 0;
+ std::vector<ThreadStatus> thread_list;
+ ASSERT_OK(env->GetThreadList(&thread_list));
+ for (auto thread : thread_list) {
+ if (thread.operation_type == op_type) {
+ op_count++;
+ }
+ }
+ ASSERT_EQ(op_count, expected_count);
+}
+} // namespace
+
+TEST_F(DBTest, GetThreadStatus) {
+ Options options;
+ options.env = env_;
+ options.enable_thread_tracking = true;
+ TryReopen(options);
+
+ std::vector<ThreadStatus> thread_list;
+ Status s = env_->GetThreadList(&thread_list);
+
+ for (int i = 0; i < 2; ++i) {
+    // Repeat the test with different numbers of high / low priority threads.
+ const int kTestCount = 3;
+ const unsigned int kHighPriCounts[kTestCount] = {3, 2, 5};
+ const unsigned int kLowPriCounts[kTestCount] = {10, 15, 3};
+ const unsigned int kBottomPriCounts[kTestCount] = {2, 1, 4};
+ for (int test = 0; test < kTestCount; ++test) {
+ // Change the number of threads in high / low priority pool.
+ env_->SetBackgroundThreads(kHighPriCounts[test], Env::HIGH);
+ env_->SetBackgroundThreads(kLowPriCounts[test], Env::LOW);
+ env_->SetBackgroundThreads(kBottomPriCounts[test], Env::BOTTOM);
+      // Wait to ensure all threads have been registered.
+ unsigned int thread_type_counts[ThreadStatus::NUM_THREAD_TYPES];
+ // TODO(ajkr): it'd be better if SetBackgroundThreads returned only after
+ // all threads have been registered.
+ // Try up to 60 seconds.
+ for (int num_try = 0; num_try < 60000; num_try++) {
+ env_->SleepForMicroseconds(1000);
+ thread_list.clear();
+ s = env_->GetThreadList(&thread_list);
+ ASSERT_OK(s);
+ memset(thread_type_counts, 0, sizeof(thread_type_counts));
+ for (auto thread : thread_list) {
+ ASSERT_LT(thread.thread_type, ThreadStatus::NUM_THREAD_TYPES);
+ thread_type_counts[thread.thread_type]++;
+ }
+ if (thread_type_counts[ThreadStatus::HIGH_PRIORITY] ==
+ kHighPriCounts[test] &&
+ thread_type_counts[ThreadStatus::LOW_PRIORITY] ==
+ kLowPriCounts[test] &&
+ thread_type_counts[ThreadStatus::BOTTOM_PRIORITY] ==
+ kBottomPriCounts[test]) {
+ break;
+ }
+ }
+ // Verify the number of high-priority threads
+ ASSERT_EQ(thread_type_counts[ThreadStatus::HIGH_PRIORITY],
+ kHighPriCounts[test]);
+ // Verify the number of low-priority threads
+ ASSERT_EQ(thread_type_counts[ThreadStatus::LOW_PRIORITY],
+ kLowPriCounts[test]);
+ // Verify the number of bottom-priority threads
+ ASSERT_EQ(thread_type_counts[ThreadStatus::BOTTOM_PRIORITY],
+ kBottomPriCounts[test]);
+ }
+ if (i == 0) {
+ // repeat the test with multiple column families
+ CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options);
+ env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
+ true);
+ }
+ }
+ db_->DropColumnFamily(handles_[2]);
+ delete handles_[2];
+ handles_.erase(handles_.begin() + 2);
+ env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
+ true);
+ Close();
+ env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
+ true);
+}
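+
+// Illustrative sketch added for clarity (not part of the upstream test
+// suite; disabled by default): the minimal GetThreadList() polling pattern
+// used by the thread-status tests; it requires enable_thread_tracking.
+TEST_F(DBTest, DISABLED_GetThreadListSketch) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.env = env_;
+  options.enable_thread_tracking = true;
+  DestroyAndReopen(options);
+  std::vector<ThreadStatus> thread_list;
+  ASSERT_OK(env_->GetThreadList(&thread_list));
+  for (const auto& thread : thread_list) {
+    // Each entry reports the thread type and its current operation, if any.
+    ASSERT_LT(thread.thread_type, ThreadStatus::NUM_THREAD_TYPES);
+  }
+}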
+
+TEST_F(DBTest, DisableThreadStatus) {
+ Options options;
+ options.env = env_;
+ options.enable_thread_tracking = false;
+ TryReopen(options);
+ CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options);
+  // Verify that none of the column family info exists.
+ env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
+ false);
+}
+
+TEST_F(DBTest, ThreadStatusFlush) {
+ Options options;
+ options.env = env_;
+ options.write_buffer_size = 100000; // Small write buffer
+ options.enable_thread_tracking = true;
+ options = CurrentOptions(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"FlushJob::FlushJob()", "DBTest::ThreadStatusFlush:1"},
+ {"DBTest::ThreadStatusFlush:2", "FlushJob::WriteLevel0Table"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+ VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0);
+
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_EQ("v1", Get(1, "foo"));
+ VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0);
+
+ uint64_t num_running_flushes = 0;
+ db_->GetIntProperty(DB::Properties::kNumRunningFlushes, &num_running_flushes);
+ ASSERT_EQ(num_running_flushes, 0);
+
+ Put(1, "k1", std::string(100000, 'x')); // Fill memtable
+ Put(1, "k2", std::string(100000, 'y')); // Trigger flush
+
+ // The first sync point is to make sure there's one flush job
+ // running when we perform VerifyOperationCount().
+ TEST_SYNC_POINT("DBTest::ThreadStatusFlush:1");
+ VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 1);
+ db_->GetIntProperty(DB::Properties::kNumRunningFlushes, &num_running_flushes);
+ ASSERT_EQ(num_running_flushes, 1);
+  // The second sync point ensures the flush job will not complete until
+  // we have performed VerifyOperationCount().
+ TEST_SYNC_POINT("DBTest::ThreadStatusFlush:2");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBTestWithParam, ThreadStatusSingleCompaction) {
+ const int kTestKeySize = 16;
+ const int kTestValueSize = 984;
+ const int kEntrySize = kTestKeySize + kTestValueSize;
+ const int kEntriesPerBuffer = 100;
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+ options.compaction_style = kCompactionStyleLevel;
+ options.target_file_size_base = options.write_buffer_size;
+ options.max_bytes_for_level_base = options.target_file_size_base * 2;
+ options.max_bytes_for_level_multiplier = 2;
+ options.compression = kNoCompression;
+ options = CurrentOptions(options);
+ options.env = env_;
+ options.enable_thread_tracking = true;
+ const int kNumL0Files = 4;
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.max_subcompactions = max_subcompactions_;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DBTest::ThreadStatusSingleCompaction:0", "DBImpl::BGWorkCompaction"},
+ {"CompactionJob::Run():Start", "DBTest::ThreadStatusSingleCompaction:1"},
+ {"DBTest::ThreadStatusSingleCompaction:2", "CompactionJob::Run():End"},
+ });
+ for (int tests = 0; tests < 2; ++tests) {
+ DestroyAndReopen(options);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ // The Put Phase.
+ for (int file = 0; file < kNumL0Files; ++file) {
+ for (int key = 0; key < kEntriesPerBuffer; ++key) {
+ ASSERT_OK(Put(ToString(key + file * kEntriesPerBuffer),
+ RandomString(&rnd, kTestValueSize)));
+ }
+ Flush();
+ }
+    // This makes sure a compaction won't be scheduled until
+    // we are done with the above Put Phase.
+ uint64_t num_running_compactions = 0;
+ db_->GetIntProperty(DB::Properties::kNumRunningCompactions,
+ &num_running_compactions);
+ ASSERT_EQ(num_running_compactions, 0);
+ TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:0");
+ ASSERT_GE(NumTableFilesAtLevel(0),
+ options.level0_file_num_compaction_trigger);
+
+ // This makes sure at least one compaction is running.
+ TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:1");
+
+ if (options.enable_thread_tracking) {
+ // expecting one single L0 to L1 compaction
+ VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 1);
+ } else {
+ // If thread tracking is not enabled, compaction count should be 0.
+ VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 0);
+ }
+ db_->GetIntProperty(DB::Properties::kNumRunningCompactions,
+ &num_running_compactions);
+ ASSERT_EQ(num_running_compactions, 1);
+ // TODO(yhchiang): adding assert to verify each compaction stage.
+ TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:2");
+
+    // Repeat the test with thread tracking disabled.
+ options.enable_thread_tracking = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_P(DBTestWithParam, PreShutdownManualCompaction) {
+ Options options = CurrentOptions();
+ options.max_subcompactions = max_subcompactions_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // iter - 0 with 7 levels
+ // iter - 1 with 3 levels
+ for (int iter = 0; iter < 2; ++iter) {
+ MakeTables(3, "p", "q", 1);
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+ // Compaction range falls before files
+ Compact(1, "", "c");
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+ // Compaction range falls after files
+ Compact(1, "r", "z");
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+ // Compaction range overlaps files
+ Compact(1, "p1", "p9");
+ ASSERT_EQ("0,0,1", FilesPerLevel(1));
+
+ // Populate a different range
+ MakeTables(3, "c", "e", 1);
+ ASSERT_EQ("1,1,2", FilesPerLevel(1));
+
+ // Compact just the new range
+ Compact(1, "b", "f");
+ ASSERT_EQ("0,0,2", FilesPerLevel(1));
+
+ // Compact all
+ MakeTables(1, "a", "z", 1);
+ ASSERT_EQ("1,0,2", FilesPerLevel(1));
+ CancelAllBackgroundWork(db_);
+ db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr);
+ ASSERT_EQ("1,0,2", FilesPerLevel(1));
+
+ if (iter == 0) {
+ options = CurrentOptions();
+ options.num_levels = 3;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ }
+ }
+}
+
+TEST_F(DBTest, PreShutdownFlush) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(Put(1, "key", "value"));
+ CancelAllBackgroundWork(db_);
+ Status s =
+ db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr);
+ ASSERT_TRUE(s.IsShutdownInProgress());
+}
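+
+// Illustrative sketch added for clarity (not part of the upstream test
+// suite; disabled by default): the typical pre-shutdown sequence the tests
+// in this group exercise -- cancel background work, optionally waiting for
+// jobs already in flight, before closing the DB.
+TEST_F(DBTest, DISABLED_PreShutdownSequenceSketch) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("key", "value"));
+  // Pass wait=true to block until already-running jobs have finished.
+  CancelAllBackgroundWork(db_, /*wait=*/true);
+  Close();
+}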
+
+TEST_P(DBTestWithParam, PreShutdownMultipleCompaction) {
+ const int kTestKeySize = 16;
+ const int kTestValueSize = 984;
+ const int kEntrySize = kTestKeySize + kTestValueSize;
+ const int kEntriesPerBuffer = 40;
+ const int kNumL0Files = 4;
+
+ const int kHighPriCount = 3;
+ const int kLowPriCount = 5;
+ env_->SetBackgroundThreads(kHighPriCount, Env::HIGH);
+ env_->SetBackgroundThreads(kLowPriCount, Env::LOW);
+
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+ options.compaction_style = kCompactionStyleLevel;
+ options.target_file_size_base = options.write_buffer_size;
+ options.max_bytes_for_level_base =
+ options.target_file_size_base * kNumL0Files;
+ options.compression = kNoCompression;
+ options = CurrentOptions(options);
+ options.env = env_;
+ options.enable_thread_tracking = true;
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.max_bytes_for_level_multiplier = 2;
+ options.max_background_compactions = kLowPriCount;
+ options.level0_stop_writes_trigger = 1 << 10;
+ options.level0_slowdown_writes_trigger = 1 << 10;
+ options.max_subcompactions = max_subcompactions_;
+
+ TryReopen(options);
+ Random rnd(301);
+
+ std::vector<ThreadStatus> thread_list;
+ // Delay both flush and compaction
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"FlushJob::FlushJob()", "CompactionJob::Run():Start"},
+ {"CompactionJob::Run():Start",
+ "DBTest::PreShutdownMultipleCompaction:Preshutdown"},
+ {"CompactionJob::Run():Start",
+ "DBTest::PreShutdownMultipleCompaction:VerifyCompaction"},
+ {"DBTest::PreShutdownMultipleCompaction:Preshutdown",
+ "CompactionJob::Run():End"},
+ {"CompactionJob::Run():End",
+ "DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Make rocksdb busy
+ int key = 0;
+ // check how many threads are doing compaction using GetThreadList
+ int operation_count[ThreadStatus::NUM_OP_TYPES] = {0};
+ for (int file = 0; file < 16 * kNumL0Files; ++file) {
+ for (int k = 0; k < kEntriesPerBuffer; ++k) {
+ ASSERT_OK(Put(ToString(key++), RandomString(&rnd, kTestValueSize)));
+ }
+
+ Status s = env_->GetThreadList(&thread_list);
+ for (auto thread : thread_list) {
+ operation_count[thread.operation_type]++;
+ }
+
+ // Speed up the test
+ if (operation_count[ThreadStatus::OP_FLUSH] > 1 &&
+ operation_count[ThreadStatus::OP_COMPACTION] >
+ 0.6 * options.max_background_compactions) {
+ break;
+ }
+ if (file == 15 * kNumL0Files) {
+ TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown");
+ }
+ }
+
+ TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown");
+ ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1);
+ CancelAllBackgroundWork(db_);
+ TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown");
+ dbfull()->TEST_WaitForCompact();
+  // Count the operations still running after the shutdown request.
+ for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) {
+ operation_count[i] = 0;
+ }
+ Status s = env_->GetThreadList(&thread_list);
+ for (auto thread : thread_list) {
+ operation_count[thread.operation_type]++;
+ }
+ ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0);
+}
+
+TEST_P(DBTestWithParam, PreShutdownCompactionMiddle) {
+ const int kTestKeySize = 16;
+ const int kTestValueSize = 984;
+ const int kEntrySize = kTestKeySize + kTestValueSize;
+ const int kEntriesPerBuffer = 40;
+ const int kNumL0Files = 4;
+
+ const int kHighPriCount = 3;
+ const int kLowPriCount = 5;
+ env_->SetBackgroundThreads(kHighPriCount, Env::HIGH);
+ env_->SetBackgroundThreads(kLowPriCount, Env::LOW);
+
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+ options.compaction_style = kCompactionStyleLevel;
+ options.target_file_size_base = options.write_buffer_size;
+ options.max_bytes_for_level_base =
+ options.target_file_size_base * kNumL0Files;
+ options.compression = kNoCompression;
+ options = CurrentOptions(options);
+ options.env = env_;
+ options.enable_thread_tracking = true;
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.max_bytes_for_level_multiplier = 2;
+ options.max_background_compactions = kLowPriCount;
+ options.level0_stop_writes_trigger = 1 << 10;
+ options.level0_slowdown_writes_trigger = 1 << 10;
+ options.max_subcompactions = max_subcompactions_;
+
+ TryReopen(options);
+ Random rnd(301);
+
+ std::vector<ThreadStatus> thread_list;
+ // Delay both flush and compaction
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBTest::PreShutdownCompactionMiddle:Preshutdown",
+ "CompactionJob::Run():Inprogress"},
+ {"CompactionJob::Run():Start",
+ "DBTest::PreShutdownCompactionMiddle:VerifyCompaction"},
+ {"CompactionJob::Run():Inprogress", "CompactionJob::Run():End"},
+ {"CompactionJob::Run():End",
+ "DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Make rocksdb busy
+ int key = 0;
+ // check how many threads are doing compaction using GetThreadList
+ int operation_count[ThreadStatus::NUM_OP_TYPES] = {0};
+ for (int file = 0; file < 16 * kNumL0Files; ++file) {
+ for (int k = 0; k < kEntriesPerBuffer; ++k) {
+ ASSERT_OK(Put(ToString(key++), RandomString(&rnd, kTestValueSize)));
+ }
+
+ Status s = env_->GetThreadList(&thread_list);
+ for (auto thread : thread_list) {
+ operation_count[thread.operation_type]++;
+ }
+
+ // Speed up the test
+ if (operation_count[ThreadStatus::OP_FLUSH] > 1 &&
+ operation_count[ThreadStatus::OP_COMPACTION] >
+ 0.6 * options.max_background_compactions) {
+ break;
+ }
+ if (file == 15 * kNumL0Files) {
+ TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyCompaction");
+ }
+ }
+
+ ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1);
+ CancelAllBackgroundWork(db_);
+ TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:Preshutdown");
+ TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown");
+ dbfull()->TEST_WaitForCompact();
+  // Count the operations still running after the shutdown request.
+ for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) {
+ operation_count[i] = 0;
+ }
+ Status s = env_->GetThreadList(&thread_list);
+ for (auto thread : thread_list) {
+ operation_count[thread.operation_type]++;
+ }
+ ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0);
+}
+
+#endif // ROCKSDB_USING_THREAD_STATUS
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, FlushOnDestroy) {
+ WriteOptions wo;
+ wo.disableWAL = true;
+ ASSERT_OK(Put("foo", "v1", wo));
+ CancelAllBackgroundWork(db_);
+}
+
+TEST_F(DBTest, DynamicLevelCompressionPerLevel) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+ const int kNKeys = 120;
+ int keys[kNKeys];
+ for (int i = 0; i < kNKeys; i++) {
+ keys[i] = i;
+ }
+ std::random_shuffle(std::begin(keys), std::end(keys));
+
+ Random rnd(301);
+ Options options;
+ options.create_if_missing = true;
+ options.db_write_buffer_size = 20480;
+ options.write_buffer_size = 20480;
+ options.max_write_buffer_number = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 2;
+ options.target_file_size_base = 20480;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.max_bytes_for_level_base = 102400;
+ options.max_bytes_for_level_multiplier = 4;
+ options.max_background_compactions = 1;
+ options.num_levels = 5;
+
+ options.compression_per_level.resize(3);
+ options.compression_per_level[0] = kNoCompression;
+ options.compression_per_level[1] = kNoCompression;
+ options.compression_per_level[2] = kSnappyCompression;
+
+ OnFileDeletionListener* listener = new OnFileDeletionListener();
+ options.listeners.emplace_back(listener);
+
+ DestroyAndReopen(options);
+
+ // Insert more than 80K. L4 should be base level. Neither L0 nor L4 should
+ // be compressed, so total data size should be more than 80K.
+ for (int i = 0; i < 20; i++) {
+ ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
+ }
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(3), 0);
+  // Assuming each file's metadata is at least 50 bytes.
+ ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(4), 20U * 4000U + 50U * 4);
+
+ // Insert 400KB. Some data will be compressed
+ for (int i = 21; i < 120; i++) {
+ ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
+ }
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+ ASSERT_LT(SizeAtLevel(0) + SizeAtLevel(3) + SizeAtLevel(4),
+ 120U * 4000U + 50U * 24);
+  // Make sure data in L3 files is not compacted by removing all files in
+  // L4, then count the number of rows.
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "true"},
+ }));
+ ColumnFamilyMetaData cf_meta;
+ db_->GetColumnFamilyMetaData(&cf_meta);
+ for (auto file : cf_meta.levels[4].files) {
+ listener->SetExpectedFileName(dbname_ + file.name);
+ ASSERT_OK(dbfull()->DeleteFile(file.name));
+ }
+ listener->VerifyMatchedCount(cf_meta.levels[4].files.size());
+
+ int num_keys = 0;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ num_keys++;
+ }
+ ASSERT_OK(iter->status());
+ ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(3), num_keys * 4000U + num_keys * 10U);
+}
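+
+// Illustrative sketch added for clarity (not part of the upstream test
+// suite; disabled by default): a minimal per-level compression setup of the
+// kind exercised above. With dynamic level bytes, the listed compression
+// types are mapped onto the levels actually in use.
+TEST_F(DBTest, DISABLED_CompressionPerLevelSketch) {
+  if (!Snappy_Supported()) {
+    return;
+  }
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.num_levels = 5;
+  options.compression_per_level = {kNoCompression, kNoCompression,
+                                   kSnappyCompression};
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("k", "v"));
+  ASSERT_OK(Flush());
+}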
+
+TEST_F(DBTest, DynamicLevelCompressionPerLevel2) {
+ if (!Snappy_Supported() || !LZ4_Supported() || !Zlib_Supported()) {
+ return;
+ }
+ const int kNKeys = 500;
+ int keys[kNKeys];
+ for (int i = 0; i < kNKeys; i++) {
+ keys[i] = i;
+ }
+ std::random_shuffle(std::begin(keys), std::end(keys));
+
+ Random rnd(301);
+ Options options;
+ options.create_if_missing = true;
+ options.db_write_buffer_size = 6000000;
+ options.write_buffer_size = 600000;
+ options.max_write_buffer_number = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 2;
+ options.soft_pending_compaction_bytes_limit = 1024 * 1024;
+ options.target_file_size_base = 20;
+
+ options.level_compaction_dynamic_level_bytes = true;
+ options.max_bytes_for_level_base = 200;
+ options.max_bytes_for_level_multiplier = 8;
+ options.max_background_compactions = 1;
+ options.num_levels = 5;
+ std::shared_ptr<mock::MockTableFactory> mtf(new mock::MockTableFactory);
+ options.table_factory = mtf;
+
+ options.compression_per_level.resize(3);
+ options.compression_per_level[0] = kNoCompression;
+ options.compression_per_level[1] = kLZ4Compression;
+ options.compression_per_level[2] = kZlibCompression;
+
+ DestroyAndReopen(options);
+ // When base level is L4, L4 is LZ4.
+ std::atomic<int> num_zlib(0);
+ std::atomic<int> num_lz4(0);
+ std::atomic<int> num_no(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ if (compaction->output_level() == 4) {
+ ASSERT_TRUE(compaction->output_compression() == kLZ4Compression);
+ num_lz4.fetch_add(1);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) {
+ auto* compression = reinterpret_cast<CompressionType*>(arg);
+ ASSERT_TRUE(*compression == kNoCompression);
+ num_no.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int i = 0; i < 100; i++) {
+ std::string value = RandomString(&rnd, 200);
+ ASSERT_OK(Put(Key(keys[i]), value));
+ if (i % 25 == 24) {
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ }
+ }
+
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(3), 0);
+ ASSERT_GT(NumTableFilesAtLevel(4), 0);
+ ASSERT_GT(num_no.load(), 2);
+ ASSERT_GT(num_lz4.load(), 0);
+ int prev_num_files_l4 = NumTableFilesAtLevel(4);
+
+  // After the base level turns from L4 to L3, L3 becomes LZ4 and L4
+  // becomes Zlib.
+ num_lz4.store(0);
+ num_no.store(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ if (compaction->output_level() == 4 && compaction->start_level() == 3) {
+ ASSERT_TRUE(compaction->output_compression() == kZlibCompression);
+ num_zlib.fetch_add(1);
+ } else {
+ ASSERT_TRUE(compaction->output_compression() == kLZ4Compression);
+ num_lz4.fetch_add(1);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) {
+ auto* compression = reinterpret_cast<CompressionType*>(arg);
+ ASSERT_TRUE(*compression == kNoCompression);
+ num_no.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int i = 101; i < 500; i++) {
+ std::string value = RandomString(&rnd, 200);
+ ASSERT_OK(Put(Key(keys[i]), value));
+ if (i % 100 == 99) {
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ }
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+ ASSERT_GT(NumTableFilesAtLevel(3), 0);
+ ASSERT_GT(NumTableFilesAtLevel(4), prev_num_files_l4);
+ ASSERT_GT(num_no.load(), 2);
+ ASSERT_GT(num_lz4.load(), 0);
+ ASSERT_GT(num_zlib.load(), 0);
+}
+
+TEST_F(DBTest, DynamicCompactionOptions) {
+ // minimum write buffer size is enforced at 64KB
+ const uint64_t k32KB = 1 << 15;
+ const uint64_t k64KB = 1 << 16;
+ const uint64_t k128KB = 1 << 17;
+ const uint64_t k1MB = 1 << 20;
+ const uint64_t k4KB = 1 << 12;
+ Options options;
+ options.env = env_;
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.soft_pending_compaction_bytes_limit = 1024 * 1024;
+ options.write_buffer_size = k64KB;
+ options.arena_block_size = 4 * k4KB;
+ options.max_write_buffer_number = 2;
+ // Compaction related options
+ options.level0_file_num_compaction_trigger = 3;
+ options.level0_slowdown_writes_trigger = 4;
+ options.level0_stop_writes_trigger = 8;
+ options.target_file_size_base = k64KB;
+ options.max_compaction_bytes = options.target_file_size_base * 10;
+ options.target_file_size_multiplier = 1;
+ options.max_bytes_for_level_base = k128KB;
+ options.max_bytes_for_level_multiplier = 4;
+
+ // Block flush thread and disable compaction thread
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ DestroyAndReopen(options);
+
+ auto gen_l0_kb = [this](int start, int size, int stride) {
+ Random rnd(301);
+ for (int i = 0; i < size; i++) {
+ ASSERT_OK(Put(Key(start + stride * i), RandomString(&rnd, 1024)));
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ };
+
+ // Write 3 files that have the same key range.
+ // Since level0_file_num_compaction_trigger is 3, compaction should be
+ // triggered. The compaction should result in one L1 file
+ gen_l0_kb(0, 64, 1);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+ gen_l0_kb(0, 64, 1);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+ gen_l0_kb(0, 64, 1);
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("0,1", FilesPerLevel());
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(1U, metadata.size());
+ ASSERT_LE(metadata[0].size, k64KB + k4KB);
+ ASSERT_GE(metadata[0].size, k64KB - k4KB);
+
+ // Test compaction trigger and target_file_size_base
+ // Reduce compaction trigger to 2, and reduce L1 file size to 32KB.
+  // Writing two 64KB L0 files should trigger a compaction. Since these
+  // 2 L0 files have the same key range, the compaction merges them and
+  // should result in two 32KB L1 files.
+ ASSERT_OK(dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"},
+ {"target_file_size_base", ToString(k32KB)}}));
+
+ gen_l0_kb(0, 64, 1);
+ ASSERT_EQ("1,1", FilesPerLevel());
+ gen_l0_kb(0, 64, 1);
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("0,2", FilesPerLevel());
+ metadata.clear();
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(2U, metadata.size());
+ ASSERT_LE(metadata[0].size, k32KB + k4KB);
+ ASSERT_GE(metadata[0].size, k32KB - k4KB);
+ ASSERT_LE(metadata[1].size, k32KB + k4KB);
+ ASSERT_GE(metadata[1].size, k32KB - k4KB);
+
+ // Test max_bytes_for_level_base
+  // Increase the level base size to 1MB and write enough data to fill L1
+  // and L2. L1 size should be around 1MB while L2 size should be around
+  // 1MB x 4.
+ ASSERT_OK(
+ dbfull()->SetOptions({{"max_bytes_for_level_base", ToString(k1MB)}}));
+
+ // writing 96 x 64KB => 6 * 1024KB
+ // (L1 + L2) = (1 + 4) * 1024KB
+ for (int i = 0; i < 96; ++i) {
+ gen_l0_kb(i, 64, 96);
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_GT(SizeAtLevel(1), k1MB / 2);
+ ASSERT_LT(SizeAtLevel(1), k1MB + k1MB / 2);
+
+ // Within (0.5, 1.5) of 4MB.
+ ASSERT_GT(SizeAtLevel(2), 2 * k1MB);
+ ASSERT_LT(SizeAtLevel(2), 6 * k1MB);
+
+  // Test max_bytes_for_level_multiplier and
+  // max_bytes_for_level_base. Now, reduce both the multiplier and the level
+  // base. After filling enough data that can fit in L1 - L3, we should see
+  // L1 size reduce to 128KB from the ~1MB asserted previously. Same for L2.
+ ASSERT_OK(
+ dbfull()->SetOptions({{"max_bytes_for_level_multiplier", "2"},
+ {"max_bytes_for_level_base", ToString(k128KB)}}));
+
+ // writing 20 x 64KB = 10 x 128KB
+ // (L1 + L2 + L3) = (1 + 2 + 4) * 128KB
+ for (int i = 0; i < 20; ++i) {
+ gen_l0_kb(i, 64, 32);
+ }
+ dbfull()->TEST_WaitForCompact();
+ uint64_t total_size = SizeAtLevel(1) + SizeAtLevel(2) + SizeAtLevel(3);
+ ASSERT_TRUE(total_size < k128KB * 7 * 1.5);
+
+ // Test level0_stop_writes_trigger.
+  // Clean up memtable and L0. Block compaction threads. If we continue to
+  // write and flush memtables, we should see puts stop after 8 memtable
+  // flushes since level0_stop_writes_trigger = 8.
+ dbfull()->TEST_FlushMemTable(true, true);
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ // Block compaction
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ sleeping_task_low.WaitUntilSleeping();
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ int count = 0;
+ Random rnd(301);
+ WriteOptions wo;
+ while (count < 64) {
+ ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo));
+ dbfull()->TEST_FlushMemTable(true, true);
+ count++;
+ if (dbfull()->TEST_write_controler().IsStopped()) {
+ sleeping_task_low.WakeUp();
+ break;
+ }
+ }
+ // Stop trigger = 8
+ ASSERT_EQ(count, 8);
+ // Unblock
+ sleeping_task_low.WaitUntilDone();
+
+ // Now reduce level0_stop_writes_trigger to 6. Clear up memtables and L0.
+ // Block compaction thread again. Perform the put and memtable flushes
+ // until we see the stop after 6 memtable flushes.
+ ASSERT_OK(dbfull()->SetOptions({{"level0_stop_writes_trigger", "6"}}));
+ dbfull()->TEST_FlushMemTable(true);
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+ // Block compaction again
+ sleeping_task_low.Reset();
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ sleeping_task_low.WaitUntilSleeping();
+ count = 0;
+ while (count < 64) {
+ ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo));
+ dbfull()->TEST_FlushMemTable(true, true);
+ count++;
+ if (dbfull()->TEST_write_controler().IsStopped()) {
+ sleeping_task_low.WakeUp();
+ break;
+ }
+ }
+ ASSERT_EQ(count, 6);
+ // Unblock
+ sleeping_task_low.WaitUntilDone();
+
+ // Test disable_auto_compactions
+  // The compaction thread is unblocked but auto compaction is disabled.
+  // Writing 4 L0 files would normally trigger a compaction, but with auto
+  // compaction disabled, TEST_WaitForCompact waits for nothing and the
+  // number of L0 files does not change after the call.
+ ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "true"}}));
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+ for (int i = 0; i < 4; ++i) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
+ // Wait for compaction so that put won't stop
+ dbfull()->TEST_FlushMemTable(true);
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(NumTableFilesAtLevel(0), 4);
+
+ // Enable auto compaction and perform the same test, # of L0 files should be
+ // reduced after compaction.
+ ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+ for (int i = 0; i < 4; ++i) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
+ // Wait for compaction so that put won't stop
+ dbfull()->TEST_FlushMemTable(true);
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_LT(NumTableFilesAtLevel(0), 4);
+}
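+
+// Illustrative sketch added for clarity (not part of the upstream test
+// suite; disabled by default): changing several compaction-related options
+// in a single SetOptions() call, as the test above does piecemeal. The
+// values are arbitrary and only meant to show the string-map interface.
+TEST_F(DBTest, DISABLED_SetCompactionOptionsSketch) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+  ASSERT_OK(dbfull()->SetOptions({
+      {"level0_file_num_compaction_trigger", "2"},
+      {"target_file_size_base", "32768"},
+      {"max_bytes_for_level_base", "131072"},
+  }));
+  ASSERT_EQ(2, dbfull()->GetOptions().level0_file_num_compaction_trigger);
+}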
+
+// Test dynamic FIFO compaction options.
+// This test covers just option parsing and makes sure that the options are
+// correctly assigned. Also see the DBOptionsTest.SetFIFOCompactionOptions
+// test, which makes sure that the FIFO compaction functionality works
+// as expected when the options are changed dynamically.
+// Even more FIFOCompactionTests are at DBTest.FIFOCompaction* .
+TEST_F(DBTest, DynamicFIFOCompactionOptions) {
+ Options options;
+ options.ttl = 0;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ // Initial defaults
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 1024 * 1024 * 1024);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 0);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ false);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo", "{max_table_files_size=23;}"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 23);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 0);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ false);
+
+ ASSERT_OK(dbfull()->SetOptions({{"ttl", "97"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 23);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 97);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ false);
+
+ ASSERT_OK(dbfull()->SetOptions({{"ttl", "203"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 23);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 203);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ false);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo", "{allow_compaction=true;}"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 23);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 203);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ true);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo", "{max_table_files_size=31;}"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 31);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 203);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ true);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo",
+ "{max_table_files_size=51;allow_compaction=true;}"}}));
+ ASSERT_OK(dbfull()->SetOptions({{"ttl", "49"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 51);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 49);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ true);
+}
+
+TEST_F(DBTest, DynamicUniversalCompactionOptions) {
+ Options options;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ // Initial defaults
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 1U);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width,
+ 2u);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width,
+ UINT_MAX);
+ ASSERT_EQ(dbfull()
+ ->GetOptions()
+ .compaction_options_universal.max_size_amplification_percent,
+ 200u);
+ ASSERT_EQ(dbfull()
+ ->GetOptions()
+ .compaction_options_universal.compression_size_percent,
+ -1);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.stop_style,
+ kCompactionStopStyleTotalSize);
+ ASSERT_EQ(
+ dbfull()->GetOptions().compaction_options_universal.allow_trivial_move,
+ false);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_universal", "{size_ratio=7;}"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 7u);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width,
+ 2u);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width,
+ UINT_MAX);
+ ASSERT_EQ(dbfull()
+ ->GetOptions()
+ .compaction_options_universal.max_size_amplification_percent,
+ 200u);
+ ASSERT_EQ(dbfull()
+ ->GetOptions()
+ .compaction_options_universal.compression_size_percent,
+ -1);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.stop_style,
+ kCompactionStopStyleTotalSize);
+ ASSERT_EQ(
+ dbfull()->GetOptions().compaction_options_universal.allow_trivial_move,
+ false);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_universal", "{min_merge_width=11;}"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 7u);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width,
+ 11u);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width,
+ UINT_MAX);
+ ASSERT_EQ(dbfull()
+ ->GetOptions()
+ .compaction_options_universal.max_size_amplification_percent,
+ 200u);
+ ASSERT_EQ(dbfull()
+ ->GetOptions()
+ .compaction_options_universal.compression_size_percent,
+ -1);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.stop_style,
+ kCompactionStopStyleTotalSize);
+ ASSERT_EQ(
+ dbfull()->GetOptions().compaction_options_universal.allow_trivial_move,
+ false);
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, FileCreationRandomFailure) {
+ Options options;
+ options.env = env_;
+ options.create_if_missing = true;
+ options.write_buffer_size = 100000; // Small write buffer
+ options.target_file_size_base = 200000;
+ options.max_bytes_for_level_base = 1000000;
+ options.max_bytes_for_level_multiplier = 2;
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ const int kCDTKeysPerBuffer = 4;
+ const int kTestSize = kCDTKeysPerBuffer * 4096;
+ const int kTotalIteration = 100;
+  // The second half of the test involves random failures
+  // of file creation.
+ const int kRandomFailureTest = kTotalIteration / 2;
+ std::vector<std::string> values;
+ for (int i = 0; i < kTestSize; ++i) {
+ values.push_back("NOT_FOUND");
+ }
+ for (int j = 0; j < kTotalIteration; ++j) {
+ if (j == kRandomFailureTest) {
+ env_->non_writeable_rate_.store(90);
+ }
+ for (int k = 0; k < kTestSize; ++k) {
+      // Here we expect some of the Puts to fail.
+ std::string value = RandomString(&rnd, 100);
+ Status s = Put(Key(k), Slice(value));
+ if (s.ok()) {
+ // update the latest successful put
+ values[k] = value;
+ }
+      // But everything before we start simulating failures should succeed.
+ if (j < kRandomFailureTest) {
+ ASSERT_OK(s);
+ }
+ }
+ }
+
+  // If rocksdb does not do the job correctly, an internal assert will fail here.
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+
+ // verify we have the latest successful update
+ for (int k = 0; k < kTestSize; ++k) {
+ auto v = Get(Key(k));
+ ASSERT_EQ(v, values[k]);
+ }
+
+ // reopen and reverify we have the latest successful update
+ env_->non_writeable_rate_.store(0);
+ Reopen(options);
+ for (int k = 0; k < kTestSize; ++k) {
+ auto v = Get(Key(k));
+ ASSERT_EQ(v, values[k]);
+ }
+}
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBTest, DynamicMiscOptions) {
+ // Test max_sequential_skip_in_iterations
+ Options options;
+ options.env = env_;
+ options.create_if_missing = true;
+ options.max_sequential_skip_in_iterations = 16;
+ options.compression = kNoCompression;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ auto assert_reseek_count = [this, &options](int key_start, int num_reseek) {
+ int key0 = key_start;
+ int key1 = key_start + 1;
+ int key2 = key_start + 2;
+ Random rnd(301);
+ ASSERT_OK(Put(Key(key0), RandomString(&rnd, 8)));
+ for (int i = 0; i < 10; ++i) {
+ ASSERT_OK(Put(Key(key1), RandomString(&rnd, 8)));
+ }
+ ASSERT_OK(Put(Key(key2), RandomString(&rnd, 8)));
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ iter->Seek(Key(key1));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Key(key1)), 0);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Key(key2)), 0);
+ ASSERT_EQ(num_reseek,
+ TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION));
+ };
+ // No reseek
+ assert_reseek_count(100, 0);
+
+ ASSERT_OK(dbfull()->SetOptions({{"max_sequential_skip_in_iterations", "4"}}));
+ // Clear memtable and make new option effective
+ dbfull()->TEST_FlushMemTable(true);
+ // Trigger reseek
+ assert_reseek_count(200, 1);
+
+ ASSERT_OK(
+ dbfull()->SetOptions({{"max_sequential_skip_in_iterations", "16"}}));
+ // Clear memtable and make new option effective
+ dbfull()->TEST_FlushMemTable(true);
+ // No reseek
+ assert_reseek_count(300, 1);
+
+ MutableCFOptions mutable_cf_options;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ // Test soft_pending_compaction_bytes_limit,
+ // hard_pending_compaction_bytes_limit
+ ASSERT_OK(dbfull()->SetOptions(
+ handles_[1], {{"soft_pending_compaction_bytes_limit", "200"},
+ {"hard_pending_compaction_bytes_limit", "300"}}));
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+ &mutable_cf_options));
+ ASSERT_EQ(200, mutable_cf_options.soft_pending_compaction_bytes_limit);
+ ASSERT_EQ(300, mutable_cf_options.hard_pending_compaction_bytes_limit);
+ // Test report_bg_io_stats
+ ASSERT_OK(
+ dbfull()->SetOptions(handles_[1], {{"report_bg_io_stats", "true"}}));
+ // sanity check
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+ &mutable_cf_options));
+ ASSERT_TRUE(mutable_cf_options.report_bg_io_stats);
+ // Test compression
+ // sanity check
+ ASSERT_OK(dbfull()->SetOptions({{"compression", "kNoCompression"}}));
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0],
+ &mutable_cf_options));
+ ASSERT_EQ(CompressionType::kNoCompression, mutable_cf_options.compression);
+
+ if (Snappy_Supported()) {
+ ASSERT_OK(dbfull()->SetOptions({{"compression", "kSnappyCompression"}}));
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0],
+ &mutable_cf_options));
+ ASSERT_EQ(CompressionType::kSnappyCompression,
+ mutable_cf_options.compression);
+ }
+
+ // Test paranoid_file_checks already done in db_block_cache_test
+ ASSERT_OK(
+ dbfull()->SetOptions(handles_[1], {{"paranoid_file_checks", "true"}}));
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+ &mutable_cf_options));
+ ASSERT_TRUE(mutable_cf_options.report_bg_io_stats);
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, L0L1L2AndUpHitCounter) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 32 * 1024;
+ options.target_file_size_base = 32 * 1024;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 4;
+ options.max_bytes_for_level_base = 64 * 1024;
+ options.max_write_buffer_number = 2;
+ options.max_background_compactions = 8;
+ options.max_background_flushes = 8;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ CreateAndReopenWithCF({"mypikachu"}, options);
+
+ int numkeys = 20000;
+ for (int i = 0; i < numkeys; i++) {
+ ASSERT_OK(Put(1, Key(i), "val"));
+ }
+ ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0));
+ ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1));
+ ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+
+ for (int i = 0; i < numkeys; i++) {
+ ASSERT_EQ(Get(1, Key(i)), "val");
+ }
+
+ ASSERT_GT(TestGetTickerCount(options, GET_HIT_L0), 100);
+ ASSERT_GT(TestGetTickerCount(options, GET_HIT_L1), 100);
+ ASSERT_GT(TestGetTickerCount(options, GET_HIT_L2_AND_UP), 100);
+
+ ASSERT_EQ(numkeys, TestGetTickerCount(options, GET_HIT_L0) +
+ TestGetTickerCount(options, GET_HIT_L1) +
+ TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+}
+
+TEST_F(DBTest, EncodeDecompressedBlockSizeTest) {
+ // iter 0 -- zlib
+ // iter 1 -- bzip2
+ // iter 2 -- lz4
+ // iter 3 -- lz4HC
+ // iter 4 -- xpress
+ CompressionType compressions[] = {kZlibCompression, kBZip2Compression,
+ kLZ4Compression, kLZ4HCCompression,
+ kXpressCompression};
+ for (auto comp : compressions) {
+ if (!CompressionTypeSupported(comp)) {
+ continue;
+ }
+ // first_table_version 1 -- generate with table_version == 1, read with
+ // table_version == 2
+ // first_table_version 2 -- generate with table_version == 2, read with
+ // table_version == 1
+ for (int first_table_version = 1; first_table_version <= 2;
+ ++first_table_version) {
+ BlockBasedTableOptions table_options;
+ table_options.format_version = first_table_version;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+ Options options = CurrentOptions();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.create_if_missing = true;
+ options.compression = comp;
+ DestroyAndReopen(options);
+
+ int kNumKeysWritten = 1000;
+
+ Random rnd(301);
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ // compressible string
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 128) + std::string(128, 'a')));
+ }
+
+ table_options.format_version = first_table_version == 1 ? 2 : 1;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ auto r = Get(Key(i));
+ ASSERT_EQ(r.substr(128), std::string(128, 'a'));
+ }
+ }
+ }
+}
+
+TEST_F(DBTest, CloseSpeedup) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 4;
+ options.max_bytes_for_level_base = 400 * 1024;
+ options.max_write_buffer_number = 16;
+
+ // Block background threads
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ test::SleepingBackgroundTask sleeping_task_high;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_high, Env::Priority::HIGH);
+
+ std::vector<std::string> filenames;
+ env_->GetChildren(dbname_, &filenames);
+ // Delete archival files.
+ for (size_t i = 0; i < filenames.size(); ++i) {
+ env_->DeleteFile(dbname_ + "/" + filenames[i]);
+ }
+ env_->DeleteDir(dbname_);
+ DestroyAndReopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ Random rnd(301);
+ int key_idx = 0;
+
+ // First three 110KB files are not going to level 2
+ // After that, (100K, 200K)
+ for (int num = 0; num < 5; num++) {
+ GenerateNewFile(&rnd, &key_idx, true);
+ }
+
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ Close();
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // Unblock background threads
+ sleeping_task_high.WakeUp();
+ sleeping_task_high.WaitUntilDone();
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+
+ Destroy(options);
+}
+
+class DelayedMergeOperator : public MergeOperator {
+ private:
+ DBTest* db_test_;
+
+ public:
+ explicit DelayedMergeOperator(DBTest* d) : db_test_(d) {}
+
+ bool FullMergeV2(const MergeOperationInput& /*merge_in*/,
+ MergeOperationOutput* merge_out) const override {
+ db_test_->env_->addon_time_.fetch_add(1000);
+ merge_out->new_value = "";
+ return true;
+ }
+
+ const char* Name() const override { return "DelayedMergeOperator"; }
+};
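+
+// Illustrative sketch added for clarity (not part of the upstream test
+// suite): a more conventional FullMergeV2() than DelayedMergeOperator
+// above -- it concatenates the existing value and all operands, which is
+// enough to show the interface. It would be wired up the same way, e.g.
+// options.merge_operator.reset(new AppendMergeOperatorSketch());
+class AppendMergeOperatorSketch : public MergeOperator {
+ public:
+  bool FullMergeV2(const MergeOperationInput& merge_in,
+                   MergeOperationOutput* merge_out) const override {
+    merge_out->new_value.clear();
+    if (merge_in.existing_value != nullptr) {
+      merge_out->new_value.assign(merge_in.existing_value->data(),
+                                  merge_in.existing_value->size());
+    }
+    for (const Slice& operand : merge_in.operand_list) {
+      merge_out->new_value.append(operand.data(), operand.size());
+    }
+    return true;
+  }
+
+  const char* Name() const override { return "AppendMergeOperatorSketch"; }
+};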
+
+TEST_F(DBTest, MergeTestTime) {
+ std::string one, two, three;
+ PutFixed64(&one, 1);
+ PutFixed64(&two, 2);
+ PutFixed64(&three, 3);
+
+ // Enable time profiling
+ SetPerfLevel(kEnableTime);
+ this->env_->addon_time_.store(0);
+ this->env_->time_elapse_only_sleep_ = true;
+ this->env_->no_slowdown_ = true;
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.merge_operator.reset(new DelayedMergeOperator(this));
+ DestroyAndReopen(options);
+
+ ASSERT_EQ(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0);
+ db_->Put(WriteOptions(), "foo", one);
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->Merge(WriteOptions(), "foo", two));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->Merge(WriteOptions(), "foo", three));
+ ASSERT_OK(Flush());
+
+ ReadOptions opt;
+ opt.verify_checksums = true;
+ opt.snapshot = nullptr;
+ std::string result;
+ db_->Get(opt, "foo", &result);
+
+ ASSERT_EQ(1000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME));
+
+ ReadOptions read_options;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ int count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ ++count;
+ }
+
+ ASSERT_EQ(1, count);
+ ASSERT_EQ(2000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME));
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ ASSERT_GT(TestGetTickerCount(options, FLUSH_WRITE_BYTES), 0);
+#endif // ROCKSDB_USING_THREAD_STATUS
+ this->env_->time_elapse_only_sleep_ = false;
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(DBTestWithParam, MergeCompactionTimeTest) {
+ SetPerfLevel(kEnableTime);
+ Options options = CurrentOptions();
+ options.compaction_filter_factory = std::make_shared<KeepFilterFactory>();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.merge_operator.reset(new DelayedMergeOperator(this));
+ options.compaction_style = kCompactionStyleUniversal;
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < 1000; i++) {
+ ASSERT_OK(db_->Merge(WriteOptions(), "foo", "TEST"));
+ ASSERT_OK(Flush());
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_NE(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0);
+}
+
+TEST_P(DBTestWithParam, FilterCompactionTimeTest) {
+ Options options = CurrentOptions();
+ options.compaction_filter_factory =
+ std::make_shared<DelayFilterFactory>(this);
+ options.disable_auto_compactions = true;
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.statistics->set_stats_level(kExceptTimeForMutex);
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ // put some data
+ for (int table = 0; table < 4; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ Put(ToString(table * 100 + i), "val");
+ }
+ Flush();
+ }
+
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ(0U, CountLiveFiles());
+
+ Reopen(options);
+
+ Iterator* itr = db_->NewIterator(ReadOptions());
+ itr->SeekToFirst();
+ ASSERT_NE(TestGetTickerCount(options, FILTER_OPERATION_TOTAL_TIME), 0);
+ delete itr;
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, TestLogCleanup) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 64 * 1024; // very small
+ // only two memtables allowed ==> only two log files
+ options.max_write_buffer_number = 2;
+ Reopen(options);
+
+ for (int i = 0; i < 100000; ++i) {
+ Put(Key(i), "val");
+    // Only 2 memtables will be alive at a time, so the number of logs to free
+    // should never exceed 2.
+ ASSERT_LT(dbfull()->TEST_LogsToFreeSize(), static_cast<size_t>(3));
+ }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, EmptyCompactedDB) {
+ Options options = CurrentOptions();
+ options.max_open_files = -1;
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ Status s = Put("new", "value");
+ ASSERT_TRUE(s.IsNotSupported());
+ Close();
+}
+#endif // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, SuggestCompactRangeTest) {
+ class CompactionFilterFactoryGetContext : public CompactionFilterFactory {
+ public:
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ saved_context = context;
+ std::unique_ptr<CompactionFilter> empty_filter;
+ return empty_filter;
+ }
+ const char* Name() const override {
+ return "CompactionFilterFactoryGetContext";
+ }
+ static bool IsManual(CompactionFilterFactory* compaction_filter_factory) {
+ return reinterpret_cast<CompactionFilterFactoryGetContext*>(
+ compaction_filter_factory)
+ ->saved_context.is_manual_compaction;
+ }
+ CompactionFilter::Context saved_context;
+ };
+
+ Options options = CurrentOptions();
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile));
+ options.compaction_style = kCompactionStyleLevel;
+ options.compaction_filter_factory.reset(
+ new CompactionFilterFactoryGetContext());
+ options.write_buffer_size = 200 << 10;
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 4;
+ options.num_levels = 4;
+ options.compression = kNoCompression;
+ options.max_bytes_for_level_base = 450 << 10;
+ options.target_file_size_base = 98 << 10;
+ options.max_compaction_bytes = static_cast<uint64_t>(1) << 60; // inf
+
+ Reopen(options);
+
+ Random rnd(301);
+
+ for (int num = 0; num < 3; num++) {
+ GenerateNewRandomFile(&rnd);
+ }
+
+ GenerateNewRandomFile(&rnd);
+ ASSERT_EQ("0,4", FilesPerLevel(0));
+ ASSERT_TRUE(!CompactionFilterFactoryGetContext::IsManual(
+ options.compaction_filter_factory.get()));
+
+ GenerateNewRandomFile(&rnd);
+ ASSERT_EQ("1,4", FilesPerLevel(0));
+
+ GenerateNewRandomFile(&rnd);
+ ASSERT_EQ("2,4", FilesPerLevel(0));
+
+ GenerateNewRandomFile(&rnd);
+ ASSERT_EQ("3,4", FilesPerLevel(0));
+
+ GenerateNewRandomFile(&rnd);
+ ASSERT_EQ("0,4,4", FilesPerLevel(0));
+
+ GenerateNewRandomFile(&rnd);
+ ASSERT_EQ("1,4,4", FilesPerLevel(0));
+
+ GenerateNewRandomFile(&rnd);
+ ASSERT_EQ("2,4,4", FilesPerLevel(0));
+
+ GenerateNewRandomFile(&rnd);
+ ASSERT_EQ("3,4,4", FilesPerLevel(0));
+
+ GenerateNewRandomFile(&rnd);
+ ASSERT_EQ("0,4,8", FilesPerLevel(0));
+
+ GenerateNewRandomFile(&rnd);
+ ASSERT_EQ("1,4,8", FilesPerLevel(0));
+
+ // compact it three times
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr));
+ dbfull()->TEST_WaitForCompact();
+ }
+
+ // All files are compacted
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+
+ GenerateNewRandomFile(&rnd);
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ // nonoverlapping with the file on level 0
+ Slice start("a"), end("b");
+ ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+ dbfull()->TEST_WaitForCompact();
+
+ // should not compact the level 0 file
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ start = Slice("j");
+ end = Slice("m");
+ ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_TRUE(CompactionFilterFactoryGetContext::IsManual(
+ options.compaction_filter_factory.get()));
+
+ // now it should compact the level 0 file
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+}
+
+TEST_F(DBTest, PromoteL0) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 10 * 1024 * 1024;
+ DestroyAndReopen(options);
+
+ // non overlapping ranges
+ std::vector<std::pair<int32_t, int32_t>> ranges = {
+ {81, 160}, {0, 80}, {161, 240}, {241, 320}};
+
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+ for (const auto& range : ranges) {
+ for (int32_t j = range.first; j < range.second; j++) {
+ values[j] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(j), values[j]));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ int32_t level0_files = NumTableFilesAtLevel(0, 0);
+ ASSERT_EQ(level0_files, ranges.size());
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); // No files in L1
+
+ // Promote L0 level to L2.
+ ASSERT_OK(experimental::PromoteL0(db_, db_->DefaultColumnFamily(), 2));
+ // We expect that all the files were trivially moved from L0 to L2
+ ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2, 0), level0_files);
+
+ for (const auto& kv : values) {
+ ASSERT_EQ(Get(Key(kv.first)), kv.second);
+ }
+}
+
+TEST_F(DBTest, PromoteL0Failure) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 10 * 1024 * 1024;
+ DestroyAndReopen(options);
+
+ // Produce two L0 files with overlapping ranges.
+ ASSERT_OK(Put(Key(0), ""));
+ ASSERT_OK(Put(Key(3), ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(1), ""));
+ ASSERT_OK(Flush());
+
+ Status status;
+ // Fails because L0 has overlapping files.
+ status = experimental::PromoteL0(db_, db_->DefaultColumnFamily());
+ ASSERT_TRUE(status.IsInvalidArgument());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ // Now there is a file in L1.
+ ASSERT_GE(NumTableFilesAtLevel(1, 0), 1);
+
+ ASSERT_OK(Put(Key(5), ""));
+ ASSERT_OK(Flush());
+ // Fails because L1 is non-empty.
+ status = experimental::PromoteL0(db_, db_->DefaultColumnFamily());
+ ASSERT_TRUE(status.IsInvalidArgument());
+}
+
+// Github issue #596
+TEST_F(DBTest, CompactRangeWithEmptyBottomLevel) {
+ const int kNumLevels = 2;
+ const int kNumL0Files = 2;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.num_levels = kNumLevels;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < kNumL0Files; ++i) {
+ ASSERT_OK(Put(Key(0), RandomString(&rnd, 1024)));
+ Flush();
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), kNumL0Files);
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1), kNumL0Files);
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, AutomaticConflictsWithManualCompaction) {
+ const int kNumL0Files = 50;
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 4;
+ // never slowdown / stop
+ options.level0_slowdown_writes_trigger = 999999;
+ options.level0_stop_writes_trigger = 999999;
+ options.max_background_compactions = 10;
+ DestroyAndReopen(options);
+
+ // schedule automatic compactions after the manual one starts, but before it
+ // finishes to ensure conflict.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BackgroundCompaction:Start",
+ "DBTest::AutomaticConflictsWithManualCompaction:PrePuts"},
+ {"DBTest::AutomaticConflictsWithManualCompaction:PostPuts",
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"}});
+ std::atomic<int> callback_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::MaybeScheduleFlushOrCompaction:Conflict",
+ [&](void* /*arg*/) { callback_count.fetch_add(1); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int i = 0; i < 2; ++i) {
+ // put two keys to ensure no trivial move
+ for (int j = 0; j < 2; ++j) {
+ ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ port::Thread manual_compaction_thread([this]() {
+ CompactRangeOptions croptions;
+ croptions.exclusive_manual_compaction = true;
+ ASSERT_OK(db_->CompactRange(croptions, nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT("DBTest::AutomaticConflictsWithManualCompaction:PrePuts");
+ for (int i = 0; i < kNumL0Files; ++i) {
+ // put two keys to ensure no trivial move
+ for (int j = 0; j < 2; ++j) {
+ ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ TEST_SYNC_POINT("DBTest::AutomaticConflictsWithManualCompaction:PostPuts");
+
+ ASSERT_GE(callback_count.load(), 1);
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_NE("NOT_FOUND", Get(Key(i)));
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ manual_compaction_thread.join();
+ dbfull()->TEST_WaitForCompact();
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, CompactFilesShouldTriggerAutoCompaction) {
+ Options options = CurrentOptions();
+ options.max_background_compactions = 1;
+ options.level0_file_num_compaction_trigger = 4;
+ options.level0_slowdown_writes_trigger = 36;
+ options.level0_stop_writes_trigger = 36;
+ DestroyAndReopen(options);
+
+ // generate files for manual compaction
+ Random rnd(301);
+ for (int i = 0; i < 2; ++i) {
+ // put two keys to ensure no trivial move
+ for (int j = 0; j < 2; ++j) {
+ ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ ROCKSDB_NAMESPACE::ColumnFamilyMetaData cf_meta_data;
+ db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data);
+
+ std::vector<std::string> input_files;
+ input_files.push_back(cf_meta_data.levels[0].files[0].name);
+
+ SyncPoint::GetInstance()->LoadDependency({
+ {"CompactFilesImpl:0",
+ "DBTest::CompactFilesShouldTriggerAutoCompaction:Begin"},
+ {"DBTest::CompactFilesShouldTriggerAutoCompaction:End",
+ "CompactFilesImpl:1"},
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ port::Thread manual_compaction_thread([&]() {
+ auto s = db_->CompactFiles(CompactionOptions(),
+ db_->DefaultColumnFamily(), input_files, 0);
+ });
+
+ TEST_SYNC_POINT(
+ "DBTest::CompactFilesShouldTriggerAutoCompaction:Begin");
+ // generate enough files to trigger compaction
+ for (int i = 0; i < 20; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data);
+ ASSERT_GT(cf_meta_data.levels[0].files.size(),
+ options.level0_file_num_compaction_trigger);
+ TEST_SYNC_POINT(
+ "DBTest::CompactFilesShouldTriggerAutoCompaction:End");
+
+ manual_compaction_thread.join();
+ dbfull()->TEST_WaitForCompact();
+
+ db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data);
+ ASSERT_LE(cf_meta_data.levels[0].files.size(),
+ options.level0_file_num_compaction_trigger);
+}
+#endif // ROCKSDB_LITE
+
+// Github issue #595
+// Large write batch with column families
+TEST_F(DBTest, LargeBatchWithColumnFamilies) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000; // Small write buffer
+ CreateAndReopenWithCF({"pikachu"}, options);
+ int64_t j = 0;
+ for (int i = 0; i < 5; i++) {
+ for (int pass = 1; pass <= 3; pass++) {
+ WriteBatch batch;
+ size_t write_size = 1024 * 1024 * (5 + i);
+ fprintf(stderr, "prepare: %" ROCKSDB_PRIszt " MB, pass:%d\n",
+ (write_size / 1024 / 1024), pass);
+ for (;;) {
+ std::string data(3000, j++ % 127 + 20);
+ data += ToString(j);
+ batch.Put(handles_[0], Slice(data), Slice(data));
+ if (batch.GetDataSize() > write_size) {
+ break;
+ }
+ }
+ fprintf(stderr, "write: %" ROCKSDB_PRIszt " MB\n",
+ (batch.GetDataSize() / 1024 / 1024));
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+ fprintf(stderr, "done\n");
+ }
+ }
+ // make sure we can re-open it.
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
+
+// Make sure that Flushes can proceed in parallel with CompactRange()
+TEST_F(DBTest, FlushesInParallelWithCompactRange) {
+ // iter == 0 -- leveled
+ // iter == 1 -- leveled, but throw in a flush between two levels compacting
+ // iter == 2 -- universal
+ for (int iter = 0; iter < 3; ++iter) {
+ Options options = CurrentOptions();
+ if (iter < 2) {
+ options.compaction_style = kCompactionStyleLevel;
+ } else {
+ options.compaction_style = kCompactionStyleUniversal;
+ }
+ options.write_buffer_size = 110 << 10;
+ options.level0_file_num_compaction_trigger = 4;
+ options.num_levels = 4;
+ options.compression = kNoCompression;
+ options.max_bytes_for_level_base = 450 << 10;
+ options.target_file_size_base = 98 << 10;
+ options.max_write_buffer_number = 2;
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int num = 0; num < 14; num++) {
+ GenerateNewRandomFile(&rnd);
+ }
+
+ if (iter == 1) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::RunManualCompaction()::1",
+ "DBTest::FlushesInParallelWithCompactRange:1"},
+ {"DBTest::FlushesInParallelWithCompactRange:2",
+ "DBImpl::RunManualCompaction()::2"}});
+ } else {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"CompactionJob::Run():Start",
+ "DBTest::FlushesInParallelWithCompactRange:1"},
+ {"DBTest::FlushesInParallelWithCompactRange:2",
+ "CompactionJob::Run():End"}});
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<port::Thread> threads;
+ threads.emplace_back([&]() { Compact("a", "z"); });
+
+ TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:1");
+
+    // This has to start a flush. If flushes are blocked, this will try to
+    // create 3 memtables, and that will fail because max_write_buffer_number
+    // is 2.
+ for (int num = 0; num < 3; num++) {
+ GenerateNewRandomFile(&rnd, /* nowait */ true);
+ }
+
+ TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:2");
+
+ for (auto& t : threads) {
+ t.join();
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DBTest, DelayedWriteRate) {
+ const int kEntriesPerMemTable = 100;
+ const int kTotalFlushes = 12;
+
+ Options options = CurrentOptions();
+ env_->SetBackgroundThreads(1, Env::LOW);
+ options.env = env_;
+ env_->no_slowdown_ = true;
+ options.write_buffer_size = 100000000;
+ options.max_write_buffer_number = 256;
+ options.max_background_compactions = 1;
+ options.level0_file_num_compaction_trigger = 3;
+ options.level0_slowdown_writes_trigger = 3;
+ options.level0_stop_writes_trigger = 999999;
+  options.delayed_write_rate = 20000000;  // Start with 20MB/s
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(kEntriesPerMemTable));
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Block compactions
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ for (int i = 0; i < 3; i++) {
+ Put(Key(i), std::string(10000, 'x'));
+ Flush();
+ }
+
+ // These writes will be slowed down to 1KB/s
+ uint64_t estimated_sleep_time = 0;
+ Random rnd(301);
+ Put("", "");
+ uint64_t cur_rate = options.delayed_write_rate;
+ for (int i = 0; i < kTotalFlushes; i++) {
+ uint64_t size_memtable = 0;
+ for (int j = 0; j < kEntriesPerMemTable; j++) {
+ auto rand_num = rnd.Uniform(20);
+      // Spread the entry sizes over a wider range.
+ size_t entry_size = rand_num * rand_num * rand_num;
+ WriteOptions wo;
+ Put(Key(i), std::string(entry_size, 'x'), wo);
+ size_memtable += entry_size + 18;
+ // Occasionally sleep a while
+ if (rnd.Uniform(20) == 6) {
+ env_->SleepForMicroseconds(2666);
+ }
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ estimated_sleep_time += size_memtable * 1000000u / cur_rate;
+    // The rate is reduced twice: once for the memtable switch and once when
+    // the flush finishes.
+ cur_rate = static_cast<uint64_t>(static_cast<double>(cur_rate) *
+ kIncSlowdownRatio * kIncSlowdownRatio);
+ }
+  // Check that the total sleep time falls within a factor of two of the
+  // estimate.
+ ASSERT_GT(env_->addon_time_.load(),
+ static_cast<int64_t>(estimated_sleep_time / 2));
+ ASSERT_LT(env_->addon_time_.load(),
+ static_cast<int64_t>(estimated_sleep_time * 2));
+
+ env_->no_slowdown_ = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBTest, HardLimit) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ env_->SetBackgroundThreads(1, Env::LOW);
+ options.max_write_buffer_number = 256;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 * 1024;
+ options.level0_file_num_compaction_trigger = 4;
+ options.level0_slowdown_writes_trigger = 999999;
+ options.level0_stop_writes_trigger = 999999;
+ options.hard_pending_compaction_bytes_limit = 800 << 10;
+ options.max_bytes_for_level_base = 10000000000u;
+ options.max_background_compactions = 1;
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+
+ env_->SetBackgroundThreads(1, Env::LOW);
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ std::atomic<int> callback_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:Wait", [&](void* /*arg*/) {
+ callback_count.fetch_add(1);
+ sleeping_task_low.WakeUp();
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ int key_idx = 0;
+ for (int num = 0; num < 5; num++) {
+ GenerateNewFile(&rnd, &key_idx, true);
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+
+ ASSERT_EQ(0, callback_count.load());
+
+ for (int num = 0; num < 5; num++) {
+ GenerateNewFile(&rnd, &key_idx, true);
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ ASSERT_GE(callback_count.load(), 1);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ sleeping_task_low.WaitUntilDone();
+}
+
+#if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
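+// Listener that records the most recently reported write stall condition;
+// CheckCondition() lets tests assert on the current stall state.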
+class WriteStallListener : public EventListener {
+ public:
+ WriteStallListener() : condition_(WriteStallCondition::kNormal) {}
+ void OnStallConditionsChanged(const WriteStallInfo& info) override {
+ MutexLock l(&mutex_);
+ condition_ = info.condition.cur;
+ }
+ bool CheckCondition(WriteStallCondition expected) {
+ MutexLock l(&mutex_);
+ return expected == condition_;
+ }
+ private:
+ port::Mutex mutex_;
+ WriteStallCondition condition_;
+};
+
+TEST_F(DBTest, SoftLimit) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000; // Small write buffer
+ options.max_write_buffer_number = 256;
+ options.level0_file_num_compaction_trigger = 1;
+ options.level0_slowdown_writes_trigger = 3;
+ options.level0_stop_writes_trigger = 999999;
+ options.delayed_write_rate = 20000; // About 200KB/s limited rate
+ options.soft_pending_compaction_bytes_limit = 160000;
+ options.target_file_size_base = 99999999; // All into one file
+ options.max_bytes_for_level_base = 50000;
+ options.max_bytes_for_level_multiplier = 10;
+ options.max_background_compactions = 1;
+ options.compression = kNoCompression;
+ WriteStallListener* listener = new WriteStallListener();
+ options.listeners.emplace_back(listener);
+
+  // FlushMemtable with opt.wait=true does not wait for
+  // `OnStallConditionsChanged` to be called. The event listener is triggered
+  // on `JobContext::Clean`, which happens after the flush result is installed.
+  // We use a sync point to create a custom WaitForFlush that waits for the
+  // context cleanup.
+ port::Mutex flush_mutex;
+ port::CondVar flush_cv(&flush_mutex);
+ bool flush_finished = false;
+ auto InstallFlushCallback = [&]() {
+ {
+ MutexLock l(&flush_mutex);
+ flush_finished = false;
+ }
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCallFlush:ContextCleanedUp", [&](void*) {
+ {
+ MutexLock l(&flush_mutex);
+ flush_finished = true;
+ }
+ flush_cv.SignalAll();
+ });
+ };
+ auto WaitForFlush = [&]() {
+ {
+ MutexLock l(&flush_mutex);
+ while (!flush_finished) {
+ flush_cv.Wait();
+ }
+ }
+ SyncPoint::GetInstance()->ClearCallBack(
+ "DBImpl::BackgroundCallFlush:ContextCleanedUp");
+ };
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Reopen(options);
+
+ // Generating 360KB in Level 3
+ for (int i = 0; i < 72; i++) {
+ Put(Key(i), std::string(5000, 'x'));
+ if (i % 10 == 0) {
+ dbfull()->TEST_FlushMemTable(true, true);
+ }
+ }
+ dbfull()->TEST_WaitForCompact();
+ MoveFilesToLevel(3);
+
+ // Generating 360KB in Level 2
+ for (int i = 0; i < 72; i++) {
+ Put(Key(i), std::string(5000, 'x'));
+ if (i % 10 == 0) {
+ dbfull()->TEST_FlushMemTable(true, true);
+ }
+ }
+ dbfull()->TEST_WaitForCompact();
+ MoveFilesToLevel(2);
+
+ Put(Key(0), "");
+
+ test::SleepingBackgroundTask sleeping_task_low;
+ // Block compactions
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ sleeping_task_low.WaitUntilSleeping();
+
+  // Create 3 L0 files, making the L0 score 3.
+ for (int i = 0; i < 3; i++) {
+ Put(Key(i), std::string(5000, 'x'));
+ Put(Key(100 - i), std::string(5000, 'x'));
+ // Flush the file. File size is around 30KB.
+ InstallFlushCallback();
+ dbfull()->TEST_FlushMemTable(true, true);
+ WaitForFlush();
+ }
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed));
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+ sleeping_task_low.Reset();
+ dbfull()->TEST_WaitForCompact();
+
+  // Now there is one L1 file, but it doesn't trigger
+  // soft_pending_compaction_bytes_limit. The L1 file size is around 30KB.
+ ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kNormal));
+
+  // Only allow one compaction to go through.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void* /*arg*/) {
+ // Schedule a sleeping task.
+ sleeping_task_low.Reset();
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_low, Env::Priority::LOW);
+ });
+
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ sleeping_task_low.WaitUntilSleeping();
+  // Create 3 L0 files, making the L0 score 3.
+ for (int i = 0; i < 3; i++) {
+ Put(Key(10 + i), std::string(5000, 'x'));
+ Put(Key(90 - i), std::string(5000, 'x'));
+ // Flush the file. File size is around 30KB.
+ InstallFlushCallback();
+ dbfull()->TEST_FlushMemTable(true, true);
+ WaitForFlush();
+ }
+
+  // Wake up the sleeping task so compaction can run, then wait for it to go
+  // back to sleep to make sure exactly one compaction goes through.
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilSleeping();
+
+  // Now there is one L1 file (around 60KB) which exceeds the 50KB base by
+  // 10KB. Given the level multiplier of 10, the estimated pending compaction
+  // bytes are around 100KB, which doesn't trigger
+  // soft_pending_compaction_bytes_limit.
+ ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kNormal));
+
+  // Create 3 L0 files, making the L0 score 3, higher than the L1 score.
+ for (int i = 0; i < 3; i++) {
+ Put(Key(20 + i), std::string(5000, 'x'));
+ Put(Key(80 - i), std::string(5000, 'x'));
+ // Flush the file. File size is around 30KB.
+ InstallFlushCallback();
+ dbfull()->TEST_FlushMemTable(true, true);
+ WaitForFlush();
+ }
+  // Wake up the sleeping task so compaction can run, then wait for it to go
+  // back to sleep to make sure exactly one compaction goes through.
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilSleeping();
+
+  // Now there is one L1 file (around 90KB) which exceeds the 50KB base by
+  // 40KB. L2 size is 360KB, so the estimated level fanout is 4 and the
+  // estimated pending compaction bytes are around 200KB, which triggers
+  // soft_pending_compaction_bytes_limit.
+ ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed));
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilSleeping();
+
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kNormal));
+
+  // Shrink the level base so L2 will hit the soft limit more easily.
+ ASSERT_OK(dbfull()->SetOptions({
+ {"max_bytes_for_level_base", "5000"},
+ }));
+
+ Put("", "");
+ Flush();
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed));
+
+ sleeping_task_low.WaitUntilSleeping();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBTest, LastWriteBufferDelay) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.max_write_buffer_number = 4;
+ options.delayed_write_rate = 20000;
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ int kNumKeysPerMemtable = 3;
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(kNumKeysPerMemtable));
+
+ Reopen(options);
+ test::SleepingBackgroundTask sleeping_task;
+ // Block flushes
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+ Env::Priority::HIGH);
+ sleeping_task.WaitUntilSleeping();
+
+  // Fill 3 memtables, one at a time. With flushes blocked, this should not
+  // trigger a write delay yet.
+ for (int i = 0; i < 3; i++) {
+ // Fill one mem table
+ for (int j = 0; j < kNumKeysPerMemtable; j++) {
+ Put(Key(j), "");
+ }
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+ }
+  // Inserting one more entry creates a fourth memtable (the last one allowed),
+  // which triggers the write slowdown.
+ Put(Key(0), "");
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+
+ sleeping_task.WakeUp();
+ sleeping_task.WaitUntilDone();
+}
+#endif // !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+
+TEST_F(DBTest, FailWhenCompressionNotSupportedTest) {
+ CompressionType compressions[] = {kZlibCompression, kBZip2Compression,
+ kLZ4Compression, kLZ4HCCompression,
+ kXpressCompression};
+ for (auto comp : compressions) {
+ if (!CompressionTypeSupported(comp)) {
+ // not supported, we should fail the Open()
+ Options options = CurrentOptions();
+ options.compression = comp;
+ ASSERT_TRUE(!TryReopen(options).ok());
+ // Try if CreateColumnFamily also fails
+ options.compression = kNoCompression;
+ ASSERT_OK(TryReopen(options));
+ ColumnFamilyOptions cf_options(options);
+ cf_options.compression = comp;
+ ColumnFamilyHandle* handle;
+ ASSERT_TRUE(!db_->CreateColumnFamily(cf_options, "name", &handle).ok());
+ }
+ }
+}
+
+TEST_F(DBTest, CreateColumnFamilyShouldFailOnIncompatibleOptions) {
+ Options options = CurrentOptions();
+ options.max_open_files = 100;
+ Reopen(options);
+
+ ColumnFamilyOptions cf_options(options);
+  // ttl is now supported even when max_open_files is not -1.
+ cf_options.ttl = 3600;
+ ColumnFamilyHandle* handle;
+ ASSERT_OK(db_->CreateColumnFamily(cf_options, "pikachu", &handle));
+ delete handle;
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, RowCache) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.row_cache = NewLRUCache(8192);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0);
+ ASSERT_EQ(Get("foo"), "bar");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+ ASSERT_EQ(Get("foo"), "bar");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+}
+
+TEST_F(DBTest, PinnableSliceAndRowCache) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.row_cache = NewLRUCache(8192);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(Get("foo"), "bar");
+ ASSERT_EQ(
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+ 1);
+
+ {
+ PinnableSlice pin_slice;
+ ASSERT_EQ(Get("foo", &pin_slice), Status::OK());
+ ASSERT_EQ(pin_slice.ToString(), "bar");
+    // Entry is already in the cache; the lookup removes it from the LRU list.
+ ASSERT_EQ(
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+ 0);
+ }
+  // After the PinnableSlice is destroyed, the element is added back to the
+  // LRU list.
+ ASSERT_EQ(
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+ 1);
+}
+
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, DeletingOldWalAfterDrop) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"Test:AllowFlushes", "DBImpl::BGWorkFlush"},
+ {"DBImpl::BGWorkFlush:done", "Test:WaitForFlush"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ Options options = CurrentOptions();
+ options.max_total_wal_size = 8192;
+ options.compression = kNoCompression;
+ options.write_buffer_size = 1 << 20;
+ options.level0_file_num_compaction_trigger = (1 << 30);
+ options.level0_slowdown_writes_trigger = (1 << 30);
+ options.level0_stop_writes_trigger = (1 << 30);
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateColumnFamilies({"cf1", "cf2"}, options);
+ ASSERT_OK(Put(0, "key1", DummyString(8192)));
+ ASSERT_OK(Put(0, "key2", DummyString(8192)));
+  // The oldest WAL should now be getting flushed.
+ ASSERT_OK(db_->DropColumnFamily(handles_[0]));
+ // all flushes should now do nothing because their CF is dropped
+ TEST_SYNC_POINT("Test:AllowFlushes");
+ TEST_SYNC_POINT("Test:WaitForFlush");
+ uint64_t lognum1 = dbfull()->TEST_LogfileNumber();
+ ASSERT_OK(Put(1, "key3", DummyString(8192)));
+ ASSERT_OK(Put(1, "key4", DummyString(8192)));
+ // new wal should have been created
+ uint64_t lognum2 = dbfull()->TEST_LogfileNumber();
+ EXPECT_GT(lognum2, lognum1);
+}
+
+TEST_F(DBTest, UnsupportedManualSync) {
+ DestroyAndReopen(CurrentOptions());
+ env_->is_wal_sync_thread_safe_.store(false);
+ Status s = db_->SyncWAL();
+ ASSERT_TRUE(s.IsNotSupported());
+}
+
+INSTANTIATE_TEST_CASE_P(DBTestWithParam, DBTestWithParam,
+ ::testing::Combine(::testing::Values(1, 4),
+ ::testing::Bool()));
+
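+// Writes that fill the small memtables should stall while background work is
+// paused and complete only after ContinueBackgroundWork() is called.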
+TEST_F(DBTest, PauseBackgroundWorkTest) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000; // Small write buffer
+ Reopen(options);
+
+ std::vector<port::Thread> threads;
+ std::atomic<bool> done(false);
+ db_->PauseBackgroundWork();
+ threads.emplace_back([&]() {
+ Random rnd(301);
+ for (int i = 0; i < 10000; ++i) {
+ Put(RandomString(&rnd, 10), RandomString(&rnd, 10));
+ }
+ done.store(true);
+ });
+ env_->SleepForMicroseconds(200000);
+ // make sure the thread is not done
+ ASSERT_FALSE(done.load());
+ db_->ContinueBackgroundWork();
+ for (auto& t : threads) {
+ t.join();
+ }
+ // now it's done
+ ASSERT_TRUE(done.load());
+}
+
+// Keep spawning short-living threads that create an iterator and quit.
+// Meanwhile in another thread keep flushing memtables.
+// This used to cause a deadlock.
+TEST_F(DBTest, ThreadLocalPtrDeadlock) {
+ std::atomic<int> flushes_done{0};
+ std::atomic<int> threads_destroyed{0};
+ auto done = [&] {
+ return flushes_done.load() > 10;
+ };
+
+ port::Thread flushing_thread([&] {
+ for (int i = 0; !done(); ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), Slice("hi"),
+ Slice(std::to_string(i).c_str())));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ int cnt = ++flushes_done;
+ fprintf(stderr, "Flushed %d times\n", cnt);
+ }
+ });
+
+ std::vector<port::Thread> thread_spawning_threads(10);
+ for (auto& t: thread_spawning_threads) {
+ t = port::Thread([&] {
+ while (!done()) {
+ {
+ port::Thread tmp_thread([&] {
+ auto it = db_->NewIterator(ReadOptions());
+ delete it;
+ });
+ tmp_thread.join();
+ }
+ ++threads_destroyed;
+ }
+ });
+ }
+
+ for (auto& t: thread_spawning_threads) {
+ t.join();
+ }
+ flushing_thread.join();
+ fprintf(stderr, "Done. Flushed %d times, destroyed %d threads\n",
+ flushes_done.load(), threads_destroyed.load());
+}
+
+TEST_F(DBTest, LargeBlockSizeTest) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(Put(0, "foo", "bar"));
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 8LL * 1024 * 1024 * 1024LL;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ASSERT_NOK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBTest, CreationTimeOfOldestFile) {
+ const int kNumKeysPerFile = 32;
+ const int kNumLevelFiles = 2;
+ const int kValueSize = 100;
+
+ Options options = CurrentOptions();
+ options.max_open_files = -1;
+ env_->time_elapse_only_sleep_ = false;
+ options.env = env_;
+
+ env_->addon_time_.store(0);
+ DestroyAndReopen(options);
+
+ bool set_file_creation_time_to_zero = true;
+ int idx = 0;
+
+ int64_t time_1 = 0;
+ env_->GetCurrentTime(&time_1);
+ const uint64_t uint_time_1 = static_cast<uint64_t>(time_1);
+
+ // Add 50 hours
+ env_->addon_time_.fetch_add(50 * 60 * 60);
+
+ int64_t time_2 = 0;
+ env_->GetCurrentTime(&time_2);
+ const uint64_t uint_time_2 = static_cast<uint64_t>(time_2);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "PropertyBlockBuilder::AddTableProperty:Start", [&](void* arg) {
+ TableProperties* props = reinterpret_cast<TableProperties*>(arg);
+ if (set_file_creation_time_to_zero) {
+ if (idx == 0) {
+ props->file_creation_time = 0;
+ idx++;
+ } else if (idx == 1) {
+ props->file_creation_time = uint_time_1;
+ idx = 0;
+ }
+ } else {
+ if (idx == 0) {
+ props->file_creation_time = uint_time_1;
+ idx++;
+ } else if (idx == 1) {
+ props->file_creation_time = uint_time_2;
+ }
+ }
+ });
+  // Set all file creation times in the manifest to 0.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "FileMetaData::FileMetaData", [&](void* arg) {
+ FileMetaData* meta = static_cast<FileMetaData*>(arg);
+ meta->file_creation_time = 0;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize)));
+ }
+ Flush();
+ }
+
+ // At this point there should be 2 files, one with file_creation_time = 0 and
+ // the other non-zero. GetCreationTimeOfOldestFile API should return 0.
+ uint64_t creation_time;
+ Status s1 = dbfull()->GetCreationTimeOfOldestFile(&creation_time);
+ ASSERT_EQ(0, creation_time);
+ ASSERT_EQ(s1, Status::OK());
+
+ // Testing with non-zero file creation time.
+ set_file_creation_time_to_zero = false;
+ options = CurrentOptions();
+ options.max_open_files = -1;
+ env_->time_elapse_only_sleep_ = false;
+ options.env = env_;
+
+ env_->addon_time_.store(0);
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize)));
+ }
+ Flush();
+ }
+
+ // At this point there should be 2 files with non-zero file creation time.
+ // GetCreationTimeOfOldestFile API should return non-zero value.
+ uint64_t ctime;
+ Status s2 = dbfull()->GetCreationTimeOfOldestFile(&ctime);
+ ASSERT_EQ(uint_time_1, ctime);
+ ASSERT_EQ(s2, Status::OK());
+
+ // Testing with max_open_files != -1
+ options = CurrentOptions();
+ options.max_open_files = 10;
+ DestroyAndReopen(options);
+ Status s3 = dbfull()->GetCreationTimeOfOldestFile(&ctime);
+ ASSERT_EQ(s3, Status::NotSupported());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#endif
+
+} // namespace ROCKSDB_NAMESPACE
+
+#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+extern "C" {
+void RegisterCustomObjects(int argc, char** argv);
+}
+#else
+void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {}
+#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_test2.cc b/src/rocksdb/db/db_test2.cc
new file mode 100644
index 000000000..f4e8e960a
--- /dev/null
+++ b/src/rocksdb/db/db_test2.cc
@@ -0,0 +1,4695 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <atomic>
+#include <cstdlib>
+#include <functional>
+
+#include "db/db_test_util.h"
+#include "db/read_callback.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/persistent_cache.h"
+#include "rocksdb/wal_filter.h"
+#include "test_util/fault_injection_test_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBTest2 : public DBTestBase {
+ public:
+ DBTest2() : DBTestBase("/db_test2") {}
+};
+
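+// Parameterized on whether index and filter blocks are cached in the (tiny)
+// block cache.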
+class PrefixFullBloomWithReverseComparator
+ : public DBTestBase,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ PrefixFullBloomWithReverseComparator()
+ : DBTestBase("/prefix_bloom_reverse") {}
+ void SetUp() override { if_cache_filter_ = GetParam(); }
+ bool if_cache_filter_;
+};
+
+TEST_P(PrefixFullBloomWithReverseComparator,
+ PrefixFullBloomWithReverseComparator) {
+ Options options = last_options_;
+ options.comparator = ReverseBytewiseComparator();
+ options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions bbto;
+ if (if_cache_filter_) {
+ bbto.no_block_cache = false;
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.block_cache = NewLRUCache(1);
+ }
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ ASSERT_OK(dbfull()->Put(WriteOptions(), "bar123", "foo"));
+ ASSERT_OK(dbfull()->Put(WriteOptions(), "bar234", "foo2"));
+ ASSERT_OK(dbfull()->Put(WriteOptions(), "foo123", "foo3"));
+
+ dbfull()->Flush(FlushOptions());
+
+ if (bbto.block_cache) {
+ bbto.block_cache->EraseUnRefEntries();
+ }
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ iter->Seek("bar345");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar234", iter->key().ToString());
+ ASSERT_EQ("foo2", iter->value().ToString());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar123", iter->key().ToString());
+ ASSERT_EQ("foo", iter->value().ToString());
+
+ iter->Seek("foo234");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo123", iter->key().ToString());
+ ASSERT_EQ("foo3", iter->value().ToString());
+
+ iter->Seek("bar");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+}
+
+INSTANTIATE_TEST_CASE_P(PrefixFullBloomWithReverseComparator,
+ PrefixFullBloomWithReverseComparator, testing::Bool());
+
+TEST_F(DBTest2, IteratorPropertyVersionNumber) {
+ Put("", "");
+ Iterator* iter1 = db_->NewIterator(ReadOptions());
+ std::string prop_value;
+ ASSERT_OK(
+ iter1->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
+ uint64_t version_number1 =
+ static_cast<uint64_t>(std::atoi(prop_value.c_str()));
+
+ Put("", "");
+ Flush();
+
+ Iterator* iter2 = db_->NewIterator(ReadOptions());
+ ASSERT_OK(
+ iter2->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
+ uint64_t version_number2 =
+ static_cast<uint64_t>(std::atoi(prop_value.c_str()));
+
+ ASSERT_GT(version_number2, version_number1);
+
+ Put("", "");
+
+ Iterator* iter3 = db_->NewIterator(ReadOptions());
+ ASSERT_OK(
+ iter3->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
+ uint64_t version_number3 =
+ static_cast<uint64_t>(std::atoi(prop_value.c_str()));
+
+ ASSERT_EQ(version_number2, version_number3);
+
+ iter1->SeekToFirst();
+ ASSERT_OK(
+ iter1->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
+ uint64_t version_number1_new =
+ static_cast<uint64_t>(std::atoi(prop_value.c_str()));
+ ASSERT_EQ(version_number1, version_number1_new);
+
+ delete iter1;
+ delete iter2;
+ delete iter3;
+}
+
+TEST_F(DBTest2, CacheIndexAndFilterWithDBRestart) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Put(1, "a", "begin");
+ Put(1, "z", "end");
+ ASSERT_OK(Flush(1));
+ TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ std::string value;
+ value = Get(1, "a");
+}
+
+TEST_F(DBTest2, MaxSuccessiveMergesChangeWithDBRecovery) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.max_successive_merges = 3;
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+ Put("poi", "Finch");
+ db_->Merge(WriteOptions(), "poi", "Reese");
+ db_->Merge(WriteOptions(), "poi", "Shaw");
+ db_->Merge(WriteOptions(), "poi", "Root");
+ options.max_successive_merges = 2;
+ Reopen(options);
+}
+
+#ifndef ROCKSDB_LITE
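+// Parameterized on <use_old_interface, cost_cache>: either the legacy
+// db_write_buffer_size option or a WriteBufferManager caps total memtable
+// memory, optionally charging that memory to a block cache.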
+class DBTestSharedWriteBufferAcrossCFs
+ : public DBTestBase,
+ public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+ DBTestSharedWriteBufferAcrossCFs()
+ : DBTestBase("/db_test_shared_write_buffer") {}
+ void SetUp() override {
+ use_old_interface_ = std::get<0>(GetParam());
+ cost_cache_ = std::get<1>(GetParam());
+ }
+ bool use_old_interface_;
+ bool cost_cache_;
+};
+
+TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) {
+ Options options = CurrentOptions();
+ options.arena_block_size = 4096;
+
+  // Avoid nondeterministic values from malloc_usable_size():
+  // force the arena block size to 1.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Arena::Arena:0", [&](void* arg) {
+ size_t* block_size = static_cast<size_t*>(arg);
+ *block_size = 1;
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Arena::AllocateNewBlock:0", [&](void* arg) {
+ std::pair<size_t*, size_t*>* pair =
+ static_cast<std::pair<size_t*, size_t*>*>(arg);
+ *std::get<0>(*pair) = *std::get<1>(*pair);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // The total soft write buffer size is about 105000
+ std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
+ ASSERT_LT(cache->GetUsage(), 256 * 1024);
+
+ if (use_old_interface_) {
+ options.db_write_buffer_size = 120000; // this is the real limit
+ } else if (!cost_cache_) {
+ options.write_buffer_manager.reset(new WriteBufferManager(114285));
+ } else {
+ options.write_buffer_manager.reset(new WriteBufferManager(114285, cache));
+ }
+ options.write_buffer_size = 500000; // this is never hit
+ CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
+
+ WriteOptions wo;
+ wo.disableWAL = true;
+
+ std::function<void()> wait_flush = [&]() {
+ dbfull()->TEST_WaitForFlushMemTable(handles_[0]);
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ dbfull()->TEST_WaitForFlushMemTable(handles_[2]);
+ dbfull()->TEST_WaitForFlushMemTable(handles_[3]);
+ };
+
+  // Create some data and flush "default" and "nikitich" so that they are the
+  // most recently flushed CFs.
+ ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+ Flush(3);
+ ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+ ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+ Flush(0);
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(1));
+
+ ASSERT_OK(Put(3, Key(1), DummyString(30000), wo));
+ if (cost_cache_) {
+ ASSERT_GE(cache->GetUsage(), 256 * 1024);
+ ASSERT_LE(cache->GetUsage(), 2 * 256 * 1024);
+ }
+ wait_flush();
+ ASSERT_OK(Put(0, Key(1), DummyString(60000), wo));
+ if (cost_cache_) {
+ ASSERT_GE(cache->GetUsage(), 256 * 1024);
+ ASSERT_LE(cache->GetUsage(), 2 * 256 * 1024);
+ }
+ wait_flush();
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+ // No flush should trigger
+ wait_flush();
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(1));
+ }
+
+ // Trigger a flush. Flushing "nikitich".
+ ASSERT_OK(Put(3, Key(2), DummyString(30000), wo));
+ wait_flush();
+ ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+ wait_flush();
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(2));
+ }
+
+ // Without hitting the threshold, no flush should trigger.
+ ASSERT_OK(Put(2, Key(1), DummyString(30000), wo));
+ wait_flush();
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+ wait_flush();
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+ wait_flush();
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(2));
+ }
+
+ // Hit the write buffer limit again. "default"
+ // will have been flushed.
+ ASSERT_OK(Put(2, Key(2), DummyString(10000), wo));
+ wait_flush();
+ ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+ wait_flush();
+ ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+ wait_flush();
+ ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+ wait_flush();
+ ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+ wait_flush();
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(2));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(2));
+ }
+
+  // Trigger another flush. This time "dobrynia" is flushed. "pikachu" should
+  // not be flushed, even though it has never been flushed.
+ ASSERT_OK(Put(1, Key(1), DummyString(1), wo));
+ wait_flush();
+ ASSERT_OK(Put(2, Key(1), DummyString(80000), wo));
+ wait_flush();
+ ASSERT_OK(Put(1, Key(1), DummyString(1), wo));
+ wait_flush();
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+ wait_flush();
+
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(2));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(2));
+ }
+ if (cost_cache_) {
+ ASSERT_GE(cache->GetUsage(), 256 * 1024);
+ Close();
+ options.write_buffer_manager.reset();
+ last_options_.write_buffer_manager.reset();
+ ASSERT_LT(cache->GetUsage(), 256 * 1024);
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(DBTestSharedWriteBufferAcrossCFs,
+ DBTestSharedWriteBufferAcrossCFs,
+ ::testing::Values(std::make_tuple(true, false),
+ std::make_tuple(false, false),
+ std::make_tuple(false, true)));
+
+TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) {
+ std::string dbname2 = test::PerThreadDBPath("db_shared_wb_db2");
+ Options options = CurrentOptions();
+ options.arena_block_size = 4096;
+  // Avoid nondeterministic values from malloc_usable_size():
+  // force the arena block size to 1.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Arena::Arena:0", [&](void* arg) {
+ size_t* block_size = static_cast<size_t*>(arg);
+ *block_size = 1;
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Arena::AllocateNewBlock:0", [&](void* arg) {
+ std::pair<size_t*, size_t*>* pair =
+ static_cast<std::pair<size_t*, size_t*>*>(arg);
+ *std::get<0>(*pair) = *std::get<1>(*pair);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ options.write_buffer_size = 500000; // this is never hit
+ // Use a write buffer total size so that the soft limit is about
+ // 105000.
+ options.write_buffer_manager.reset(new WriteBufferManager(120000));
+ CreateAndReopenWithCF({"cf1", "cf2"}, options);
+
+ ASSERT_OK(DestroyDB(dbname2, options));
+ DB* db2 = nullptr;
+ ASSERT_OK(DB::Open(options, dbname2, &db2));
+
+ WriteOptions wo;
+ wo.disableWAL = true;
+
+ std::function<void()> wait_flush = [&]() {
+ dbfull()->TEST_WaitForFlushMemTable(handles_[0]);
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ dbfull()->TEST_WaitForFlushMemTable(handles_[2]);
+ static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable();
+ };
+
+ // Trigger a flush on cf2
+ ASSERT_OK(Put(2, Key(1), DummyString(70000), wo));
+ wait_flush();
+ ASSERT_OK(Put(0, Key(1), DummyString(20000), wo));
+ wait_flush();
+
+ // Insert to DB2
+ ASSERT_OK(db2->Put(wo, Key(2), DummyString(20000)));
+ wait_flush();
+
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+ wait_flush();
+ static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable();
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default") +
+ GetNumberOfSstFilesForColumnFamily(db_, "cf1") +
+ GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
+ static_cast<uint64_t>(0));
+ }
+
+  // Trigger a flush of another CF in DB1.
+ ASSERT_OK(db2->Put(wo, Key(2), DummyString(70000)));
+ wait_flush();
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+ wait_flush();
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
+ static_cast<uint64_t>(0));
+ }
+
+  // Trigger a flush in DB2.
+ ASSERT_OK(db2->Put(wo, Key(3), DummyString(40000)));
+ wait_flush();
+ ASSERT_OK(db2->Put(wo, Key(1), DummyString(1)));
+ wait_flush();
+ static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable();
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
+ static_cast<uint64_t>(1));
+ }
+
+ delete db2;
+ ASSERT_OK(DestroyDB(dbname2, options));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, TestWriteBufferNoLimitWithCache) {
+ Options options = CurrentOptions();
+ options.arena_block_size = 4096;
+ std::shared_ptr<Cache> cache =
+ NewLRUCache(LRUCacheOptions(10000000, 1, false, 0.0));
+ options.write_buffer_size = 50000; // this is never hit
+  // A buffer size of 0 means no write buffer limit; memtable memory is still
+  // charged against the cache.
+ options.write_buffer_manager.reset(new WriteBufferManager(0, cache));
+ Reopen(options);
+
+ ASSERT_OK(Put("foo", "bar"));
+ // One dummy entry is 256KB.
+ ASSERT_GT(cache->GetUsage(), 128000);
+}
+
+namespace {
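+  // Check via MultiGet that every key in keys_must_exist is present and every
+  // key in keys_must_not_exist returns NotFound.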
+ void ValidateKeyExistence(DB* db, const std::vector<Slice>& keys_must_exist,
+ const std::vector<Slice>& keys_must_not_exist) {
+ // Ensure that expected keys exist
+ std::vector<std::string> values;
+ if (keys_must_exist.size() > 0) {
+ std::vector<Status> status_list =
+ db->MultiGet(ReadOptions(), keys_must_exist, &values);
+ for (size_t i = 0; i < keys_must_exist.size(); i++) {
+ ASSERT_OK(status_list[i]);
+ }
+ }
+
+ // Ensure that given keys don't exist
+ if (keys_must_not_exist.size() > 0) {
+ std::vector<Status> status_list =
+ db->MultiGet(ReadOptions(), keys_must_not_exist, &values);
+ for (size_t i = 0; i < keys_must_not_exist.size(); i++) {
+ ASSERT_TRUE(status_list[i].IsNotFound());
+ }
+ }
+ }
+
+} // namespace
+
+TEST_F(DBTest2, WalFilterTest) {
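+  // WAL filter that applies the requested processing option to the record at
+  // a given index and continues processing for all other records.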
+ class TestWalFilter : public WalFilter {
+ private:
+ // Processing option that is requested to be applied at the given index
+ WalFilter::WalProcessingOption wal_processing_option_;
+ // Index at which to apply wal_processing_option_
+ // At other indexes default wal_processing_option::kContinueProcessing is
+ // returned.
+ size_t apply_option_at_record_index_;
+ // Current record index, incremented with each record encountered.
+ size_t current_record_index_;
+
+ public:
+ TestWalFilter(WalFilter::WalProcessingOption wal_processing_option,
+ size_t apply_option_for_record_index)
+ : wal_processing_option_(wal_processing_option),
+ apply_option_at_record_index_(apply_option_for_record_index),
+ current_record_index_(0) {}
+
+ WalProcessingOption LogRecord(const WriteBatch& /*batch*/,
+ WriteBatch* /*new_batch*/,
+ bool* /*batch_changed*/) const override {
+ WalFilter::WalProcessingOption option_to_return;
+
+ if (current_record_index_ == apply_option_at_record_index_) {
+ option_to_return = wal_processing_option_;
+ }
+ else {
+ option_to_return = WalProcessingOption::kContinueProcessing;
+ }
+
+ // The filter is passed as a const object so that RocksDB does not modify
+ // it; however, we modify it for our own purposes here and hence cast the
+ // constness away.
+ (const_cast<TestWalFilter*>(this)->current_record_index_)++;
+
+ return option_to_return;
+ }
+
+ const char* Name() const override { return "TestWalFilter"; }
+ };
+
+ // Create 3 batches with two keys each
+ std::vector<std::vector<std::string>> batch_keys(3);
+
+ batch_keys[0].push_back("key1");
+ batch_keys[0].push_back("key2");
+ batch_keys[1].push_back("key3");
+ batch_keys[1].push_back("key4");
+ batch_keys[2].push_back("key5");
+ batch_keys[2].push_back("key6");
+
+ // Test with all WAL processing options
+ for (int option = 0;
+ option < static_cast<int>(
+ WalFilter::WalProcessingOption::kWalProcessingOptionMax);
+ option++) {
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({ "pikachu" }, options);
+
+ // Write given keys in given batches
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ WriteBatch batch;
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ batch.Put(handles_[0], batch_keys[i][j], DummyString(1024));
+ }
+ dbfull()->Write(WriteOptions(), &batch);
+ }
+
+ WalFilter::WalProcessingOption wal_processing_option =
+ static_cast<WalFilter::WalProcessingOption>(option);
+
+ // Create a test filter that applies wal_processing_option at the record
+ // with index apply_option_for_record_index (zero-based, so the second one)
+ size_t apply_option_for_record_index = 1;
+ TestWalFilter test_wal_filter(wal_processing_option,
+ apply_option_for_record_index);
+
+ // Reopen database with option to use WAL filter
+ options = OptionsForLogIterTest();
+ options.wal_filter = &test_wal_filter;
+ Status status =
+ TryReopenWithColumnFamilies({ "default", "pikachu" }, options);
+ if (wal_processing_option ==
+ WalFilter::WalProcessingOption::kCorruptedRecord) {
+ assert(!status.ok());
+ // In case of corruption we can turn off paranoid_checks to reopen the
+ // database
+ options.paranoid_checks = false;
+ ReopenWithColumnFamilies({ "default", "pikachu" }, options);
+ }
+ else {
+ assert(status.ok());
+ }
+
+ // Compute which keys we expect to be found
+ // and which we expect not to be found after recovery.
+ std::vector<Slice> keys_must_exist;
+ std::vector<Slice> keys_must_not_exist;
+ switch (wal_processing_option) {
+ case WalFilter::WalProcessingOption::kCorruptedRecord:
+ case WalFilter::WalProcessingOption::kContinueProcessing: {
+ fprintf(stderr, "Testing with complete WAL processing\n");
+ // we expect all records to be processed
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ keys_must_exist.push_back(Slice(batch_keys[i][j]));
+ }
+ }
+ break;
+ }
+ case WalFilter::WalProcessingOption::kIgnoreCurrentRecord: {
+ fprintf(stderr,
+ "Testing with ignoring record %" ROCKSDB_PRIszt " only\n",
+ apply_option_for_record_index);
+ // We expect the keys of the record at apply_option_for_record_index to
+ // not be found.
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ if (i == apply_option_for_record_index) {
+ keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
+ }
+ else {
+ keys_must_exist.push_back(Slice(batch_keys[i][j]));
+ }
+ }
+ }
+ break;
+ }
+ case WalFilter::WalProcessingOption::kStopReplay: {
+ fprintf(stderr,
+ "Testing with stopping replay from record %" ROCKSDB_PRIszt
+ "\n",
+ apply_option_for_record_index);
+ // We expect records at and beyond apply_option_for_record_index to not
+ // be found.
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ if (i >= apply_option_for_record_index) {
+ keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
+ }
+ else {
+ keys_must_exist.push_back(Slice(batch_keys[i][j]));
+ }
+ }
+ }
+ break;
+ }
+ default:
+ assert(false); // unhandled case
+ }
+
+ bool checked_after_reopen = false;
+
+ while (true) {
+ // Ensure that expected keys exist and unexpected keys don't exist
+ // after recovery
+ ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
+
+ if (checked_after_reopen) {
+ break;
+ }
+
+ // Reopen the database again to make sure previous log(s) are not used
+ // (even if they were skipped); this reopen does not use the WAL filter
+ options = OptionsForLogIterTest();
+ ReopenWithColumnFamilies({ "default", "pikachu" }, options);
+
+ checked_after_reopen = true;
+ }
+ }
+}
+
+TEST_F(DBTest2, WalFilterTestWithChangeBatch) {
+ class ChangeBatchHandler : public WriteBatch::Handler {
+ private:
+ // Batch to insert keys in
+ WriteBatch* new_write_batch_;
+ // Number of keys to add in the new batch
+ size_t num_keys_to_add_in_new_batch_;
+ // Number of keys added to new batch
+ size_t num_keys_added_;
+
+ public:
+ ChangeBatchHandler(WriteBatch* new_write_batch,
+ size_t num_keys_to_add_in_new_batch)
+ : new_write_batch_(new_write_batch),
+ num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch),
+ num_keys_added_(0) {}
+ void Put(const Slice& key, const Slice& value) override {
+ if (num_keys_added_ < num_keys_to_add_in_new_batch_) {
+ new_write_batch_->Put(key, value);
+ ++num_keys_added_;
+ }
+ }
+ };
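+ // Since the test batches contain only Put operations, the rebuilt batch
+ // keeps at most num_keys_to_add_in_new_batch_ of them; the rest of each
+ // replayed batch is simply not copied over.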
+
+ class TestWalFilterWithChangeBatch : public WalFilter {
+ private:
+ // Index at which to start changing records
+ size_t change_records_from_index_;
+ // Number of keys to add in the new batch
+ size_t num_keys_to_add_in_new_batch_;
+ // Current record index, incremented with each record encountered.
+ size_t current_record_index_;
+
+ public:
+ TestWalFilterWithChangeBatch(size_t change_records_from_index,
+ size_t num_keys_to_add_in_new_batch)
+ : change_records_from_index_(change_records_from_index),
+ num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch),
+ current_record_index_(0) {}
+
+ WalProcessingOption LogRecord(const WriteBatch& batch,
+ WriteBatch* new_batch,
+ bool* batch_changed) const override {
+ if (current_record_index_ >= change_records_from_index_) {
+ ChangeBatchHandler handler(new_batch, num_keys_to_add_in_new_batch_);
+ batch.Iterate(&handler);
+ *batch_changed = true;
+ }
+
+ // The filter is passed as a const object so that RocksDB does not modify
+ // it; however, we modify it for our own purposes here and hence cast the
+ // constness away.
+ (const_cast<TestWalFilterWithChangeBatch*>(this)
+ ->current_record_index_)++;
+
+ return WalProcessingOption::kContinueProcessing;
+ }
+
+ const char* Name() const override { return "TestWalFilterWithChangeBatch"; }
+ };
+
+ std::vector<std::vector<std::string>> batch_keys(3);
+
+ batch_keys[0].push_back("key1");
+ batch_keys[0].push_back("key2");
+ batch_keys[1].push_back("key3");
+ batch_keys[1].push_back("key4");
+ batch_keys[2].push_back("key5");
+ batch_keys[2].push_back("key6");
+
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({ "pikachu" }, options);
+
+ // Write given keys in given batches
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ WriteBatch batch;
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ batch.Put(handles_[0], batch_keys[i][j], DummyString(1024));
+ }
+ dbfull()->Write(WriteOptions(), &batch);
+ }
+
+ // Create a test filter that rewrites batches starting at
+ // change_records_from_index, keeping num_keys_to_add_in_new_batch keys each
+ size_t change_records_from_index = 1;
+ size_t num_keys_to_add_in_new_batch = 1;
+ TestWalFilterWithChangeBatch test_wal_filter_with_change_batch(
+ change_records_from_index, num_keys_to_add_in_new_batch);
+
+ // Reopen database with option to use WAL filter
+ options = OptionsForLogIterTest();
+ options.wal_filter = &test_wal_filter_with_change_batch;
+ ReopenWithColumnFamilies({ "default", "pikachu" }, options);
+
+ // Ensure that all keys exist before change_records_from_index_, and that
+ // from that index on only a single key per batch exists, since our filter
+ // copies just one key for each changed batch
+ std::vector<Slice> keys_must_exist;
+ std::vector<Slice> keys_must_not_exist;
+
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ if (i >= change_records_from_index && j >= num_keys_to_add_in_new_batch) {
+ keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
+ }
+ else {
+ keys_must_exist.push_back(Slice(batch_keys[i][j]));
+ }
+ }
+ }
+
+ bool checked_after_reopen = false;
+
+ while (true) {
+ // Ensure that expected keys exist and unexpected keys don't exist
+ // after recovery
+ ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
+
+ if (checked_after_reopen) {
+ break;
+ }
+
+ // Reopen the database again to make sure previous log(s) are not used
+ // (even if they were skipped); this reopen does not use the WAL filter
+ options = OptionsForLogIterTest();
+ ReopenWithColumnFamilies({ "default", "pikachu" }, options);
+
+ checked_after_reopen = true;
+ }
+}
+
+TEST_F(DBTest2, WalFilterTestWithChangeBatchExtraKeys) {
+ class TestWalFilterWithChangeBatchAddExtraKeys : public WalFilter {
+ public:
+ WalProcessingOption LogRecord(const WriteBatch& batch, WriteBatch* new_batch,
+ bool* batch_changed) const override {
+ *new_batch = batch;
+ new_batch->Put("key_extra", "value_extra");
+ *batch_changed = true;
+ return WalProcessingOption::kContinueProcessing;
+ }
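+ // The rebuilt batch ends up with more entries than the original record;
+ // recovery rejects such filters, which is why the reopen below is expected
+ // to return a NotSupported status.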
+
+ const char* Name() const override {
+ return "WalFilterTestWithChangeBatchExtraKeys";
+ }
+ };
+
+ std::vector<std::vector<std::string>> batch_keys(3);
+
+ batch_keys[0].push_back("key1");
+ batch_keys[0].push_back("key2");
+ batch_keys[1].push_back("key3");
+ batch_keys[1].push_back("key4");
+ batch_keys[2].push_back("key5");
+ batch_keys[2].push_back("key6");
+
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({ "pikachu" }, options);
+
+ // Write given keys in given batches
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ WriteBatch batch;
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ batch.Put(handles_[0], batch_keys[i][j], DummyString(1024));
+ }
+ dbfull()->Write(WriteOptions(), &batch);
+ }
+
+ // Create a test filter that would add extra keys
+ TestWalFilterWithChangeBatchAddExtraKeys test_wal_filter_extra_keys;
+
+ // Reopen database with option to use WAL filter
+ options = OptionsForLogIterTest();
+ options.wal_filter = &test_wal_filter_extra_keys;
+ Status status = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_TRUE(status.IsNotSupported());
+
+ // Reopen without filter, now reopen should succeed - previous
+ // attempt to open must not have altered the db.
+ options = OptionsForLogIterTest();
+ ReopenWithColumnFamilies({ "default", "pikachu" }, options);
+
+ std::vector<Slice> keys_must_exist;
+ std::vector<Slice> keys_must_not_exist; // empty vector
+
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ keys_must_exist.push_back(Slice(batch_keys[i][j]));
+ }
+ }
+
+ ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
+}
+
+TEST_F(DBTest2, WalFilterTestWithColumnFamilies) {
+ class TestWalFilterWithColumnFamilies : public WalFilter {
+ private:
+ // column_family_id -> log_number map (provided to WALFilter)
+ std::map<uint32_t, uint64_t> cf_log_number_map_;
+ // column_family_name -> column_family_id map (provided to WALFilter)
+ std::map<std::string, uint32_t> cf_name_id_map_;
+ // column_family_id -> keys_found_in_wal map
+ // We store keys that are applicable to the column_family
+ // during recovery (i.e. aren't already flushed to SST file(s))
+ // for verification against the keys we expect.
+ std::map<uint32_t, std::vector<std::string>> cf_wal_keys_;
+ public:
+ void ColumnFamilyLogNumberMap(
+ const std::map<uint32_t, uint64_t>& cf_lognumber_map,
+ const std::map<std::string, uint32_t>& cf_name_id_map) override {
+ cf_log_number_map_ = cf_lognumber_map;
+ cf_name_id_map_ = cf_name_id_map;
+ }
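+ // RocksDB provides these maps before WAL replay; records from logs older
+ // than a column family's recorded log number are already persisted in SST
+ // files, so LogRecordFound can skip them for that column family.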
+
+ WalProcessingOption LogRecordFound(unsigned long long log_number,
+ const std::string& /*log_file_name*/,
+ const WriteBatch& batch,
+ WriteBatch* /*new_batch*/,
+ bool* /*batch_changed*/) override {
+ class LogRecordBatchHandler : public WriteBatch::Handler {
+ private:
+ const std::map<uint32_t, uint64_t> & cf_log_number_map_;
+ std::map<uint32_t, std::vector<std::string>> & cf_wal_keys_;
+ unsigned long long log_number_;
+ public:
+ LogRecordBatchHandler(unsigned long long current_log_number,
+ const std::map<uint32_t, uint64_t> & cf_log_number_map,
+ std::map<uint32_t, std::vector<std::string>> & cf_wal_keys) :
+ cf_log_number_map_(cf_log_number_map),
+ cf_wal_keys_(cf_wal_keys),
+ log_number_(current_log_number){}
+
+ Status PutCF(uint32_t column_family_id, const Slice& key,
+ const Slice& /*value*/) override {
+ auto it = cf_log_number_map_.find(column_family_id);
+ assert(it != cf_log_number_map_.end());
+ unsigned long long log_number_for_cf = it->second;
+ // If the current record is applicable for column_family_id
+ // (i.e. isn't flushed to SST file(s) for column_family_id)
+ // add it to the cf_wal_keys_ map for verification.
+ if (log_number_ >= log_number_for_cf) {
+ cf_wal_keys_[column_family_id].push_back(std::string(key.data(),
+ key.size()));
+ }
+ return Status::OK();
+ }
+ } handler(log_number, cf_log_number_map_, cf_wal_keys_);
+
+ batch.Iterate(&handler);
+
+ return WalProcessingOption::kContinueProcessing;
+ }
+
+ const char* Name() const override {
+ return "WalFilterTestWithColumnFamilies";
+ }
+
+ const std::map<uint32_t, std::vector<std::string>>& GetColumnFamilyKeys() {
+ return cf_wal_keys_;
+ }
+
+ const std::map<std::string, uint32_t> & GetColumnFamilyNameIdMap() {
+ return cf_name_id_map_;
+ }
+ };
+
+ std::vector<std::vector<std::string>> batch_keys_pre_flush(3);
+
+ batch_keys_pre_flush[0].push_back("key1");
+ batch_keys_pre_flush[0].push_back("key2");
+ batch_keys_pre_flush[1].push_back("key3");
+ batch_keys_pre_flush[1].push_back("key4");
+ batch_keys_pre_flush[2].push_back("key5");
+ batch_keys_pre_flush[2].push_back("key6");
+
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({ "pikachu" }, options);
+
+ // Write given keys in given batches
+ for (size_t i = 0; i < batch_keys_pre_flush.size(); i++) {
+ WriteBatch batch;
+ for (size_t j = 0; j < batch_keys_pre_flush[i].size(); j++) {
+ batch.Put(handles_[0], batch_keys_pre_flush[i][j], DummyString(1024));
+ batch.Put(handles_[1], batch_keys_pre_flush[i][j], DummyString(1024));
+ }
+ dbfull()->Write(WriteOptions(), &batch);
+ }
+
+ // Flush default column-family
+ db_->Flush(FlushOptions(), handles_[0]);
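+ // Only the default column family is flushed, so its pre-flush keys land in
+ // an SST file while pikachu's copies remain only in the WAL.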
+
+ // Do some more writes
+ std::vector<std::vector<std::string>> batch_keys_post_flush(3);
+
+ batch_keys_post_flush[0].push_back("key7");
+ batch_keys_post_flush[0].push_back("key8");
+ batch_keys_post_flush[1].push_back("key9");
+ batch_keys_post_flush[1].push_back("key10");
+ batch_keys_post_flush[2].push_back("key11");
+ batch_keys_post_flush[2].push_back("key12");
+
+ // Write given keys in given batches
+ for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
+ WriteBatch batch;
+ for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
+ batch.Put(handles_[0], batch_keys_post_flush[i][j], DummyString(1024));
+ batch.Put(handles_[1], batch_keys_post_flush[i][j], DummyString(1024));
+ }
+ dbfull()->Write(WriteOptions(), &batch);
+ }
+
+ // On recovery we should only find the post-flush batches applicable to the
+ // default CF, but both pre- and post-flush batches applicable to pikachu CF
+
+ // Create a test filter that records the WAL keys seen per column family
+ TestWalFilterWithColumnFamilies test_wal_filter_column_families;
+
+ // Reopen database with option to use WAL filter
+ options = OptionsForLogIterTest();
+ options.wal_filter = &test_wal_filter_column_families;
+ Status status =
+ TryReopenWithColumnFamilies({ "default", "pikachu" }, options);
+ ASSERT_TRUE(status.ok());
+
+ // verify that handles_[0] only has post_flush keys
+ // while handles_[1] has pre and post flush keys
+ auto cf_wal_keys = test_wal_filter_column_families.GetColumnFamilyKeys();
+ auto name_id_map = test_wal_filter_column_families.GetColumnFamilyNameIdMap();
+ size_t index = 0;
+ auto keys_cf = cf_wal_keys[name_id_map[kDefaultColumnFamilyName]];
+ // default column-family, only post_flush keys are expected
+ for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
+ for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
+ Slice key_from_the_log(keys_cf[index++]);
+ Slice batch_key(batch_keys_post_flush[i][j]);
+ ASSERT_TRUE(key_from_the_log.compare(batch_key) == 0);
+ }
+ }
+ ASSERT_TRUE(index == keys_cf.size());
+
+ index = 0;
+ keys_cf = cf_wal_keys[name_id_map["pikachu"]];
+ // pikachu column-family, all keys are expected
+ for (size_t i = 0; i < batch_keys_pre_flush.size(); i++) {
+ for (size_t j = 0; j < batch_keys_pre_flush[i].size(); j++) {
+ Slice key_from_the_log(keys_cf[index++]);
+ Slice batch_key(batch_keys_pre_flush[i][j]);
+ ASSERT_TRUE(key_from_the_log.compare(batch_key) == 0);
+ }
+ }
+
+ for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
+ for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
+ Slice key_from_the_log(keys_cf[index++]);
+ Slice batch_key(batch_keys_post_flush[i][j]);
+ ASSERT_TRUE(key_from_the_log.compare(batch_key) == 0);
+ }
+ }
+ ASSERT_TRUE(index == keys_cf.size());
+}
+
+TEST_F(DBTest2, PresetCompressionDict) {
+ // Verifies that compression ratio improves when dictionary is enabled, and
+ // improves even further when the dictionary is trained by ZSTD.
+ const size_t kBlockSizeBytes = 4 << 10;
+ const size_t kL0FileBytes = 128 << 10;
+ const size_t kApproxPerBlockOverheadBytes = 50;
+ const int kNumL0Files = 5;
+
+ Options options;
+ // Make sure to use any custom env that the test is configured with.
+ options.env = CurrentOptions().env;
+ options.allow_concurrent_memtable_write = false;
+ options.arena_block_size = kBlockSizeBytes;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(kL0FileBytes / kBlockSizeBytes));
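+ // SpecialSkipListFactory is a test-only memtable factory; with this
+ // argument it should mark the memtable full after roughly kL0FileBytes /
+ // kBlockSizeBytes entries, so each L0 file ends up near kL0FileBytes.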
+ options.num_levels = 2;
+ options.target_file_size_base = kL0FileBytes;
+ options.target_file_size_multiplier = 2;
+ options.write_buffer_size = kL0FileBytes;
+ BlockBasedTableOptions table_options;
+ table_options.block_size = kBlockSizeBytes;
+ std::vector<CompressionType> compression_types;
+ if (Zlib_Supported()) {
+ compression_types.push_back(kZlibCompression);
+ }
+#if LZ4_VERSION_NUMBER >= 10400 // r124+
+ compression_types.push_back(kLZ4Compression);
+ compression_types.push_back(kLZ4HCCompression);
+#endif // LZ4_VERSION_NUMBER >= 10400
+ if (ZSTD_Supported()) {
+ compression_types.push_back(kZSTD);
+ }
+
+ enum DictionaryTypes : int {
+ kWithoutDict,
+ kWithDict,
+ kWithZSTDTrainedDict,
+ kDictEnd,
+ };
+
+ for (auto compression_type : compression_types) {
+ options.compression = compression_type;
+ size_t bytes_without_dict = 0;
+ size_t bytes_with_dict = 0;
+ size_t bytes_with_zstd_trained_dict = 0;
+ for (int i = kWithoutDict; i < kDictEnd; i++) {
+ // First iteration: compress without preset dictionary
+ // Second iteration: compress with preset dictionary
+ // Third iteration (zstd only): compress with zstd-trained dictionary
+ //
+ // To make sure the compression dictionary has the intended effect, we
+ // verify the compressed size is smaller in successive iterations. Also in
+ // the non-first iterations, verify the data we get out is the same data
+ // we put in.
+ switch (i) {
+ case kWithoutDict:
+ options.compression_opts.max_dict_bytes = 0;
+ options.compression_opts.zstd_max_train_bytes = 0;
+ break;
+ case kWithDict:
+ options.compression_opts.max_dict_bytes = kBlockSizeBytes;
+ options.compression_opts.zstd_max_train_bytes = 0;
+ break;
+ case kWithZSTDTrainedDict:
+ if (compression_type != kZSTD) {
+ continue;
+ }
+ options.compression_opts.max_dict_bytes = kBlockSizeBytes;
+ options.compression_opts.zstd_max_train_bytes = kL0FileBytes;
+ break;
+ default:
+ assert(false);
+ }
+
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
+ std::string seq_datas[10];
+ for (int j = 0; j < 10; ++j) {
+ seq_datas[j] =
+ RandomString(&rnd, kBlockSizeBytes - kApproxPerBlockOverheadBytes);
+ }
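+ // Ten distinct block-sized values, each reused for runs of ten keys, give
+ // the preset dictionary recurring content to exploit across data blocks.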
+
+ ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
+ for (int j = 0; j < kNumL0Files; ++j) {
+ for (size_t k = 0; k < kL0FileBytes / kBlockSizeBytes + 1; ++k) {
+ auto key_num = j * (kL0FileBytes / kBlockSizeBytes) + k;
+ ASSERT_OK(Put(1, Key(static_cast<int>(key_num)),
+ seq_datas[(key_num / 10) % 10]));
+ }
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ ASSERT_EQ(j + 1, NumTableFilesAtLevel(0, 1));
+ }
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
+ true /* disallow_trivial_move */);
+ ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
+ ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
+
+ // Get the live sst files size
+ size_t total_sst_bytes = TotalSize(1);
+ if (i == kWithoutDict) {
+ bytes_without_dict = total_sst_bytes;
+ } else if (i == kWithDict) {
+ bytes_with_dict = total_sst_bytes;
+ } else if (i == kWithZSTDTrainedDict) {
+ bytes_with_zstd_trained_dict = total_sst_bytes;
+ }
+
+ for (size_t j = 0; j < kNumL0Files * (kL0FileBytes / kBlockSizeBytes);
+ j++) {
+ ASSERT_EQ(seq_datas[(j / 10) % 10], Get(1, Key(static_cast<int>(j))));
+ }
+ if (i == kWithDict) {
+ ASSERT_GT(bytes_without_dict, bytes_with_dict);
+ } else if (i == kWithZSTDTrainedDict) {
+ // In zstd compression, it is sometimes possible that using a trained
+ // dictionary does not get as good a compression ratio as without
+ // training.
+ // But using a dictionary (with or without training) should always give a
+ // better compression ratio than not using one.
+ ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_trained_dict ||
+ bytes_without_dict > bytes_with_zstd_trained_dict);
+ }
+
+ DestroyAndReopen(options);
+ }
+ }
+}
+
+TEST_F(DBTest2, PresetCompressionDictLocality) {
+ if (!ZSTD_Supported()) {
+ return;
+ }
+ // Verifies that compression dictionary is generated from local data. The
+ // verification simply checks all output SSTs have different compression
+ // dictionaries. We do not verify effectiveness as that'd likely be flaky in
+ // the future.
+ const int kNumEntriesPerFile = 1 << 10; // 1K entries
+ const int kNumBytesPerEntry = 1 << 10; // 1KB
+ const int kNumFiles = 4;
+ Options options = CurrentOptions();
+ options.compression = kZSTD;
+ options.compression_opts.max_dict_bytes = 1 << 14; // 16KB
+ options.compression_opts.zstd_max_train_bytes = 1 << 18; // 256KB
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry;
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ Reopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < kNumFiles; ++i) {
+ for (int j = 0; j < kNumEntriesPerFile; ++j) {
+ ASSERT_OK(Put(Key(i * kNumEntriesPerFile + j),
+ RandomString(&rnd, kNumBytesPerEntry)));
+ }
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(1);
+ ASSERT_EQ(NumTableFilesAtLevel(1), i + 1);
+ }
+
+ // Store all the dictionaries generated during a full compaction.
+ std::vector<std::string> compression_dicts;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
+ [&](void* arg) {
+ compression_dicts.emplace_back(static_cast<Slice*>(arg)->ToString());
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ CompactRangeOptions compact_range_opts;
+ compact_range_opts.bottommost_level_compaction =
+ BottommostLevelCompaction::kForceOptimized;
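+ // kForceOptimized makes the existing bottommost-level files participate in
+ // the compaction, so each rewritten output file gets its own dictionary
+ // (the checks below expect one captured dictionary per L1 file).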
+ ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));
+
+ // Dictionary compression should not be so good as to compress four totally
+ // random files into one. If it does then there's probably something wrong
+ // with the test.
+ ASSERT_GT(NumTableFilesAtLevel(1), 1);
+
+ // Furthermore, there should be one compression dictionary generated per file.
+ // And they should all be different from each other.
+ ASSERT_EQ(NumTableFilesAtLevel(1),
+ static_cast<int>(compression_dicts.size()));
+ for (size_t i = 1; i < compression_dicts.size(); ++i) {
+ std::string& a = compression_dicts[i - 1];
+ std::string& b = compression_dicts[i];
+ size_t alen = a.size();
+ size_t blen = b.size();
+ ASSERT_TRUE(alen != blen || memcmp(a.data(), b.data(), alen) != 0);
+ }
+}
+
+class CompactionCompressionListener : public EventListener {
+ public:
+ explicit CompactionCompressionListener(Options* db_options)
+ : db_options_(db_options) {}
+
+ void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override {
+ // Figure out last level with files
+ int bottommost_level = 0;
+ for (int level = 0; level < db->NumberLevels(); level++) {
+ std::string files_at_level;
+ ASSERT_TRUE(
+ db->GetProperty("rocksdb.num-files-at-level" + NumberToString(level),
+ &files_at_level));
+ if (files_at_level != "0") {
+ bottommost_level = level;
+ }
+ }
+
+ if (db_options_->bottommost_compression != kDisableCompressionOption &&
+ ci.output_level == bottommost_level) {
+ ASSERT_EQ(ci.compression, db_options_->bottommost_compression);
+ } else if (db_options_->compression_per_level.size() != 0) {
+ ASSERT_EQ(ci.compression,
+ db_options_->compression_per_level[ci.output_level]);
+ } else {
+ ASSERT_EQ(ci.compression, db_options_->compression);
+ }
+ max_level_checked = std::max(max_level_checked, ci.output_level);
+ }
+
+ int max_level_checked = 0;
+ const Options* db_options_;
+};
+
+TEST_F(DBTest2, CompressionOptions) {
+ if (!Zlib_Supported() || !Snappy_Supported()) {
+ return;
+ }
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 2;
+ options.max_bytes_for_level_base = 100;
+ options.max_bytes_for_level_multiplier = 2;
+ options.num_levels = 7;
+ options.max_background_compactions = 1;
+
+ CompactionCompressionListener* listener =
+ new CompactionCompressionListener(&options);
+ options.listeners.emplace_back(listener);
+
+ const int kKeySize = 5;
+ const int kValSize = 20;
+ Random rnd(301);
+
+ for (int iter = 0; iter <= 2; iter++) {
+ listener->max_level_checked = 0;
+
+ if (iter == 0) {
+ // Use different compression algorithms for different levels but
+ // always use Zlib for bottommost level
+ options.compression_per_level = {kNoCompression, kNoCompression,
+ kNoCompression, kSnappyCompression,
+ kSnappyCompression, kSnappyCompression,
+ kZlibCompression};
+ options.compression = kNoCompression;
+ options.bottommost_compression = kZlibCompression;
+ } else if (iter == 1) {
+ // Use Snappy except for bottommost level use ZLib
+ options.compression_per_level = {};
+ options.compression = kSnappyCompression;
+ options.bottommost_compression = kZlibCompression;
+ } else if (iter == 2) {
+ // Use Snappy everywhere
+ options.compression_per_level = {};
+ options.compression = kSnappyCompression;
+ options.bottommost_compression = kDisableCompressionOption;
+ }
+
+ DestroyAndReopen(options);
+ // Write 10 random files
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 5; j++) {
+ ASSERT_OK(
+ Put(RandomString(&rnd, kKeySize), RandomString(&rnd, kValSize)));
+ }
+ ASSERT_OK(Flush());
+ dbfull()->TEST_WaitForCompact();
+ }
+
+ // Make sure that we wrote enough to check all 7 levels
+ ASSERT_EQ(listener->max_level_checked, 6);
+ }
+}
+
+class CompactionStallTestListener : public EventListener {
+ public:
+ CompactionStallTestListener() : compacting_files_cnt_(0), compacted_files_cnt_(0) {}
+
+ void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override {
+ ASSERT_EQ(ci.cf_name, "default");
+ ASSERT_EQ(ci.base_input_level, 0);
+ ASSERT_EQ(ci.compaction_reason, CompactionReason::kLevelL0FilesNum);
+ compacting_files_cnt_ += ci.input_files.size();
+ }
+
+ void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
+ ASSERT_EQ(ci.cf_name, "default");
+ ASSERT_EQ(ci.base_input_level, 0);
+ ASSERT_EQ(ci.compaction_reason, CompactionReason::kLevelL0FilesNum);
+ compacted_files_cnt_ += ci.input_files.size();
+ }
+
+ std::atomic<size_t> compacting_files_cnt_;
+ std::atomic<size_t> compacted_files_cnt_;
+};
+
+TEST_F(DBTest2, CompactionStall) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BGWorkCompaction", "DBTest2::CompactionStall:0"},
+ {"DBImpl::BGWorkCompaction", "DBTest2::CompactionStall:1"},
+ {"DBTest2::CompactionStall:2",
+ "DBImpl::NotifyOnCompactionBegin::UnlockMutex"},
+ {"DBTest2::CompactionStall:3",
+ "DBImpl::NotifyOnCompactionCompleted::UnlockMutex"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 4;
+ options.max_background_compactions = 40;
+ CompactionStallTestListener* listener = new CompactionStallTestListener();
+ options.listeners.emplace_back(listener);
+ DestroyAndReopen(options);
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ Random rnd(301);
+
+ // 4 Files in L0
+ for (int i = 0; i < 4; i++) {
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(RandomString(&rnd, 10), RandomString(&rnd, 10)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // Wait for compaction to be triggered
+ TEST_SYNC_POINT("DBTest2::CompactionStall:0");
+
+ // Clear "DBImpl::BGWorkCompaction" SYNC_POINT since we want to hold it again
+ // at DBTest2::CompactionStall:1
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+
+ // Another 6 L0 files to trigger compaction again
+ for (int i = 0; i < 6; i++) {
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(RandomString(&rnd, 10), RandomString(&rnd, 10)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // Wait for another compaction to be triggered
+ TEST_SYNC_POINT("DBTest2::CompactionStall:1");
+
+ // Hold NotifyOnCompactionBegin in the unlock mutex section
+ TEST_SYNC_POINT("DBTest2::CompactionStall:2");
+
+ // Hold NotifyOnCompactionCompleted in the unlock mutex section
+ TEST_SYNC_POINT("DBTest2::CompactionStall:3");
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_LT(NumTableFilesAtLevel(0),
+ options.level0_file_num_compaction_trigger);
+ ASSERT_GT(listener->compacted_files_cnt_.load(),
+ 10 - options.level0_file_num_compaction_trigger);
+ ASSERT_EQ(listener->compacting_files_cnt_.load(), listener->compacted_files_cnt_.load());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest2, FirstSnapshotTest) {
+ Options options;
+ options.write_buffer_size = 100000; // Small write buffer
+ options = CurrentOptions(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // This snapshot will have sequence number 0, which is the expected behaviour.
+ const Snapshot* s1 = db_->GetSnapshot();
+
+ Put(1, "k1", std::string(100000, 'x')); // Fill memtable
+ Put(1, "k2", std::string(100000, 'y')); // Trigger flush
+
+ db_->ReleaseSnapshot(s1);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, DuplicateSnapshot) {
+ Options options;
+ options = CurrentOptions(options);
+ std::vector<const Snapshot*> snapshots;
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ SequenceNumber oldest_ww_snap, first_ww_snap;
+
+ Put("k", "v"); // inc seq
+ snapshots.push_back(db_->GetSnapshot());
+ snapshots.push_back(db_->GetSnapshot());
+ Put("k", "v"); // inc seq
+ snapshots.push_back(db_->GetSnapshot());
+ snapshots.push_back(dbi->GetSnapshotForWriteConflictBoundary());
+ first_ww_snap = snapshots.back()->GetSequenceNumber();
+ Put("k", "v"); // inc seq
+ snapshots.push_back(dbi->GetSnapshotForWriteConflictBoundary());
+ snapshots.push_back(db_->GetSnapshot());
+ Put("k", "v"); // inc seq
+ snapshots.push_back(db_->GetSnapshot());
+
+ {
+ InstrumentedMutexLock l(dbi->mutex());
+ auto seqs = dbi->snapshots().GetAll(&oldest_ww_snap);
+ ASSERT_EQ(seqs.size(), 4); // duplicates are not counted
+ ASSERT_EQ(oldest_ww_snap, first_ww_snap);
+ }
+
+ for (auto s : snapshots) {
+ db_->ReleaseSnapshot(s);
+ }
+}
+#endif // ROCKSDB_LITE
+
+class PinL0IndexAndFilterBlocksTest
+ : public DBTestBase,
+ public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+ PinL0IndexAndFilterBlocksTest() : DBTestBase("/db_pin_l0_index_bloom_test") {}
+ void SetUp() override {
+ infinite_max_files_ = std::get<0>(GetParam());
+ disallow_preload_ = std::get<1>(GetParam());
+ }
+
+ void CreateTwoLevels(Options* options, bool close_afterwards) {
+ if (infinite_max_files_) {
+ options->max_open_files = -1;
+ }
+ options->create_if_missing = true;
+ options->statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.pin_l0_filter_and_index_blocks_in_cache = true;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+ options->table_factory.reset(new BlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, *options);
+
+ Put(1, "a", "begin");
+ Put(1, "z", "end");
+ ASSERT_OK(Flush(1));
+ // move this table to L1
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+
+ // reset block cache
+ table_options.block_cache = NewLRUCache(64 * 1024);
+ options->table_factory.reset(NewBlockBasedTableFactory(table_options));
+ TryReopenWithColumnFamilies({"default", "pikachu"}, *options);
+ // create new table at L0
+ Put(1, "a2", "begin2");
+ Put(1, "z2", "end2");
+ ASSERT_OK(Flush(1));
+
+ if (close_afterwards) {
+ Close(); // This ensures that there is no ref to block cache entries
+ }
+ table_options.block_cache->EraseUnRefEntries();
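+ // Erasing unreferenced entries leaves only blocks that are still pinned
+ // (e.g. the L0 index/filter when pinning is in effect) in the block cache.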
+ }
+
+ bool infinite_max_files_;
+ bool disallow_preload_;
+};
+
+TEST_P(PinL0IndexAndFilterBlocksTest,
+ IndexAndFilterBlocksOfNewTableAddedToCacheWithPinning) {
+ Options options = CurrentOptions();
+ if (infinite_max_files_) {
+ options.max_open_files = -1;
+ }
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.pin_l0_filter_and_index_blocks_in_cache = true;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "key", "val"));
+ // Create a new table.
+ ASSERT_OK(Flush(1));
+
+ // index/filter blocks added to block cache right after table creation.
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+ // only index/filter were added
+ ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
+
+ std::string value;
+ // Miss and hit count should remain the same, they're all pinned.
+ db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value);
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+ // Miss and hit count should remain the same, they're all pinned.
+ value = Get(1, "key");
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+}
+
+TEST_P(PinL0IndexAndFilterBlocksTest,
+ MultiLevelIndexAndFilterBlocksCachedWithPinning) {
+ Options options = CurrentOptions();
+ PinL0IndexAndFilterBlocksTest::CreateTwoLevels(&options, false);
+ // get base cache values
+ uint64_t fm = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+ uint64_t fh = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+ uint64_t im = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS);
+ uint64_t ih = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT);
+
+ std::string value;
+ // this should be read from L0
+ // so cache values don't change
+ value = Get(1, "a2");
+ ASSERT_EQ(fm, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+ // this should be read from L1
+ // the file is opened, prefetching results in a cache filter miss
+ // the block is loaded and added to the cache,
+ // then the get results in a cache hit for L1
+ // When we have infinite max_open_files, there is still a cache miss because
+ // we have reset the block cache
+ value = Get(1, "a");
+ ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+}
+
+TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) {
+ Options options = CurrentOptions();
+ // This ensures that db does not ref anything in the block cache, so
+ // EraseUnRefEntries could clear them up.
+ bool close_afterwards = true;
+ PinL0IndexAndFilterBlocksTest::CreateTwoLevels(&options, close_afterwards);
+
+ // Get base cache values
+ uint64_t fm = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+ uint64_t fh = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+ uint64_t im = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS);
+ uint64_t ih = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT);
+
+ if (disallow_preload_) {
+ // Now we have two files. We narrow the max open files to allow 3 entries
+ // so that preloading SST files won't happen.
+ options.max_open_files = 13;
+ // RocksDB sanitizes max_open_files to at least 20. Modify it back.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = static_cast<int*>(arg);
+ *max_open_files = 13;
+ });
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Reopen database. If max_open_files is set as -1, table readers will be
+ // preloaded. This will trigger a BlockBasedTable::Open() and prefetch
+ // L0 index and filter. Level 1's prefetching is disabled in DB::Open()
+ TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ if (!disallow_preload_) {
+ // After reopen, cache miss are increased by one because we read (and only
+ // read) filter and index on L0
+ ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ } else {
+ // If max_open_files is not -1, we do not preload table readers, so there is
+ // no change.
+ ASSERT_EQ(fm, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ }
+ std::string value;
+ // this should be read from L0
+ value = Get(1, "a2");
+ // If max_open_files is -1, we have pinned index and filter in Rep, so there
+ // will not be changes in index and filter misses or hits. If max_open_files
+ // is not -1, Get() will open a TableReader and prefetch index and filter.
+ ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+ // this should be read from L1
+ value = Get(1, "a");
+ if (!disallow_preload_) {
+ // In the infinite max_open_files case, there's a cache miss when executing
+ // Get() because index and filter were not prefetched before.
+ ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ } else {
+ // In this case, cache miss will be increased by one in
+ // BlockBasedTable::Open() because this is not in DB::Open() code path so we
+ // will prefetch L1's index and filter. Cache hit will also be increased by
+ // one because Get() will read index and filter from the block cache
+ // prefetched in previous Open() call.
+ ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ }
+
+ // Force a full compaction to a single file. There will be a block cache
+ // read for both the index and the filter. If prefetching doesn't explicitly
+ // happen, it will happen when verifying the file.
+ Compact(1, "a", "zzzzz");
+ dbfull()->TEST_WaitForCompact();
+
+ if (!disallow_preload_) {
+ ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ } else {
+ ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ }
+
+ // Bloom and index hit will happen when a Get() happens.
+ value = Get(1, "a");
+ if (!disallow_preload_) {
+ ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ } else {
+ ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(PinL0IndexAndFilterBlocksTest,
+ PinL0IndexAndFilterBlocksTest,
+ ::testing::Values(std::make_tuple(true, false),
+ std::make_tuple(false, false),
+ std::make_tuple(false, true)));
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, MaxCompactionBytesTest) {
+ Options options = CurrentOptions();
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile));
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 200 << 10;
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 4;
+ options.num_levels = 4;
+ options.compression = kNoCompression;
+ options.max_bytes_for_level_base = 450 << 10;
+ options.target_file_size_base = 100 << 10;
+ // Infinite for full compaction.
+ options.max_compaction_bytes = options.target_file_size_base * 100;
+
+ Reopen(options);
+
+ Random rnd(301);
+
+ for (int num = 0; num < 8; num++) {
+ GenerateNewRandomFile(&rnd);
+ }
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,8", FilesPerLevel(0));
+
+ // When compacting from Ln -> Ln+1, cut the output file if it overlaps with
+ // more than three files in Ln+1.
+ options.max_compaction_bytes = options.target_file_size_base * 3;
+ Reopen(options);
+
+ GenerateNewRandomFile(&rnd);
+ // Add three more small files that overlap with the previous file
+ for (int i = 0; i < 3; i++) {
+ Put("a", "z");
+ ASSERT_OK(Flush());
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ // Output files to L1 are cut into three pieces, according to
+ // options.max_compaction_bytes
+ ASSERT_EQ("0,3,8", FilesPerLevel(0));
+}
+
+static void UniqueIdCallback(void* arg) {
+ int* result = reinterpret_cast<int*>(arg);
+ if (*result == -1) {
+ *result = 0;
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback);
+}
+
+class MockPersistentCache : public PersistentCache {
+ public:
+ explicit MockPersistentCache(const bool is_compressed, const size_t max_size)
+ : is_compressed_(is_compressed), max_size_(max_size) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback);
+ }
+
+ ~MockPersistentCache() override {}
+
+ PersistentCache::StatsType Stats() override {
+ return PersistentCache::StatsType();
+ }
+
+ Status Insert(const Slice& page_key, const char* data,
+ const size_t size) override {
+ MutexLock _(&lock_);
+
+ if (size_ > max_size_) {
+ size_ -= data_.begin()->second.size();
+ data_.erase(data_.begin());
+ }
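+ // Simple eviction: when over capacity, drop one entry, the one with the
+ // smallest key in std::map iteration order, before inserting the new one.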
+
+ data_.insert(std::make_pair(page_key.ToString(), std::string(data, size)));
+ size_ += size;
+ return Status::OK();
+ }
+
+ Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
+ size_t* size) override {
+ MutexLock _(&lock_);
+ auto it = data_.find(page_key.ToString());
+ if (it == data_.end()) {
+ return Status::NotFound();
+ }
+
+ assert(page_key.ToString() == it->first);
+ data->reset(new char[it->second.size()]);
+ memcpy(data->get(), it->second.c_str(), it->second.size());
+ *size = it->second.size();
+ return Status::OK();
+ }
+
+ bool IsCompressed() override { return is_compressed_; }
+
+ std::string GetPrintableOptions() const override {
+ return "MockPersistentCache";
+ }
+
+ port::Mutex lock_;
+ std::map<std::string, std::string> data_;
+ const bool is_compressed_ = true;
+ size_t size_ = 0;
+ const size_t max_size_ = 10 * 1024; // 10KiB
+};
+
+#ifdef OS_LINUX
+ // Make sure that in CPU time perf context counters, Env::NowCPUNanos()
+ // is used, rather than Env::NowNanos().
+TEST_F(DBTest2, TestPerfContextGetCpuTime) {
+ // force resizing table cache so table handle is not preloaded so that
+ // we can measure find_table_nanos during Get().
+ dbfull()->TEST_table_cache()->SetCapacity(0);
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ env_->now_cpu_count_.store(0);
+
+ // CPU timing is not enabled with kEnableTimeExceptForMutex
+ SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex);
+ ASSERT_EQ("bar", Get("foo"));
+ ASSERT_EQ(0, get_perf_context()->get_cpu_nanos);
+ ASSERT_EQ(0, env_->now_cpu_count_.load());
+
+ uint64_t kDummyAddonTime = uint64_t{1000000000000};
+
+ // Add time to NowNanos() reading.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "TableCache::FindTable:0",
+ [&](void* /*arg*/) { env_->addon_time_.fetch_add(kDummyAddonTime); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
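+ // Only the wall-clock NowNanos() reading is inflated by the callback, so
+ // get_cpu_nanos (measured with NowCPUNanos) must stay below kDummyAddonTime
+ // while the wall-clock find_table_nanos must exceed it.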
+
+ SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
+ ASSERT_EQ("bar", Get("foo"));
+ ASSERT_GT(env_->now_cpu_count_.load(), 2);
+ ASSERT_LT(get_perf_context()->get_cpu_nanos, kDummyAddonTime);
+ ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonTime);
+
+ SetPerfLevel(PerfLevel::kDisable);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, TestPerfContextIterCpuTime) {
+ DestroyAndReopen(CurrentOptions());
+ // force resizing table cache so table handle is not preloaded so that
+ // we can measure find_table_nanos during iteration
+ dbfull()->TEST_table_cache()->SetCapacity(0);
+
+ const size_t kNumEntries = 10;
+ for (size_t i = 0; i < kNumEntries; ++i) {
+ ASSERT_OK(Put("k" + ToString(i), "v" + ToString(i)));
+ }
+ ASSERT_OK(Flush());
+ for (size_t i = 0; i < kNumEntries; ++i) {
+ ASSERT_EQ("v" + ToString(i), Get("k" + ToString(i)));
+ }
+ std::string last_key = "k" + ToString(kNumEntries - 1);
+ std::string last_value = "v" + ToString(kNumEntries - 1);
+ env_->now_cpu_count_.store(0);
+
+ // CPU timing is not enabled with kEnableTimeExceptForMutex
+ SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex);
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ iter->Seek("k0");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v0", iter->value().ToString());
+ iter->SeekForPrev(last_key);
+ ASSERT_TRUE(iter->Valid());
+ iter->SeekToLast();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(last_value, iter->value().ToString());
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v0", iter->value().ToString());
+ ASSERT_EQ(0, get_perf_context()->iter_seek_cpu_nanos);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v1", iter->value().ToString());
+ ASSERT_EQ(0, get_perf_context()->iter_next_cpu_nanos);
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v0", iter->value().ToString());
+ ASSERT_EQ(0, get_perf_context()->iter_prev_cpu_nanos);
+ ASSERT_EQ(0, env_->now_cpu_count_.load());
+ delete iter;
+
+ uint64_t kDummyAddonTime = uint64_t{1000000000000};
+
+ // Add time to NowNanos() reading.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "TableCache::FindTable:0",
+ [&](void* /*arg*/) { env_->addon_time_.fetch_add(kDummyAddonTime); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
+ iter = db_->NewIterator(ReadOptions());
+ iter->Seek("k0");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v0", iter->value().ToString());
+ iter->SeekForPrev(last_key);
+ ASSERT_TRUE(iter->Valid());
+ iter->SeekToLast();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(last_value, iter->value().ToString());
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v0", iter->value().ToString());
+ ASSERT_GT(get_perf_context()->iter_seek_cpu_nanos, 0);
+ ASSERT_LT(get_perf_context()->iter_seek_cpu_nanos, kDummyAddonTime);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v1", iter->value().ToString());
+ ASSERT_GT(get_perf_context()->iter_next_cpu_nanos, 0);
+ ASSERT_LT(get_perf_context()->iter_next_cpu_nanos, kDummyAddonTime);
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v0", iter->value().ToString());
+ ASSERT_GT(get_perf_context()->iter_prev_cpu_nanos, 0);
+ ASSERT_LT(get_perf_context()->iter_prev_cpu_nanos, kDummyAddonTime);
+ ASSERT_GE(env_->now_cpu_count_.load(), 12);
+ ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonTime);
+
+ SetPerfLevel(PerfLevel::kDisable);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ delete iter;
+}
+#endif // OS_LINUX
+
+// GetUniqueIdFromFile is not implemented on these platforms. Persistent cache
+// breaks when that function is not implemented and no regular block cache is
+// provided.
+#if !defined(OS_SOLARIS) && !defined(OS_WIN)
+TEST_F(DBTest2, PersistentCache) {
+ int num_iter = 80;
+
+ Options options;
+ options.write_buffer_size = 64 * 1024; // small write buffer
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options = CurrentOptions(options);
+
+ auto bsizes = {/*no block cache*/ 0, /*1M*/ 1 * 1024 * 1024};
+ auto types = {/*compressed*/ 1, /*uncompressed*/ 0};
+ for (auto bsize : bsizes) {
+ for (auto type : types) {
+ BlockBasedTableOptions table_options;
+ table_options.persistent_cache.reset(
+ new MockPersistentCache(type, 10 * 1024));
+ table_options.no_block_cache = true;
+ table_options.block_cache = bsize ? NewLRUCache(bsize) : nullptr;
+ table_options.block_cache_compressed = nullptr;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ // default column family doesn't have block cache
+ Options no_block_cache_opts;
+ no_block_cache_opts.statistics = options.statistics;
+ no_block_cache_opts = CurrentOptions(no_block_cache_opts);
+ BlockBasedTableOptions table_options_no_bc;
+ table_options_no_bc.no_block_cache = true;
+ no_block_cache_opts.table_factory.reset(
+ NewBlockBasedTableFactory(table_options_no_bc));
+ ReopenWithColumnFamilies(
+ {"default", "pikachu"},
+ std::vector<Options>({no_block_cache_opts, options}));
+
+ Random rnd(301);
+
+ // Write 80 values, each 1000 bytes (a fresh random string every 4 keys)
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ std::vector<std::string> values;
+ std::string str;
+ for (int i = 0; i < num_iter; i++) {
+ if (i % 4 == 0) { // high compression ratio
+ str = RandomString(&rnd, 1000);
+ }
+ values.push_back(str);
+ ASSERT_OK(Put(1, Key(i), values[i]));
+ }
+
+ // flush all data from memtable so that reads are from block cache
+ ASSERT_OK(Flush(1));
+
+ for (int i = 0; i < num_iter; i++) {
+ ASSERT_EQ(Get(1, Key(i)), values[i]);
+ }
+
+ auto hit = options.statistics->getTickerCount(PERSISTENT_CACHE_HIT);
+ auto miss = options.statistics->getTickerCount(PERSISTENT_CACHE_MISS);
+
+ ASSERT_GT(hit, 0);
+ ASSERT_GT(miss, 0);
+ }
+ }
+}
+#endif // !defined(OS_SOLARIS) && !defined(OS_WIN)
+
+namespace {
+void CountSyncPoint() {
+ TEST_SYNC_POINT_CALLBACK("DBTest2::MarkedPoint", nullptr /* arg */);
+}
+} // namespace
+
+TEST_F(DBTest2, SyncPointMarker) {
+ std::atomic<int> sync_point_called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBTest2::MarkedPoint",
+ [&](void* /*arg*/) { sync_point_called.fetch_add(1); });
+
+ // The first dependency enforces Marker can be loaded before MarkedPoint.
+ // The second checks that thread 1's MarkedPoint should be disabled here.
+ // Execution order:
+ // | Thread 1 | Thread 2 |
+ // | | Marker |
+ // | MarkedPoint | |
+ // | Thread1First | |
+ // | | MarkedPoint |
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependencyAndMarkers(
+ {{"DBTest2::SyncPointMarker:Thread1First", "DBTest2::MarkedPoint"}},
+ {{"DBTest2::SyncPointMarker:Marker", "DBTest2::MarkedPoint"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::function<void()> func1 = [&]() {
+ CountSyncPoint();
+ TEST_SYNC_POINT("DBTest2::SyncPointMarker:Thread1First");
+ };
+
+ std::function<void()> func2 = [&]() {
+ TEST_SYNC_POINT("DBTest2::SyncPointMarker:Marker");
+ CountSyncPoint();
+ };
+
+ auto thread1 = port::Thread(func1);
+ auto thread2 = port::Thread(func2);
+ thread1.join();
+ thread2.join();
+
+ // Callback is only executed once
+ ASSERT_EQ(sync_point_called.load(), 1);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif
+
+size_t GetEncodedEntrySize(size_t key_size, size_t value_size) {
+ std::string buffer;
+
+ PutVarint32(&buffer, static_cast<uint32_t>(0));
+ PutVarint32(&buffer, static_cast<uint32_t>(key_size));
+ PutVarint32(&buffer, static_cast<uint32_t>(value_size));
+
+ return buffer.size() + key_size + value_size;
+}
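+ // Note: this mirrors the block entry encoding with delta encoding disabled:
+ // three varint32 headers (shared key length, fixed at 0 here, non-shared
+ // key length, and value length) followed by the key and value bytes.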
+
+TEST_F(DBTest2, ReadAmpBitmap) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions bbto;
+ uint32_t bytes_per_bit[2] = {1, 16};
+ for (size_t k = 0; k < 2; k++) {
+ // Disable delta encoding to make it easier to calculate read amplification
+ bbto.use_delta_encoding = false;
+ // Huge block cache to make it easier to calculate read amplification
+ bbto.block_cache = NewLRUCache(1024 * 1024 * 1024);
+ bbto.read_amp_bytes_per_bit = bytes_per_bit[k];
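+ // With 1 byte per bit the bitmap tracks reads at byte granularity, so the
+ // estimate can match exactly; with 16 bytes per bit it is coarser, which is
+ // why the final check below only requires the ratio to be within 1% of 1.0.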
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ const size_t kNumEntries = 10000;
+
+ Random rnd(301);
+ for (size_t i = 0; i < kNumEntries; i++) {
+ ASSERT_OK(Put(Key(static_cast<int>(i)), RandomString(&rnd, 100)));
+ }
+ ASSERT_OK(Flush());
+
+ Close();
+ Reopen(options);
+
+ // Read keys/values randomly and verify that reported read amp error
+ // is less than 2%
+ uint64_t total_useful_bytes = 0;
+ std::set<int> read_keys;
+ std::string value;
+ for (size_t i = 0; i < kNumEntries * 5; i++) {
+ int key_idx = rnd.Next() % kNumEntries;
+ std::string key = Key(key_idx);
+ ASSERT_OK(db_->Get(ReadOptions(), key, &value));
+
+ if (read_keys.find(key_idx) == read_keys.end()) {
+ auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
+ total_useful_bytes +=
+ GetEncodedEntrySize(internal_key.size(), value.size());
+ read_keys.insert(key_idx);
+ }
+
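+    // expected_read_amp is computed from the entries this test actually
+    // requested; read_amp is the bitmap-based estimate reported by the
+    // statistics. The two should agree to within 2%.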
+ double expected_read_amp =
+ static_cast<double>(total_useful_bytes) /
+ options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+ double read_amp =
+ static_cast<double>(options.statistics->getTickerCount(
+ READ_AMP_ESTIMATE_USEFUL_BYTES)) /
+ options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+ double error_pct = fabs(expected_read_amp - read_amp) * 100;
+ // Error between reported read amp and real read amp should be less than
+ // 2%
+ EXPECT_LE(error_pct, 2);
+ }
+
+  // Make sure we read everything in the DB (which is smaller than our cache)
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_EQ(iter->value().ToString(), Get(iter->key().ToString()));
+ }
+ delete iter;
+
+  // Read amp is on average 100% since we read everything we loaded into memory
+ if (k == 0) {
+ ASSERT_EQ(
+ options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES),
+ options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES));
+ } else {
+ ASSERT_NEAR(
+ options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES) *
+ 1.0f /
+ options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES),
+ 1, .01);
+ }
+ }
+}
+
+#ifndef OS_SOLARIS // GetUniqueIdFromFile is not implemented
+TEST_F(DBTest2, ReadAmpBitmapLiveInCacheAfterDBClose) {
+ {
+ const int kIdBufLen = 100;
+ char id_buf[kIdBufLen];
+#ifndef OS_WIN
+    // You can't open a directory on Windows using a random access file
+ std::unique_ptr<RandomAccessFile> file;
+ ASSERT_OK(env_->NewRandomAccessFile(dbname_, &file, EnvOptions()));
+ if (file->GetUniqueId(id_buf, kIdBufLen) == 0) {
+      // The fs holding the db directory doesn't support getting a unique file
+      // id. Running this test would then fail because lru_cache would load the
+      // blocks again even though they are already in the cache.
+ return;
+ }
+#else
+ std::unique_ptr<Directory> dir;
+ ASSERT_OK(env_->NewDirectory(dbname_, &dir));
+ if (dir->GetUniqueId(id_buf, kIdBufLen) == 0) {
+      // The fs holding the db directory doesn't support getting a unique file
+      // id. Running this test would then fail because lru_cache would load the
+      // blocks again even though they are already in the cache.
+ return;
+ }
+#endif
+ }
+ uint32_t bytes_per_bit[2] = {1, 16};
+ for (size_t k = 0; k < 2; k++) {
+ std::shared_ptr<Cache> lru_cache = NewLRUCache(1024 * 1024 * 1024);
+ std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ Options options = CurrentOptions();
+ BlockBasedTableOptions bbto;
+ // Disable delta encoding to make it easier to calculate read amplification
+ bbto.use_delta_encoding = false;
+ // Huge block cache to make it easier to calculate read amplification
+ bbto.block_cache = lru_cache;
+ bbto.read_amp_bytes_per_bit = bytes_per_bit[k];
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.statistics = stats;
+ DestroyAndReopen(options);
+
+ const int kNumEntries = 10000;
+
+ Random rnd(301);
+ for (int i = 0; i < kNumEntries; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 100)));
+ }
+ ASSERT_OK(Flush());
+
+ Close();
+ Reopen(options);
+
+ uint64_t total_useful_bytes = 0;
+ std::set<int> read_keys;
+ std::string value;
+ // Iter1: Read half the DB, Read even keys
+ // Key(0), Key(2), Key(4), Key(6), Key(8), ...
+ for (int i = 0; i < kNumEntries; i += 2) {
+ std::string key = Key(i);
+ ASSERT_OK(db_->Get(ReadOptions(), key, &value));
+
+ if (read_keys.find(i) == read_keys.end()) {
+ auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
+ total_useful_bytes +=
+ GetEncodedEntrySize(internal_key.size(), value.size());
+ read_keys.insert(i);
+ }
+ }
+
+ size_t total_useful_bytes_iter1 =
+ options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
+ size_t total_loaded_bytes_iter1 =
+ options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+ Close();
+ std::shared_ptr<Statistics> new_statistics =
+ ROCKSDB_NAMESPACE::CreateDBStatistics();
+ // Destroy old statistics obj that the blocks in lru_cache are pointing to
+ options.statistics.reset();
+ // Use the statistics object that we just created
+ options.statistics = new_statistics;
+ Reopen(options);
+
+ // Iter2: Read half the DB, Read odd keys
+ // Key(1), Key(3), Key(5), Key(7), Key(9), ...
+ for (int i = 1; i < kNumEntries; i += 2) {
+ std::string key = Key(i);
+ ASSERT_OK(db_->Get(ReadOptions(), key, &value));
+
+ if (read_keys.find(i) == read_keys.end()) {
+ auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
+ total_useful_bytes +=
+ GetEncodedEntrySize(internal_key.size(), value.size());
+ read_keys.insert(i);
+ }
+ }
+
+ size_t total_useful_bytes_iter2 =
+ options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
+ size_t total_loaded_bytes_iter2 =
+ options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+
+    // Read amp is on average 100% since we read everything we loaded into
+    // memory
+ if (k == 0) {
+ ASSERT_EQ(total_useful_bytes_iter1 + total_useful_bytes_iter2,
+ total_loaded_bytes_iter1 + total_loaded_bytes_iter2);
+ } else {
+ ASSERT_NEAR((total_useful_bytes_iter1 + total_useful_bytes_iter2) * 1.0f /
+ (total_loaded_bytes_iter1 + total_loaded_bytes_iter2),
+ 1, .01);
+ }
+ }
+}
+#endif // !OS_SOLARIS
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, AutomaticCompactionOverlapManualCompaction) {
+ Options options = CurrentOptions();
+ options.num_levels = 3;
+ options.IncreaseParallelism(20);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), "a"));
+ ASSERT_OK(Put(Key(5), "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(10), "a"));
+ ASSERT_OK(Put(Key(15), "a"));
+ ASSERT_OK(Flush());
+
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ auto get_stat = [](std::string level_str, LevelStatType type,
+ std::map<std::string, std::string> props) {
+ auto prop_str =
+ "compaction." + level_str + "." +
+ InternalStats::compaction_level_stats.at(type).property_name.c_str();
+ auto prop_item = props.find(prop_str);
+ return prop_item == props.end() ? 0 : std::stod(prop_item->second);
+ };
+
+ // Trivial move 2 files to L2
+ ASSERT_EQ("0,0,2", FilesPerLevel());
+  // Also test that the stats GetMapProperty API reports the same result
+ {
+ std::map<std::string, std::string> prop;
+ ASSERT_TRUE(dbfull()->GetMapProperty("rocksdb.cfstats", &prop));
+ ASSERT_EQ(0, get_stat("L0", LevelStatType::NUM_FILES, prop));
+ ASSERT_EQ(0, get_stat("L1", LevelStatType::NUM_FILES, prop));
+ ASSERT_EQ(2, get_stat("L2", LevelStatType::NUM_FILES, prop));
+ ASSERT_EQ(2, get_stat("Sum", LevelStatType::NUM_FILES, prop));
+ }
+
+  // While the compaction is running, we will create 2 new files that
+  // can fit in L2. These 2 files would be moved to L2, overlapping with
+  // the running compaction and breaking the LSM consistency.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():Start", [&](void* /*arg*/) {
+ ASSERT_OK(
+ dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"},
+ {"max_bytes_for_level_base", "1"}}));
+ ASSERT_OK(Put(Key(6), "a"));
+ ASSERT_OK(Put(Key(7), "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(8), "a"));
+ ASSERT_OK(Put(Key(9), "a"));
+ ASSERT_OK(Flush());
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Run a manual compaction that will compact the 2 files in L2
+ // into 1 file in L2
+ cro.exclusive_manual_compaction = false;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
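+  // Forcing bottommost-level compaction ensures the two L2 files are actually
+  // rewritten into one instead of being skipped as already compacted.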
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  // Test that the stats GetMapProperty API reports 1 file in L2
+ {
+ std::map<std::string, std::string> prop;
+ ASSERT_TRUE(dbfull()->GetMapProperty("rocksdb.cfstats", &prop));
+ ASSERT_EQ(1, get_stat("L2", LevelStatType::NUM_FILES, prop));
+ }
+}
+
+TEST_F(DBTest2, ManualCompactionOverlapManualCompaction) {
+ Options options = CurrentOptions();
+ options.num_levels = 2;
+ options.IncreaseParallelism(20);
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), "a"));
+ ASSERT_OK(Put(Key(5), "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(10), "a"));
+ ASSERT_OK(Put(Key(15), "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // Trivial move 2 files to L1
+ ASSERT_EQ("0,2", FilesPerLevel());
+
+ std::function<void()> bg_manual_compact = [&]() {
+ std::string k1 = Key(6);
+ std::string k2 = Key(9);
+ Slice k1s(k1);
+ Slice k2s(k2);
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = false;
+ ASSERT_OK(db_->CompactRange(cro, &k1s, &k2s));
+ };
+ ROCKSDB_NAMESPACE::port::Thread bg_thread;
+
+  // While the compaction is running, we will create 2 new files that
+  // can fit in L1. These 2 files would be moved to L1, overlapping with
+  // the running compaction and breaking the LSM consistency.
+ std::atomic<bool> flag(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():Start", [&](void* /*arg*/) {
+ if (flag.exchange(true)) {
+ // We want to make sure to call this callback only once
+ return;
+ }
+ ASSERT_OK(Put(Key(6), "a"));
+ ASSERT_OK(Put(Key(7), "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(8), "a"));
+ ASSERT_OK(Put(Key(9), "a"));
+ ASSERT_OK(Flush());
+
+ // Start a non-exclusive manual compaction in a bg thread
+ bg_thread = port::Thread(bg_manual_compact);
+        // This manual compaction conflicts with the other manual compaction,
+        // so it should wait until the first compaction finishes
+ env_->SleepForMicroseconds(1000000);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Run a manual compaction that will compact the 2 files in L1
+ // into 1 file in L1
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = false;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ bg_thread.join();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, PausingManualCompaction1) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.num_levels = 7;
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+ // Generate a file containing 10 keys.
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 50)));
+ }
+ ASSERT_OK(Flush());
+
+ // Generate another file containing same keys
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 50)));
+ }
+ ASSERT_OK(Flush());
+
+ int manual_compactions_paused = 0;
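+  // The sync point passes in the compaction's internal "paused" flag; setting
+  // it here simulates DisableManualCompaction() arriving mid-run, so the
+  // manual compaction below is expected to stop without writing any new files.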
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():PausingManualCompaction:1", [&](void* arg) {
+ auto paused = reinterpret_cast<std::atomic<bool>*>(arg);
+ ASSERT_FALSE(paused->load(std::memory_order_acquire));
+ paused->store(true, std::memory_order_release);
+ manual_compactions_paused += 1;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<std::string> files_before_compact, files_after_compact;
+ // Remember file name before compaction is triggered
+ std::vector<LiveFileMetaData> files_meta;
+ dbfull()->GetLiveFilesMetaData(&files_meta);
+ for (auto file : files_meta) {
+ files_before_compact.push_back(file.name);
+ }
+
+ // OK, now trigger a manual compaction
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ // Wait for compactions to get scheduled and stopped
+ dbfull()->TEST_WaitForCompact(true);
+
+ // Get file names after compaction is stopped
+ files_meta.clear();
+ dbfull()->GetLiveFilesMetaData(&files_meta);
+ for (auto file : files_meta) {
+ files_after_compact.push_back(file.name);
+ }
+
+ // Like nothing happened
+ ASSERT_EQ(files_before_compact, files_after_compact);
+ ASSERT_EQ(manual_compactions_paused, 1);
+
+ manual_compactions_paused = 0;
+  // Now make sure CompactFiles also does not run
+ dbfull()->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(),
+ files_before_compact, 0);
+ // Wait for manual compaction to get scheduled and finish
+ dbfull()->TEST_WaitForCompact(true);
+
+ files_meta.clear();
+ files_after_compact.clear();
+ dbfull()->GetLiveFilesMetaData(&files_meta);
+ for (auto file : files_meta) {
+ files_after_compact.push_back(file.name);
+ }
+
+ ASSERT_EQ(files_before_compact, files_after_compact);
+  // CompactFiles returns right at its entry point, so the pausing sync point
+  // is never reached
+ ASSERT_EQ(manual_compactions_paused, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// PausingManualCompaction does not affect auto compaction
+TEST_F(DBTest2, PausingManualCompaction2) {
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 2;
+ options.disable_auto_compactions = false;
+
+ DestroyAndReopen(options);
+ dbfull()->DisableManualCompaction();
+
+ Random rnd(301);
+ for (int i = 0; i < 2; i++) {
+    // Generate a file containing 100 keys.
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(j), RandomString(&rnd, 50)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ std::vector<LiveFileMetaData> files_meta;
+ dbfull()->GetLiveFilesMetaData(&files_meta);
+ ASSERT_EQ(files_meta.size(), 1);
+}
+
+TEST_F(DBTest2, PausingManualCompaction3) {
+ CompactRangeOptions compact_options;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.num_levels = 7;
+
+ Random rnd(301);
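+  // Build an LSM with files on every level: iteration i leaves
+  // (num_levels - i + 1) files, which are then moved down to level
+  // (num_levels - 1 - i), producing the "2,3,4,5,6,7,8" shape asserted below.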
+ auto generate_files = [&]() {
+ for (int i = 0; i < options.num_levels; i++) {
+ for (int j = 0; j < options.num_levels - i + 1; j++) {
+ for (int k = 0; k < 1000; k++) {
+ ASSERT_OK(Put(Key(k + j * 1000), RandomString(&rnd, 50)));
+ }
+ Flush();
+ }
+
+ for (int l = 1; l < options.num_levels - i; l++) {
+ MoveFilesToLevel(l);
+ }
+ }
+ };
+
+ DestroyAndReopen(options);
+ generate_files();
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+ int run_manual_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():PausingManualCompaction:1",
+ [&](void* /*arg*/) { run_manual_compactions++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ dbfull()->DisableManualCompaction();
+ dbfull()->CompactRange(compact_options, nullptr, nullptr);
+ dbfull()->TEST_WaitForCompact(true);
+  // As manual compaction is disabled, the sync point is never even reached
+ ASSERT_EQ(run_manual_compactions, 0);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "CompactionJob::Run():PausingManualCompaction:1");
+ dbfull()->EnableManualCompaction();
+ dbfull()->CompactRange(compact_options, nullptr, nullptr);
+ dbfull()->TEST_WaitForCompact(true);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, PausingManualCompaction4) {
+ CompactRangeOptions compact_options;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.num_levels = 7;
+
+ Random rnd(301);
+ auto generate_files = [&]() {
+ for (int i = 0; i < options.num_levels; i++) {
+ for (int j = 0; j < options.num_levels - i + 1; j++) {
+ for (int k = 0; k < 1000; k++) {
+ ASSERT_OK(Put(Key(k + j * 1000), RandomString(&rnd, 50)));
+ }
+ Flush();
+ }
+
+ for (int l = 1; l < options.num_levels - i; l++) {
+ MoveFilesToLevel(l);
+ }
+ }
+ };
+
+ DestroyAndReopen(options);
+ generate_files();
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+ int run_manual_compactions = 0;
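+  // Same idea as PausingManualCompaction1, but using the later ":2" sync point
+  // inside CompactionJob::Run(); setting the pause flag there should likewise
+  // stop the manual compaction before it changes the file layout.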
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():PausingManualCompaction:2", [&](void* arg) {
+ auto paused = reinterpret_cast<std::atomic<bool>*>(arg);
+ ASSERT_FALSE(paused->load(std::memory_order_acquire));
+ paused->store(true, std::memory_order_release);
+ run_manual_compactions++;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ dbfull()->EnableManualCompaction();
+ dbfull()->CompactRange(compact_options, nullptr, nullptr);
+ dbfull()->TEST_WaitForCompact(true);
+ ASSERT_EQ(run_manual_compactions, 1);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "CompactionJob::Run():PausingManualCompaction:2");
+ dbfull()->EnableManualCompaction();
+ dbfull()->CompactRange(compact_options, nullptr, nullptr);
+ dbfull()->TEST_WaitForCompact(true);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, OptimizeForPointLookup) {
+ Options options = CurrentOptions();
+ Close();
+ options.OptimizeForPointLookup(2);
+ ASSERT_OK(DB::Open(options, dbname_, &db_));
+
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_EQ("v1", Get("foo"));
+ Flush();
+ ASSERT_EQ("v1", Get("foo"));
+}
+
+TEST_F(DBTest2, OptimizeForSmallDB) {
+ Options options = CurrentOptions();
+ Close();
+ options.OptimizeForSmallDb();
+
+ // Find the cache object
+ ASSERT_EQ(std::string(BlockBasedTableFactory::kName),
+ std::string(options.table_factory->Name()));
+ BlockBasedTableOptions* table_options =
+ reinterpret_cast<BlockBasedTableOptions*>(
+ options.table_factory->GetOptions());
+ ASSERT_TRUE(table_options != nullptr);
+ std::shared_ptr<Cache> cache = table_options->block_cache;
+
+ ASSERT_EQ(0, cache->GetUsage());
+ ASSERT_OK(DB::Open(options, dbname_, &db_));
+ ASSERT_OK(Put("foo", "v1"));
+
+  // Memtable memory is charged to the block cache
+ ASSERT_NE(0, cache->GetUsage());
+
+ ASSERT_EQ("v1", Get("foo"));
+ Flush();
+
+ size_t prev_size = cache->GetUsage();
+  // Remember the block cache size so that we can verify it grows after Get().
+  // Use a pinnable slice so that the block stays pinned and is not evicted
+  // before we check the size.
+ PinnableSlice value;
+ ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), "foo", &value));
+ ASSERT_GT(cache->GetUsage(), prev_size);
+ value.Reset();
+}
+
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest2, GetRaceFlush1) {
+ ASSERT_OK(Put("foo", "v1"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::GetImpl:1", "DBTest2::GetRaceFlush:1"},
+ {"DBTest2::GetRaceFlush:2", "DBImpl::GetImpl:2"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread t1([&] {
+ TEST_SYNC_POINT("DBTest2::GetRaceFlush:1");
+ ASSERT_OK(Put("foo", "v2"));
+ Flush();
+ TEST_SYNC_POINT("DBTest2::GetRaceFlush:2");
+ });
+
+ // Get() is issued after the first Put(), so it should see either
+ // "v1" or "v2".
+ ASSERT_NE("NOT_FOUND", Get("foo"));
+ t1.join();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, GetRaceFlush2) {
+ ASSERT_OK(Put("foo", "v1"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::GetImpl:3", "DBTest2::GetRaceFlush:1"},
+ {"DBTest2::GetRaceFlush:2", "DBImpl::GetImpl:4"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ port::Thread t1([&] {
+ TEST_SYNC_POINT("DBTest2::GetRaceFlush:1");
+ ASSERT_OK(Put("foo", "v2"));
+ Flush();
+ TEST_SYNC_POINT("DBTest2::GetRaceFlush:2");
+ });
+
+ // Get() is issued after the first Put(), so it should see either
+ // "v1" or "v2".
+ ASSERT_NE("NOT_FOUND", Get("foo"));
+ t1.join();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, DirectIO) {
+ if (!IsDirectIOSupported()) {
+ return;
+ }
+ Options options = CurrentOptions();
+ options.use_direct_reads = options.use_direct_io_for_flush_and_compaction =
+ true;
+ options.allow_mmap_reads = options.allow_mmap_writes = false;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), "a"));
+ ASSERT_OK(Put(Key(5), "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(10), "a"));
+ ASSERT_OK(Put(Key(15), "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ Reopen(options);
+}
+
+TEST_F(DBTest2, MemtableOnlyIterator) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "first"));
+ ASSERT_OK(Put(1, "bar", "second"));
+
+ ReadOptions ropt;
+ ropt.read_tier = kMemtableTier;
+ std::string value;
+ Iterator* it = nullptr;
+
+ // Before flushing
+ // point lookups
+ ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value));
+ ASSERT_EQ("first", value);
+ ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value));
+ ASSERT_EQ("second", value);
+
+ // Memtable-only iterator (read_tier=kMemtableTier); data not flushed yet.
+ it = db_->NewIterator(ropt, handles_[1]);
+ int count = 0;
+ for (it->SeekToFirst(); it->Valid(); it->Next()) {
+ ASSERT_TRUE(it->Valid());
+ count++;
+ }
+ ASSERT_TRUE(!it->Valid());
+ ASSERT_EQ(2, count);
+ delete it;
+
+ Flush(1);
+
+ // After flushing
+ // point lookups
+ ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value));
+ ASSERT_EQ("first", value);
+ ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value));
+ ASSERT_EQ("second", value);
+  // Nothing should be returned by the memtable-only iterator after flushing.
+ it = db_->NewIterator(ropt, handles_[1]);
+ count = 0;
+ for (it->SeekToFirst(); it->Valid(); it->Next()) {
+ ASSERT_TRUE(it->Valid());
+ count++;
+ }
+ ASSERT_TRUE(!it->Valid());
+ ASSERT_EQ(0, count);
+ delete it;
+
+ // Add a key to memtable
+ ASSERT_OK(Put(1, "foobar", "third"));
+ it = db_->NewIterator(ropt, handles_[1]);
+ count = 0;
+ for (it->SeekToFirst(); it->Valid(); it->Next()) {
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("foobar", it->key().ToString());
+ ASSERT_EQ("third", it->value().ToString());
+ count++;
+ }
+ ASSERT_TRUE(!it->Valid());
+ ASSERT_EQ(1, count);
+ delete it;
+}
+
+TEST_F(DBTest2, LowPriWrite) {
+ Options options = CurrentOptions();
+  // Compaction pressure should build up once the 6 L0 files written below
+  // exceed the compaction trigger of 4
+ options.level0_file_num_compaction_trigger = 4;
+ options.level0_slowdown_writes_trigger = 12;
+ options.level0_stop_writes_trigger = 30;
+ options.delayed_write_rate = 8 * 1024 * 1024;
+ Reopen(options);
+
+ std::atomic<int> rate_limit_count(0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "GenericRateLimiter::Request:1", [&](void* arg) {
+ rate_limit_count.fetch_add(1);
+ int64_t* rate_bytes_per_sec = static_cast<int64_t*>(arg);
+ ASSERT_EQ(1024 * 1024, *rate_bytes_per_sec);
+ });
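+  // Only low-pri writes issued while compaction is pending should be throttled
+  // through the rate limiter; the counter above tracks those requests.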
+ // Block compaction
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DBTest.LowPriWrite:0", "DBImpl::BGWorkCompaction"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ WriteOptions wo;
+ for (int i = 0; i < 6; i++) {
+ wo.low_pri = false;
+ Put("", "", wo);
+ wo.low_pri = true;
+ Put("", "", wo);
+ Flush();
+ }
+ ASSERT_EQ(0, rate_limit_count.load());
+ wo.low_pri = true;
+ Put("", "", wo);
+ ASSERT_EQ(1, rate_limit_count.load());
+ wo.low_pri = false;
+ Put("", "", wo);
+ ASSERT_EQ(1, rate_limit_count.load());
+
+ TEST_SYNC_POINT("DBTest.LowPriWrite:0");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ dbfull()->TEST_WaitForCompact();
+ wo.low_pri = true;
+ Put("", "", wo);
+ ASSERT_EQ(1, rate_limit_count.load());
+ wo.low_pri = false;
+ Put("", "", wo);
+ ASSERT_EQ(1, rate_limit_count.load());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, RateLimitedCompactionReads) {
+ // compaction input has 512KB data
+ const int kNumKeysPerFile = 128;
+ const int kBytesPerKey = 1024;
+ const int kNumL0Files = 4;
+
+ for (auto use_direct_io : {false, true}) {
+ if (use_direct_io && !IsDirectIOSupported()) {
+ continue;
+ }
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile));
+ options.new_table_reader_for_compaction_inputs = true;
+    // At this rate, reading the compaction input takes roughly one second,
+    // split into 100 x 10ms refill intervals. Each interval permits 5.12KB,
+    // which is smaller than the block size, so this test exercises the code
+    // for chunking reads.
+ options.rate_limiter.reset(NewGenericRateLimiter(
+ static_cast<int64_t>(kNumL0Files * kNumKeysPerFile *
+ kBytesPerKey) /* rate_bytes_per_sec */,
+ 10 * 1000 /* refill_period_us */, 10 /* fairness */,
+ RateLimiter::Mode::kReadsOnly));
+ options.use_direct_reads = options.use_direct_io_for_flush_and_compaction =
+ use_direct_io;
+ BlockBasedTableOptions bbto;
+ bbto.block_size = 16384;
+ bbto.no_block_cache = true;
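+    // Disable the block cache so repeated block reads are not served from
+    // memory and the byte counts below reflect actual file reads.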
+ options.table_factory.reset(new BlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < kNumL0Files; ++i) {
+ for (int j = 0; j <= kNumKeysPerFile; ++j) {
+ ASSERT_OK(Put(Key(j), DummyString(kBytesPerKey)));
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+ ASSERT_EQ(0, options.rate_limiter->GetTotalBytesThrough(Env::IO_HIGH));
+ // should be slightly above 512KB due to non-data blocks read. Arbitrarily
+ // chose 1MB as the upper bound on the total bytes read.
+ size_t rate_limited_bytes =
+ options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW);
+ // Include the explicit prefetch of the footer in direct I/O case.
+ size_t direct_io_extra = use_direct_io ? 512 * 1024 : 0;
+ ASSERT_GE(
+ rate_limited_bytes,
+ static_cast<size_t>(kNumKeysPerFile * kBytesPerKey * kNumL0Files));
+ ASSERT_LT(
+ rate_limited_bytes,
+ static_cast<size_t>(2 * kNumKeysPerFile * kBytesPerKey * kNumL0Files +
+ direct_io_extra));
+
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_EQ(iter->value().ToString(), DummyString(kBytesPerKey));
+ }
+ delete iter;
+ // bytes read for user iterator shouldn't count against the rate limit.
+ ASSERT_EQ(rate_limited_bytes,
+ static_cast<size_t>(
+ options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW)));
+ }
+}
+#endif // ROCKSDB_LITE
+
+// Make sure the DB can be reopened with a reduced number of levels, given
+// that no file is on a level higher than the new num_levels.
+TEST_F(DBTest2, ReduceLevel) {
+ Options options;
+ options.disable_auto_compactions = true;
+ options.num_levels = 7;
+ Reopen(options);
+ Put("foo", "bar");
+ Flush();
+ MoveFilesToLevel(6);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 1;
+ dbfull()->CompactRange(compact_options, nullptr, nullptr);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,1", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+ options.num_levels = 3;
+ Reopen(options);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,1", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+}
+
+// Test that ReadCallback is actually used in both the memtable and SST files
+TEST_F(DBTest2, ReadCallbackTest) {
+ Options options;
+ options.disable_auto_compactions = true;
+ options.num_levels = 7;
+ Reopen(options);
+ std::vector<const Snapshot*> snapshots;
+ // Try to create a db with multiple layers and a memtable
+ const std::string key = "foo";
+ const std::string value = "bar";
+  // This test assumes that the seq starts at 1 and is increased by 1 after
+  // each write batch of size 1. If that behavior changes, the test needs to be
+ // updated as well.
+ // TODO(myabandeh): update this test to use the seq number that is returned by
+ // the DB instead of assuming what seq the DB used.
+ int i = 1;
+ for (; i < 10; i++) {
+ Put(key, value + std::to_string(i));
+ // Take a snapshot to avoid the value being removed during compaction
+ auto snapshot = dbfull()->GetSnapshot();
+ snapshots.push_back(snapshot);
+ }
+ Flush();
+ for (; i < 20; i++) {
+ Put(key, value + std::to_string(i));
+ // Take a snapshot to avoid the value being removed during compaction
+ auto snapshot = dbfull()->GetSnapshot();
+ snapshots.push_back(snapshot);
+ }
+ Flush();
+ MoveFilesToLevel(6);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+ for (; i < 30; i++) {
+ Put(key, value + std::to_string(i));
+ auto snapshot = dbfull()->GetSnapshot();
+ snapshots.push_back(snapshot);
+ }
+ Flush();
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("1,0,0,0,0,0,2", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+ // And also add some values to the memtable
+ for (; i < 40; i++) {
+ Put(key, value + std::to_string(i));
+ auto snapshot = dbfull()->GetSnapshot();
+ snapshots.push_back(snapshot);
+ }
+
+ class TestReadCallback : public ReadCallback {
+ public:
+ explicit TestReadCallback(SequenceNumber snapshot)
+ : ReadCallback(snapshot), snapshot_(snapshot) {}
+ bool IsVisibleFullCheck(SequenceNumber seq) override {
+ return seq <= snapshot_;
+ }
+
+ private:
+ SequenceNumber snapshot_;
+ };
+
+ for (int seq = 1; seq < i; seq++) {
+ PinnableSlice pinnable_val;
+ ReadOptions roptions;
+ TestReadCallback callback(seq);
+ bool dont_care = true;
+ DBImpl::GetImplOptions get_impl_options;
+ get_impl_options.column_family = dbfull()->DefaultColumnFamily();
+ get_impl_options.value = &pinnable_val;
+ get_impl_options.value_found = &dont_care;
+ get_impl_options.callback = &callback;
+ Status s = dbfull()->GetImpl(roptions, key, get_impl_options);
+ ASSERT_TRUE(s.ok());
+    // Assuming the DB increases the seq by one after each Put, the value
+    // suffix and the seq number must be equal since we also increment the
+    // value suffix by 1 after each Put.
+ ASSERT_EQ(value + std::to_string(seq), pinnable_val.ToString());
+ }
+
+ for (auto snapshot : snapshots) {
+ dbfull()->ReleaseSnapshot(snapshot);
+ }
+}
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBTest2, LiveFilesOmitObsoleteFiles) {
+ // Regression test for race condition where an obsolete file is returned to
+ // user as a "live file" but then deleted, all while file deletions are
+ // disabled.
+ //
+ // It happened like this:
+ //
+ // 1. [flush thread] Log file "x.log" found by FindObsoleteFiles
+ // 2. [user thread] DisableFileDeletions, GetSortedWalFiles are called and the
+ // latter returned "x.log"
+ // 3. [flush thread] PurgeObsoleteFiles deleted "x.log"
+ // 4. [user thread] Reading "x.log" failed
+ //
+ // Unfortunately the only regression test I can come up with involves sleep.
+ // We cannot set SyncPoints to repro since, once the fix is applied, the
+ // SyncPoints would cause a deadlock as the repro's sequence of events is now
+ // prohibited.
+ //
+ // Instead, if we sleep for a second between Find and Purge, and ensure the
+ // read attempt happens after purge, then the sequence of events will almost
+ // certainly happen on the old code.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::BackgroundCallFlush:FilesFound",
+ "DBTest2::LiveFilesOmitObsoleteFiles:FlushTriggered"},
+ {"DBImpl::PurgeObsoleteFiles:End",
+ "DBTest2::LiveFilesOmitObsoleteFiles:LiveFilesCaptured"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::PurgeObsoleteFiles:Begin",
+ [&](void* /*arg*/) { env_->SleepForMicroseconds(1000000); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Put("key", "val");
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ db_->Flush(flush_opts);
+ TEST_SYNC_POINT("DBTest2::LiveFilesOmitObsoleteFiles:FlushTriggered");
+
+ db_->DisableFileDeletions();
+ VectorLogPtr log_files;
+ db_->GetSortedWalFiles(log_files);
+ TEST_SYNC_POINT("DBTest2::LiveFilesOmitObsoleteFiles:LiveFilesCaptured");
+ for (const auto& log_file : log_files) {
+ ASSERT_OK(env_->FileExists(LogFileName(dbname_, log_file->LogNumber())));
+ }
+
+ db_->EnableFileDeletions();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, TestNumPread) {
+ Options options = CurrentOptions();
+ // disable block cache
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ env_->count_random_reads_ = true;
+
+ env_->random_file_open_counter_.store(0);
+ ASSERT_OK(Put("bar", "foo"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ // After flush, we'll open the file and read footer, meta block,
+ // property block and index block.
+ ASSERT_EQ(4, env_->random_read_counter_.Read());
+ ASSERT_EQ(1, env_->random_file_open_counter_.load());
+
+  // One pread per normal data block read
+ env_->random_file_open_counter_.store(0);
+ env_->random_read_counter_.Reset();
+ ASSERT_EQ("bar", Get("foo"));
+ ASSERT_EQ(1, env_->random_read_counter_.Read());
+ // All files are already opened.
+ ASSERT_EQ(0, env_->random_file_open_counter_.load());
+
+ env_->random_file_open_counter_.store(0);
+ env_->random_read_counter_.Reset();
+ ASSERT_OK(Put("bar2", "foo2"));
+ ASSERT_OK(Put("foo2", "bar2"));
+ ASSERT_OK(Flush());
+ // After flush, we'll open the file and read footer, meta block,
+ // property block and index block.
+ ASSERT_EQ(4, env_->random_read_counter_.Read());
+ ASSERT_EQ(1, env_->random_file_open_counter_.load());
+
+ // Compaction needs two input blocks, which requires 2 preads, and
+  // generates a new SST file, which needs 4 preads (footer, meta block,
+ // property block and index block). In total 6.
+ env_->random_file_open_counter_.store(0);
+ env_->random_read_counter_.Reset();
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(6, env_->random_read_counter_.Read());
+  // All compaction input files should have already been opened.
+ ASSERT_EQ(1, env_->random_file_open_counter_.load());
+
+  // One pread per normal data block read
+ env_->random_file_open_counter_.store(0);
+ env_->random_read_counter_.Reset();
+ ASSERT_EQ("foo2", Get("bar2"));
+ ASSERT_EQ(1, env_->random_read_counter_.Read());
+ // SST files are already opened.
+ ASSERT_EQ(0, env_->random_file_open_counter_.load());
+}
+
+TEST_F(DBTest2, TraceAndReplay) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ ReadOptions ro;
+ WriteOptions wo;
+ TraceOptions trace_opts;
+ EnvOptions env_opts;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
+ Iterator* single_iter = nullptr;
+
+ ASSERT_TRUE(db_->EndTrace().IsIOError());
+
+ std::string trace_filename = dbname_ + "/rocksdb.trace";
+ std::unique_ptr<TraceWriter> trace_writer;
+ ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+ ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+
+ ASSERT_OK(Put(0, "a", "1"));
+ ASSERT_OK(Merge(0, "b", "2"));
+ ASSERT_OK(Delete(0, "c"));
+ ASSERT_OK(SingleDelete(0, "d"));
+ ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f"));
+
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("f", "11"));
+ ASSERT_OK(batch.Merge("g", "12"));
+ ASSERT_OK(batch.Delete("h"));
+ ASSERT_OK(batch.SingleDelete("i"));
+ ASSERT_OK(batch.DeleteRange("j", "k"));
+ ASSERT_OK(db_->Write(wo, &batch));
+
+ single_iter = db_->NewIterator(ro);
+ single_iter->Seek("f");
+ single_iter->SeekForPrev("g");
+ delete single_iter;
+
+ ASSERT_EQ("1", Get(0, "a"));
+ ASSERT_EQ("12", Get(0, "g"));
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "rocksdb", "rocks"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "leveldb"));
+
+ ASSERT_OK(db_->EndTrace());
+ // These should not get into the trace file as it is after EndTrace.
+ Put("hello", "world");
+ Merge("foo", "bar");
+
+ // Open another db, replay, and verify the data
+ std::string value;
+ std::string dbname2 = test::TmpDir(env_) + "/db_replay";
+ ASSERT_OK(DestroyDB(dbname2, options));
+
+ // Using a different name than db2, to pacify infer's use-after-lifetime
+ // warnings (http://fbinfer.com).
+ DB* db2_init = nullptr;
+ options.create_if_missing = true;
+ ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+ ColumnFamilyHandle* cf;
+ ASSERT_OK(
+ db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+ delete cf;
+ delete db2_init;
+
+ DB* db2 = nullptr;
+ std::vector<ColumnFamilyDescriptor> column_families;
+ ColumnFamilyOptions cf_options;
+ cf_options.merge_operator = MergeOperators::CreatePutOperator();
+ column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+ column_families.push_back(
+ ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+ std::vector<ColumnFamilyHandle*> handles;
+ ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2));
+
+ env_->SleepForMicroseconds(100);
+ // Verify that the keys don't already exist
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
+
+ std::unique_ptr<TraceReader> trace_reader;
+ ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+ Replayer replayer(db2, handles_, std::move(trace_reader));
+ ASSERT_OK(replayer.Replay());
+
+ ASSERT_OK(db2->Get(ro, handles[0], "a", &value));
+ ASSERT_EQ("1", value);
+ ASSERT_OK(db2->Get(ro, handles[0], "g", &value));
+ ASSERT_EQ("12", value);
+ ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound());
+
+ ASSERT_OK(db2->Get(ro, handles[1], "foo", &value));
+ ASSERT_EQ("bar", value);
+ ASSERT_OK(db2->Get(ro, handles[1], "rocksdb", &value));
+ ASSERT_EQ("rocks", value);
+
+ for (auto handle : handles) {
+ delete handle;
+ }
+ delete db2;
+ ASSERT_OK(DestroyDB(dbname2, options));
+}
+
+TEST_F(DBTest2, TraceWithLimit) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ ReadOptions ro;
+ WriteOptions wo;
+ TraceOptions trace_opts;
+ EnvOptions env_opts;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
+
+ // test the max trace file size options
+ trace_opts.max_trace_file_size = 5;
+ std::string trace_filename = dbname_ + "/rocksdb.trace1";
+ std::unique_ptr<TraceWriter> trace_writer;
+ ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+ ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+ ASSERT_OK(Put(0, "a", "1"));
+ ASSERT_OK(Put(0, "b", "1"));
+ ASSERT_OK(Put(0, "c", "1"));
+ ASSERT_OK(db_->EndTrace());
+
+ std::string dbname2 = test::TmpDir(env_) + "/db_replay2";
+ std::string value;
+ ASSERT_OK(DestroyDB(dbname2, options));
+
+ // Using a different name than db2, to pacify infer's use-after-lifetime
+ // warnings (http://fbinfer.com).
+ DB* db2_init = nullptr;
+ options.create_if_missing = true;
+ ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+ ColumnFamilyHandle* cf;
+ ASSERT_OK(
+ db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+ delete cf;
+ delete db2_init;
+
+ DB* db2 = nullptr;
+ std::vector<ColumnFamilyDescriptor> column_families;
+ ColumnFamilyOptions cf_options;
+ cf_options.merge_operator = MergeOperators::CreatePutOperator();
+ column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+ column_families.push_back(
+ ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+ std::vector<ColumnFamilyHandle*> handles;
+ ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2));
+
+ env_->SleepForMicroseconds(100);
+ // Verify that the keys don't already exist
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
+
+ std::unique_ptr<TraceReader> trace_reader;
+ ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+ Replayer replayer(db2, handles_, std::move(trace_reader));
+ ASSERT_OK(replayer.Replay());
+
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
+
+ for (auto handle : handles) {
+ delete handle;
+ }
+ delete db2;
+ ASSERT_OK(DestroyDB(dbname2, options));
+}
+
+TEST_F(DBTest2, TraceWithSampling) {
+ Options options = CurrentOptions();
+ ReadOptions ro;
+ WriteOptions wo;
+ TraceOptions trace_opts;
+ EnvOptions env_opts;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
+
+ // test the trace file sampling options
+ trace_opts.sampling_frequency = 2;
+ std::string trace_filename = dbname_ + "/rocksdb.trace_sampling";
+ std::unique_ptr<TraceWriter> trace_writer;
+ ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+ ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+ ASSERT_OK(Put(0, "a", "1"));
+ ASSERT_OK(Put(0, "b", "2"));
+ ASSERT_OK(Put(0, "c", "3"));
+ ASSERT_OK(Put(0, "d", "4"));
+ ASSERT_OK(Put(0, "e", "5"));
+ ASSERT_OK(db_->EndTrace());
+
+ std::string dbname2 = test::TmpDir(env_) + "/db_replay_sampling";
+ std::string value;
+ ASSERT_OK(DestroyDB(dbname2, options));
+
+ // Using a different name than db2, to pacify infer's use-after-lifetime
+ // warnings (http://fbinfer.com).
+ DB* db2_init = nullptr;
+ options.create_if_missing = true;
+ ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+ ColumnFamilyHandle* cf;
+ ASSERT_OK(
+ db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+ delete cf;
+ delete db2_init;
+
+ DB* db2 = nullptr;
+ std::vector<ColumnFamilyDescriptor> column_families;
+ ColumnFamilyOptions cf_options;
+ column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+ column_families.push_back(
+ ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+ std::vector<ColumnFamilyHandle*> handles;
+ ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2));
+
+ env_->SleepForMicroseconds(100);
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "d", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "e", &value).IsNotFound());
+
+ std::unique_ptr<TraceReader> trace_reader;
+ ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+ Replayer replayer(db2, handles_, std::move(trace_reader));
+ ASSERT_OK(replayer.Replay());
+
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_FALSE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
+ ASSERT_FALSE(db2->Get(ro, handles[0], "d", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "e", &value).IsNotFound());
+
+ for (auto handle : handles) {
+ delete handle;
+ }
+ delete db2;
+ ASSERT_OK(DestroyDB(dbname2, options));
+}
+
+TEST_F(DBTest2, TraceWithFilter) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ ReadOptions ro;
+ WriteOptions wo;
+ TraceOptions trace_opts;
+ EnvOptions env_opts;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
+ Iterator* single_iter = nullptr;
+
+ trace_opts.filter = TraceFilterType::kTraceFilterWrite;
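+  // kTraceFilterWrite excludes write operations from the trace, so replaying
+  // it later should not recreate any of the keys written below.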
+
+ std::string trace_filename = dbname_ + "/rocksdb.trace";
+ std::unique_ptr<TraceWriter> trace_writer;
+ ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+ ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+
+ ASSERT_OK(Put(0, "a", "1"));
+ ASSERT_OK(Merge(0, "b", "2"));
+ ASSERT_OK(Delete(0, "c"));
+ ASSERT_OK(SingleDelete(0, "d"));
+ ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f"));
+
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("f", "11"));
+ ASSERT_OK(batch.Merge("g", "12"));
+ ASSERT_OK(batch.Delete("h"));
+ ASSERT_OK(batch.SingleDelete("i"));
+ ASSERT_OK(batch.DeleteRange("j", "k"));
+ ASSERT_OK(db_->Write(wo, &batch));
+
+ single_iter = db_->NewIterator(ro);
+ single_iter->Seek("f");
+ single_iter->SeekForPrev("g");
+ delete single_iter;
+
+ ASSERT_EQ("1", Get(0, "a"));
+ ASSERT_EQ("12", Get(0, "g"));
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "rocksdb", "rocks"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "leveldb"));
+
+ ASSERT_OK(db_->EndTrace());
+ // These should not get into the trace file as it is after EndTrace.
+ Put("hello", "world");
+ Merge("foo", "bar");
+
+ // Open another db, replay, and verify the data
+ std::string value;
+ std::string dbname2 = test::TmpDir(env_) + "/db_replay";
+ ASSERT_OK(DestroyDB(dbname2, options));
+
+ // Using a different name than db2, to pacify infer's use-after-lifetime
+ // warnings (http://fbinfer.com).
+ DB* db2_init = nullptr;
+ options.create_if_missing = true;
+ ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+ ColumnFamilyHandle* cf;
+ ASSERT_OK(
+ db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+ delete cf;
+ delete db2_init;
+
+ DB* db2 = nullptr;
+ std::vector<ColumnFamilyDescriptor> column_families;
+ ColumnFamilyOptions cf_options;
+ cf_options.merge_operator = MergeOperators::CreatePutOperator();
+ column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+ column_families.push_back(
+ ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+ std::vector<ColumnFamilyHandle*> handles;
+ ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2));
+
+ env_->SleepForMicroseconds(100);
+ // Verify that the keys don't already exist
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
+
+ std::unique_ptr<TraceReader> trace_reader;
+ ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+ Replayer replayer(db2, handles_, std::move(trace_reader));
+ ASSERT_OK(replayer.Replay());
+
+  // None of the keys should be present since we filtered out the WRITE ops.
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "foo", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "rocksdb", &value).IsNotFound());
+
+ for (auto handle : handles) {
+ delete handle;
+ }
+ delete db2;
+ ASSERT_OK(DestroyDB(dbname2, options));
+
+ // Set up a new db.
+ std::string dbname3 = test::TmpDir(env_) + "/db_not_trace_read";
+ ASSERT_OK(DestroyDB(dbname3, options));
+
+ DB* db3_init = nullptr;
+ options.create_if_missing = true;
+ ColumnFamilyHandle* cf3;
+ ASSERT_OK(DB::Open(options, dbname3, &db3_init));
+ ASSERT_OK(
+ db3_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf3));
+ delete cf3;
+ delete db3_init;
+
+ column_families.clear();
+ column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+ column_families.push_back(
+ ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+ handles.clear();
+
+ DB* db3 = nullptr;
+ ASSERT_OK(DB::Open(DBOptions(), dbname3, column_families, &handles, &db3));
+
+ env_->SleepForMicroseconds(100);
+ // Verify that the keys don't already exist
+ ASSERT_TRUE(db3->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db3->Get(ro, handles[0], "g", &value).IsNotFound());
+
+  // The tracer will not record the READ ops.
+ trace_opts.filter = TraceFilterType::kTraceFilterGet;
+ std::string trace_filename3 = dbname_ + "/rocksdb.trace_3";
+ std::unique_ptr<TraceWriter> trace_writer3;
+ ASSERT_OK(
+ NewFileTraceWriter(env_, env_opts, trace_filename3, &trace_writer3));
+ ASSERT_OK(db3->StartTrace(trace_opts, std::move(trace_writer3)));
+
+ ASSERT_OK(db3->Put(wo, handles[0], "a", "1"));
+ ASSERT_OK(db3->Merge(wo, handles[0], "b", "2"));
+ ASSERT_OK(db3->Delete(wo, handles[0], "c"));
+ ASSERT_OK(db3->SingleDelete(wo, handles[0], "d"));
+
+ ASSERT_OK(db3->Get(ro, handles[0], "a", &value));
+ ASSERT_EQ(value, "1");
+ ASSERT_TRUE(db3->Get(ro, handles[0], "c", &value).IsNotFound());
+
+ ASSERT_OK(db3->EndTrace());
+
+ for (auto handle : handles) {
+ delete handle;
+ }
+ delete db3;
+ ASSERT_OK(DestroyDB(dbname3, options));
+
+ std::unique_ptr<TraceReader> trace_reader3;
+ ASSERT_OK(
+ NewFileTraceReader(env_, env_opts, trace_filename3, &trace_reader3));
+
+  // Count the number of records in the trace file.
+ int count = 0;
+ std::string data;
+ Status s;
+ while (true) {
+ s = trace_reader3->Read(&data);
+ if (!s.ok()) {
+ break;
+ }
+ count += 1;
+ }
+ // We also need to count the header and footer
+ // 4 WRITE + HEADER + FOOTER = 6
+ ASSERT_EQ(count, 6);
+}
+
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest2, PinnableSliceAndMmapReads) {
+ Options options = CurrentOptions();
+ options.allow_mmap_reads = true;
+ options.max_open_files = 100;
+ options.compression = kNoCompression;
+ Reopen(options);
+
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+ PinnableSlice pinned_value;
+ ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
+  // It is not safe to pin mmap files as they might disappear due to compaction
+ ASSERT_FALSE(pinned_value.IsPinned());
+ ASSERT_EQ(pinned_value.ToString(), "bar");
+
+ dbfull()->TEST_CompactRange(0 /* level */, nullptr /* begin */,
+ nullptr /* end */, nullptr /* column_family */,
+ true /* disallow_trivial_move */);
+
+ // Ensure pinned_value doesn't rely on memory munmap'd by the above
+ // compaction. It crashes if it does.
+ ASSERT_EQ(pinned_value.ToString(), "bar");
+
+#ifndef ROCKSDB_LITE
+ pinned_value.Reset();
+ // Unsafe to pin mmap files when they could be kicked out of table cache
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
+ ASSERT_FALSE(pinned_value.IsPinned());
+ ASSERT_EQ(pinned_value.ToString(), "bar");
+
+ pinned_value.Reset();
+  // In read-only mode with an unlimited table cache, it should pin the value
+  // and avoid the memcpy
+ Close();
+ options.max_open_files = -1;
+ ASSERT_OK(ReadOnlyReopen(options));
+ ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
+ ASSERT_TRUE(pinned_value.IsPinned());
+ ASSERT_EQ(pinned_value.ToString(), "bar");
+#endif
+}
+
+TEST_F(DBTest2, DISABLED_IteratorPinnedMemory) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions bbto;
+ bbto.no_block_cache = false;
+ bbto.cache_index_and_filter_blocks = false;
+ bbto.block_cache = NewLRUCache(100000);
+ bbto.block_size = 400; // small block size
+ options.table_factory.reset(new BlockBasedTableFactory(bbto));
+ Reopen(options);
+
+ Random rnd(301);
+ std::string v = RandomString(&rnd, 400);
+
+ // Since v is the size of a block, each key should take a block
+ // of 400+ bytes.
+ Put("1", v);
+ Put("3", v);
+ Put("5", v);
+ Put("7", v);
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage());
+
+  // Verify that iterators don't pin more than one data block in the block
+  // cache at a time.
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ iter->SeekToFirst();
+
+ for (int i = 0; i < 4; i++) {
+ ASSERT_TRUE(iter->Valid());
+ // Block cache should contain exactly one block.
+ ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0);
+ ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800);
+ iter->Next();
+ }
+ ASSERT_FALSE(iter->Valid());
+
+ iter->Seek("4");
+ ASSERT_TRUE(iter->Valid());
+
+ ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0);
+ ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800);
+
+ iter->Seek("3");
+ ASSERT_TRUE(iter->Valid());
+
+ ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0);
+ ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800);
+ }
+ ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage());
+
+ // Test compaction case
+ Put("2", v);
+ Put("5", v);
+ Put("6", v);
+ Put("8", v);
+ ASSERT_OK(Flush());
+
+ // Clear existing data in block cache
+ bbto.block_cache->SetCapacity(0);
+ bbto.block_cache->SetCapacity(100000);
+
+  // Verify that compaction input iterators don't hold more than one data
+  // block at a time.
+ std::atomic<bool> finished(false);
+ std::atomic<int> block_newed(0);
+ std::atomic<int> block_destroyed(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Block::Block:0", [&](void* /*arg*/) {
+ if (finished) {
+ return;
+ }
+ // Two iterators. At most 2 outstanding blocks.
+ EXPECT_GE(block_newed.load(), block_destroyed.load());
+ EXPECT_LE(block_newed.load(), block_destroyed.load() + 1);
+ block_newed.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Block::~Block", [&](void* /*arg*/) {
+ if (finished) {
+ return;
+ }
+ // Two iterators. At most 2 outstanding blocks.
+ EXPECT_GE(block_newed.load(), block_destroyed.load() + 1);
+ EXPECT_LE(block_newed.load(), block_destroyed.load() + 2);
+ block_destroyed.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run:BeforeVerify",
+ [&](void* /*arg*/) { finished = true; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // Two input files. Each of them has 4 data blocks.
+ ASSERT_EQ(8, block_newed.load());
+ ASSERT_EQ(8, block_destroyed.load());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, TestBBTTailPrefetch) {
+ std::atomic<bool> called(false);
+ size_t expected_lower_bound = 512 * 1024;
+ size_t expected_higher_bound = 512 * 1024;
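+  // With no prefetch history, BlockBasedTable::Open is expected to prefetch a
+  // fixed 512KB tail; once a history of actual tail sizes exists, the prefetch
+  // length should shrink to at most 8KB here.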
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) {
+ size_t* prefetch_size = static_cast<size_t*>(arg);
+ EXPECT_LE(expected_lower_bound, *prefetch_size);
+ EXPECT_GE(expected_higher_bound, *prefetch_size);
+ called = true;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Put("1", "1");
+ Put("9", "1");
+ Flush();
+
+ expected_lower_bound = 0;
+ expected_higher_bound = 8 * 1024;
+
+ Put("1", "1");
+ Put("9", "1");
+ Flush();
+
+ Put("1", "1");
+ Put("9", "1");
+ Flush();
+
+ // Full compaction to make sure there is no L0 file after the open.
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ ASSERT_TRUE(called.load());
+ called = false;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ std::atomic<bool> first_call(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) {
+ size_t* prefetch_size = static_cast<size_t*>(arg);
+ if (first_call) {
+ EXPECT_EQ(4 * 1024, *prefetch_size);
+ first_call = false;
+ } else {
+ EXPECT_GE(4 * 1024, *prefetch_size);
+ }
+ called = true;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.max_file_opening_threads = 1; // one thread
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.max_open_files = -1;
+ Reopen(options);
+
+ Put("1", "1");
+ Put("9", "1");
+ Flush();
+
+ Put("1", "1");
+ Put("9", "1");
+ Flush();
+
+ ASSERT_TRUE(called.load());
+ called = false;
+
+  // Parallel loading of SST files
+ options.max_file_opening_threads = 16;
+ Reopen(options);
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ ASSERT_TRUE(called.load());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBTest2, TestGetColumnFamilyHandleUnlocked) {
+  // Set up a sync point dependency to reproduce the race condition in
+  // DBImpl::GetColumnFamilyHandleUnlocked.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked1",
+ "TestGetColumnFamilyHandleUnlocked::PreGetColumnFamilyHandleUnlocked2"},
+ {"TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked2",
+ "TestGetColumnFamilyHandleUnlocked::ReadColumnFamilyHandle1"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateColumnFamilies({"test1", "test2"}, Options());
+ ASSERT_EQ(handles_.size(), 2);
+
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ port::Thread user_thread1([&]() {
+ auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[0]->GetID());
+ ASSERT_EQ(cfh->GetID(), handles_[0]->GetID());
+ TEST_SYNC_POINT("TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked1");
+ TEST_SYNC_POINT("TestGetColumnFamilyHandleUnlocked::ReadColumnFamilyHandle1");
+ ASSERT_EQ(cfh->GetID(), handles_[0]->GetID());
+ });
+
+ port::Thread user_thread2([&]() {
+ TEST_SYNC_POINT("TestGetColumnFamilyHandleUnlocked::PreGetColumnFamilyHandleUnlocked2");
+ auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[1]->GetID());
+ ASSERT_EQ(cfh->GetID(), handles_[1]->GetID());
+ TEST_SYNC_POINT("TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked2");
+ ASSERT_EQ(cfh->GetID(), handles_[1]->GetID());
+ });
+
+ user_thread1.join();
+ user_thread2.join();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, TestCompactFiles) {
+  // Set up a sync point dependency to reproduce the race condition in
+  // DBImpl::GetColumnFamilyHandleUnlocked when CompactFiles runs concurrently
+  // with IngestExternalFile.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"TestCompactFiles::IngestExternalFile1",
+ "TestCompactFiles::IngestExternalFile2"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options;
+ options.num_levels = 2;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+ auto* handle = db_->DefaultColumnFamily();
+ ASSERT_EQ(db_->NumberLevels(handle), 2);
+
+ ROCKSDB_NAMESPACE::SstFileWriter sst_file_writer{
+ ROCKSDB_NAMESPACE::EnvOptions(), options};
+ std::string external_file1 = dbname_ + "/test_compact_files1.sst_t";
+ std::string external_file2 = dbname_ + "/test_compact_files2.sst_t";
+ std::string external_file3 = dbname_ + "/test_compact_files3.sst_t";
+
+ ASSERT_OK(sst_file_writer.Open(external_file1));
+ ASSERT_OK(sst_file_writer.Put("1", "1"));
+ ASSERT_OK(sst_file_writer.Put("2", "2"));
+ ASSERT_OK(sst_file_writer.Finish());
+
+ ASSERT_OK(sst_file_writer.Open(external_file2));
+ ASSERT_OK(sst_file_writer.Put("3", "3"));
+ ASSERT_OK(sst_file_writer.Put("4", "4"));
+ ASSERT_OK(sst_file_writer.Finish());
+
+ ASSERT_OK(sst_file_writer.Open(external_file3));
+ ASSERT_OK(sst_file_writer.Put("5", "5"));
+ ASSERT_OK(sst_file_writer.Put("6", "6"));
+ ASSERT_OK(sst_file_writer.Finish());
+
+ ASSERT_OK(db_->IngestExternalFile(handle, {external_file1, external_file3},
+ IngestExternalFileOptions()));
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), 2);
+ std::vector<std::string> files;
+ GetSstFiles(env_, dbname_, &files);
+ ASSERT_EQ(files.size(), 2);
+
+ port::Thread user_thread1(
+ [&]() { db_->CompactFiles(CompactionOptions(), handle, files, 1); });
+
+ port::Thread user_thread2([&]() {
+ ASSERT_OK(db_->IngestExternalFile(handle, {external_file2},
+ IngestExternalFileOptions()));
+ TEST_SYNC_POINT("TestCompactFiles::IngestExternalFile1");
+ });
+
+ user_thread1.join();
+ user_thread2.join();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif // ROCKSDB_LITE
+
+// TODO: figure out why this test fails in appveyor
+#ifndef OS_WIN
+TEST_F(DBTest2, MultiDBParallelOpenTest) {
+ const int kNumDbs = 2;
+ Options options = CurrentOptions();
+ std::vector<std::string> dbnames;
+ for (int i = 0; i < kNumDbs; ++i) {
+ dbnames.emplace_back(test::TmpDir(env_) + "/db" + ToString(i));
+ ASSERT_OK(DestroyDB(dbnames.back(), options));
+ }
+
+ // Verify empty DBs can be created in parallel
+ std::vector<std::thread> open_threads;
+ std::vector<DB*> dbs{static_cast<unsigned int>(kNumDbs), nullptr};
+ options.create_if_missing = true;
+ for (int i = 0; i < kNumDbs; ++i) {
+ open_threads.emplace_back(
+ [&](int dbnum) {
+ ASSERT_OK(DB::Open(options, dbnames[dbnum], &dbs[dbnum]));
+ },
+ i);
+ }
+
+ // Now add some data and close, so next we can verify non-empty DBs can be
+ // recovered in parallel
+ for (int i = 0; i < kNumDbs; ++i) {
+ open_threads[i].join();
+ ASSERT_OK(dbs[i]->Put(WriteOptions(), "xi", "gua"));
+ delete dbs[i];
+ }
+
+ // Verify non-empty DBs can be recovered in parallel
+ dbs.clear();
+ open_threads.clear();
+ for (int i = 0; i < kNumDbs; ++i) {
+ open_threads.emplace_back(
+ [&](int dbnum) {
+ ASSERT_OK(DB::Open(options, dbnames[dbnum], &dbs[dbnum]));
+ },
+ i);
+ }
+
+ // Wait and cleanup
+ for (int i = 0; i < kNumDbs; ++i) {
+ open_threads[i].join();
+ delete dbs[i];
+ ASSERT_OK(DestroyDB(dbnames[i], options));
+ }
+}
+#endif // OS_WIN
+
+namespace {
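+// A minimal Statistics implementation that only uses the legacy interface
+// (recordTick/measureTime) and counts how often each is called, so the test
+// below can verify that the old interface still receives events.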
+class DummyOldStats : public Statistics {
+ public:
+ uint64_t getTickerCount(uint32_t /*ticker_type*/) const override { return 0; }
+ void recordTick(uint32_t /* ticker_type */, uint64_t /* count */) override {
+ num_rt++;
+ }
+ void setTickerCount(uint32_t /*ticker_type*/, uint64_t /*count*/) override {}
+ uint64_t getAndResetTickerCount(uint32_t /*ticker_type*/) override {
+ return 0;
+ }
+ void measureTime(uint32_t /*histogram_type*/, uint64_t /*count*/) override {
+ num_mt++;
+ }
+ void histogramData(
+ uint32_t /*histogram_type*/,
+ ROCKSDB_NAMESPACE::HistogramData* const /*data*/) const override {}
+ std::string getHistogramString(uint32_t /*type*/) const override {
+ return "";
+ }
+ bool HistEnabledForType(uint32_t /*type*/) const override { return false; }
+ std::string ToString() const override { return ""; }
+ int num_rt = 0;
+ int num_mt = 0;
+};
+} // namespace
+
+TEST_F(DBTest2, OldStatsInterface) {
+ DummyOldStats* dos = new DummyOldStats();
+ std::shared_ptr<Statistics> stats(dos);
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = stats;
+ Reopen(options);
+
+ Put("foo", "bar");
+ ASSERT_EQ("bar", Get("foo"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("bar", Get("foo"));
+
+ ASSERT_GT(dos->num_rt, 0);
+ ASSERT_GT(dos->num_mt, 0);
+}
+
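+// Close() should fail while a snapshot is still unreleased and succeed once
+// the snapshot has been released.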
+TEST_F(DBTest2, CloseWithUnreleasedSnapshot) {
+ const Snapshot* ss = db_->GetSnapshot();
+
+ for (auto h : handles_) {
+ db_->DestroyColumnFamilyHandle(h);
+ }
+ handles_.clear();
+
+ ASSERT_NOK(db_->Close());
+ db_->ReleaseSnapshot(ss);
+ ASSERT_OK(db_->Close());
+ delete db_;
+ db_ = nullptr;
+}
+
+TEST_F(DBTest2, PrefixBloomReseek) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ // Construct two L1 files with keys:
+ // f1:[aaa1 ccc1] f2:[ddd0]
+ ASSERT_OK(Put("aaa1", ""));
+ ASSERT_OK(Put("ccc1", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("ddd0", ""));
+ ASSERT_OK(Flush());
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ ASSERT_OK(Put("bbb1", ""));
+
+ Iterator* iter = db_->NewIterator(ReadOptions());
+
+  // Seeking into f1, the iterator checks the bloom filter, which causes the
+  // file iterator to be invalidated, and the cursor is placed into f2, with
+  // the next key being "ddd0".
+ iter->Seek("bbb1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bbb1", iter->key().ToString());
+
+ // Reseek ccc1, the L1 iterator needs to go back to f1 and reseek.
+ iter->Seek("ccc1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("ccc1", iter->key().ToString());
+
+ delete iter;
+}
+
+TEST_F(DBTest2, PrefixBloomFilteredOut) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ // Construct two L1 files with keys:
+ // f1:[aaa1 ccc1] f2:[ddd0]
+ ASSERT_OK(Put("aaa1", ""));
+ ASSERT_OK(Put("ccc1", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("ddd0", ""));
+ ASSERT_OK(Flush());
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ Iterator* iter = db_->NewIterator(ReadOptions());
+
+  // The seek is filtered out by f1's bloom filter.
+  // Ending up invalid is just one of several positions allowed by the
+  // iterator contract; positioning at ccc1 or ddd0 would also be valid.
+  // This only validates the behavior of the current implementation; if the
+  // underlying implementation changes, the test might fail here.
+ iter->Seek("bbb1");
+ ASSERT_FALSE(iter->Valid());
+
+ delete iter;
+}
+
+#ifndef ROCKSDB_LITE
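+// Row cache lookups must respect snapshots: reads at an older snapshot must
+// not be served from entries cached for a newer version of the key.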
+TEST_F(DBTest2, RowCacheSnapshot) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.row_cache = NewLRUCache(8 * 8192);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo", "bar1"));
+
+ const Snapshot* s1 = db_->GetSnapshot();
+
+ ASSERT_OK(Put("foo", "bar2"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("foo2", "bar"));
+ const Snapshot* s2 = db_->GetSnapshot();
+ ASSERT_OK(Put("foo3", "bar"));
+ const Snapshot* s3 = db_->GetSnapshot();
+
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0);
+ ASSERT_EQ(Get("foo"), "bar2");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+ ASSERT_EQ(Get("foo"), "bar2");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+ ASSERT_EQ(Get("foo", s1), "bar1");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+ ASSERT_EQ(Get("foo", s2), "bar2");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 2);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+ ASSERT_EQ(Get("foo", s1), "bar1");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 3);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+ ASSERT_EQ(Get("foo", s3), "bar2");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 4);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+
+ db_->ReleaseSnapshot(s1);
+ db_->ReleaseSnapshot(s2);
+ db_->ReleaseSnapshot(s3);
+}
+#endif // ROCKSDB_LITE
+
+// When a DB is reopened with multiple column families, the manifest file
+// is written after the first CF is flushed, and again after each subsequent
+// flush. If the DB crashes between these flushes, the already-flushed CF
+// will have advanced past the latest log file; since we then require the
+// log not to be corrupted, a corruption report is triggered.
+// This test exercises crash recovery around these manifest writes.
+TEST_F(DBTest2, CrashInRecoveryMultipleCF) {
+ const std::vector<std::string> sync_points = {
+ "DBImpl::RecoverLogFiles:BeforeFlushFinalMemtable",
+ "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0"};
+ for (const auto& test_sync_point : sync_points) {
+ Options options = CurrentOptions();
+ // First destroy original db to ensure a clean start.
+ DestroyAndReopen(options);
+ options.create_if_missing = true;
+ options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put(1, "foo", "bar"));
+    // The value is large enough to be divided into two blocks.
+ std::string large_value(400, ' ');
+ ASSERT_OK(Put("foo1", large_value));
+ ASSERT_OK(Put("foo2", large_value));
+ Close();
+
+ // Corrupt the log file in the middle, so that it is not corrupted
+ // in the tail.
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+ for (const auto& f : filenames) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(f, &number, &type) && type == FileType::kLogFile) {
+ std::string fname = dbname_ + "/" + f;
+ std::string file_content;
+ ASSERT_OK(ReadFileToString(env_, fname, &file_content));
+ file_content[400] = 'h';
+ file_content[401] = 'a';
+ ASSERT_OK(WriteStringToFile(env_, file_content, fname));
+ break;
+ }
+ }
+
+ // Reopen and freeze the file system after the first manifest write.
+ FaultInjectionTestEnv fit_env(options.env);
+ options.env = &fit_env;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ test_sync_point,
+ [&](void* /*arg*/) { fit_env.SetFilesystemActive(false); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_NOK(TryReopenWithColumnFamilies(
+ {kDefaultColumnFamilyName, "pikachu"}, options));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ fit_env.SetFilesystemActive(true);
+    // If we keep using the fault injection Env, it complains when renaming
+    // the current file, which is not expected. This needs further
+    // investigation.
+ options.env = env_;
+ ASSERT_OK(TryReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"},
+ options));
+ }
+}
+
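+// Seeking into a file whose tail is entirely covered by a range deletion
+// should move on to the next file instead of returning a deleted key.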
+TEST_F(DBTest2, SeekFileRangeDeleteTail) {
+ Options options = CurrentOptions();
+ options.prefix_extractor.reset(NewCappedPrefixTransform(1));
+ options.num_levels = 3;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("a", "a"));
+ const Snapshot* s1 = db_->GetSnapshot();
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "f"));
+ ASSERT_OK(Put("b", "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("x", "a"));
+ ASSERT_OK(Put("z", "a"));
+ ASSERT_OK(Flush());
+
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ {
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+ iter->Seek("e");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("x", iter->key().ToString());
+ }
+ db_->ReleaseSnapshot(s1);
+}
+
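+// With avoid_unnecessary_blocking_io, deleting an iterator defers memtable
+// cleanup to a background purge, so memory tracked by the write buffer
+// manager is only released after the purge has run.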
+TEST_F(DBTest2, BackgroundPurgeTest) {
+ Options options = CurrentOptions();
+ options.write_buffer_manager =
+ std::make_shared<ROCKSDB_NAMESPACE::WriteBufferManager>(1 << 20);
+ options.avoid_unnecessary_blocking_io = true;
+ DestroyAndReopen(options);
+ size_t base_value = options.write_buffer_manager->memory_usage();
+
+ ASSERT_OK(Put("a", "a"));
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ ASSERT_OK(Flush());
+ size_t value = options.write_buffer_manager->memory_usage();
+ ASSERT_GT(value, base_value);
+
+ db_->GetEnv()->SetBackgroundThreads(1, Env::Priority::HIGH);
+ test::SleepingBackgroundTask sleeping_task_after;
+ db_->GetEnv()->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_after, Env::Priority::HIGH);
+ delete iter;
+
+ Env::Default()->SleepForMicroseconds(100000);
+ value = options.write_buffer_manager->memory_usage();
+ ASSERT_GT(value, base_value);
+
+ sleeping_task_after.WakeUp();
+ sleeping_task_after.WaitUntilDone();
+
+ test::SleepingBackgroundTask sleeping_task_after2;
+ db_->GetEnv()->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_after2, Env::Priority::HIGH);
+ sleeping_task_after2.WakeUp();
+ sleeping_task_after2.WaitUntilDone();
+
+ value = options.write_buffer_manager->memory_usage();
+ ASSERT_EQ(base_value, value);
+}
+
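+// Switching the memtable during Flush must not race with a new MANIFEST
+// being created due to the tiny max_manifest_file_size.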
+TEST_F(DBTest2, SwitchMemtableRaceWithNewManifest) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ options.max_manifest_file_size = 10;
+ options.create_if_missing = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_EQ(2, handles_.size());
+
+ ASSERT_OK(Put("foo", "value"));
+ const int kL0Files = options.level0_file_num_compaction_trigger;
+ for (int i = 0; i < kL0Files; ++i) {
+ ASSERT_OK(Put(/*cf=*/1, "a", std::to_string(i)));
+ ASSERT_OK(Flush(/*cf=*/1));
+ }
+
+ port::Thread thread([&]() { ASSERT_OK(Flush()); });
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ thread.join();
+}
+
+TEST_F(DBTest2, SameSmallestInSameLevel) {
+  // This test validates the fractional cascading logic when several files at
+  // one level contain only the same user key.
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("key", "1"));
+ ASSERT_OK(Put("key", "2"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "3"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "4"));
+ Flush();
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ ASSERT_OK(dbfull()->CompactRange(cro, db_->DefaultColumnFamily(), nullptr,
+ nullptr));
+
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "5"));
+ Flush();
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "6"));
+ Flush();
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "7"));
+ Flush();
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "8"));
+ Flush();
+ dbfull()->TEST_WaitForCompact(true);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,4,1", FilesPerLevel());
+#endif // ROCKSDB_LITE
+
+ ASSERT_EQ("2,3,4,5,6,7,8", Get("key"));
+}
+
+TEST_F(DBTest2, BlockBasedTablePrefixIndexSeekForPrev) {
+ // create a DB with block prefix index
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+ table_options.block_size = 300;
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ table_options.index_shortening =
+ BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+ Reopen(options);
+
+ Random rnd(301);
+ std::string large_value = RandomString(&rnd, 500);
+
+ ASSERT_OK(Put("a1", large_value));
+ ASSERT_OK(Put("x1", large_value));
+ ASSERT_OK(Put("y1", large_value));
+ Flush();
+
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+ iterator->SeekForPrev("x3");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("x1", iterator->key().ToString());
+
+ iterator->SeekForPrev("a3");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("a1", iterator->key().ToString());
+
+ iterator->SeekForPrev("y3");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("y1", iterator->key().ToString());
+
+    // Query more than one non-existing prefix to cover both the case of an
+    // empty hash bucket and that of a hash bucket conflict.
+ iterator->SeekForPrev("b1");
+ // Result should be not valid or "a1".
+ if (iterator->Valid()) {
+ ASSERT_EQ("a1", iterator->key().ToString());
+ }
+
+ iterator->SeekForPrev("c1");
+ // Result should be not valid or "a1".
+ if (iterator->Valid()) {
+ ASSERT_EQ("a1", iterator->key().ToString());
+ }
+
+ iterator->SeekForPrev("d1");
+ // Result should be not valid or "a1".
+ if (iterator->Valid()) {
+ ASSERT_EQ("a1", iterator->key().ToString());
+ }
+
+ iterator->SeekForPrev("y3");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("y1", iterator->key().ToString());
+ }
+}
+
+TEST_F(DBTest2, ChangePrefixExtractor) {
+ for (bool use_partitioned_filter : {true, false}) {
+ // create a DB with block prefix index
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+
+    // Sometimes the filter is checked based on the upper bound. Assert on
+    // counters in that case; otherwise, only check data correctness.
+#ifndef ROCKSDB_LITE
+ bool expect_filter_check = !use_partitioned_filter;
+#else
+ bool expect_filter_check = false;
+#endif
+ table_options.partition_filters = use_partitioned_filter;
+ if (use_partitioned_filter) {
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ }
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.statistics = CreateDBStatistics();
+
+ options.prefix_extractor.reset(NewFixedPrefixTransform(2));
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+
+ ASSERT_OK(Put("aa", ""));
+ ASSERT_OK(Put("xb", ""));
+ ASSERT_OK(Put("xx1", ""));
+ ASSERT_OK(Put("xz1", ""));
+ ASSERT_OK(Put("zz", ""));
+ Flush();
+
+    // After reopening the DB with the prefix size changed from 2 to 1, the
+    // prefix extractor only takes effect when it cannot change the results
+    // given the upper bound and the seek key.
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ Reopen(options);
+
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+ iterator->Seek("xa");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xb", iterator->key().ToString());
+ // It's a bug that the counter BLOOM_FILTER_PREFIX_CHECKED is not
+ // correct in this case. So don't check counters in this case.
+ if (expect_filter_check) {
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ iterator->Seek("xz");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xz1", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+ }
+
+ std::string ub_str = "xg9";
+ Slice ub(ub_str);
+ ReadOptions ro;
+ ro.iterate_upper_bound = &ub;
+
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+
+      // SeekForPrev() never uses the prefix bloom filter if the prefix
+      // extractor has changed.
+ iterator->SeekForPrev("xg0");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xb", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+ }
+
+ ub_str = "xx9";
+ ub = Slice(ub_str);
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+
+ iterator->Seek("x");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xb", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ iterator->Seek("xx0");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xx1", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+ }
+
+ CompactRangeOptions compact_range_opts;
+ compact_range_opts.bottommost_level_compaction =
+ BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));
+ ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));
+
+ // Re-execute similar queries after a full compaction
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+
+ iterator->Seek("x");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xb", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(2, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ iterator->Seek("xg");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xx1", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ iterator->Seek("xz");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xz1", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+ }
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+
+ iterator->SeekForPrev("xx0");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xb", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(5, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ iterator->Seek("xx0");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xx1", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(6, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+ }
+
+ ub_str = "xg9";
+ ub = Slice(ub_str);
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ iterator->SeekForPrev("xg0");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xb", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(7, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+ }
+ }
+}
+
+TEST_F(DBTest2, BlockBasedTablePrefixGetIndexNotFound) {
+ // create a DB with block prefix index
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+ table_options.block_size = 300;
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ table_options.index_shortening =
+ BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.level0_file_num_compaction_trigger = 8;
+
+ Reopen(options);
+
+ ASSERT_OK(Put("b1", "ok"));
+ Flush();
+
+  // Flush several files so that there is a high chance that the hash bucket
+  // for "b" is empty in at least one of the files.
+ ASSERT_OK(Put("a1", ""));
+ ASSERT_OK(Put("c1", ""));
+ Flush();
+
+ ASSERT_OK(Put("a2", ""));
+ ASSERT_OK(Put("c2", ""));
+ Flush();
+
+ ASSERT_OK(Put("a3", ""));
+ ASSERT_OK(Put("c3", ""));
+ Flush();
+
+ ASSERT_OK(Put("a4", ""));
+ ASSERT_OK(Put("c4", ""));
+ Flush();
+
+ ASSERT_OK(Put("a5", ""));
+ ASSERT_OK(Put("c5", ""));
+ Flush();
+
+ ASSERT_EQ("ok", Get("b1"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, AutoPrefixMode1) {
+  // Create a DB with a prefix extractor and a prefix bloom filter.
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.statistics = CreateDBStatistics();
+
+ Reopen(options);
+
+ Random rnd(301);
+ std::string large_value = RandomString(&rnd, 500);
+
+ ASSERT_OK(Put("a1", large_value));
+ ASSERT_OK(Put("x1", large_value));
+ ASSERT_OK(Put("y1", large_value));
+ Flush();
+
+ ReadOptions ro;
+ ro.total_order_seek = false;
+ ro.auto_prefix_mode = true;
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ iterator->Seek("b1");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("x1", iterator->key().ToString());
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ std::string ub_str = "b9";
+ Slice ub(ub_str);
+ ro.iterate_upper_bound = &ub;
+
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ iterator->Seek("b1");
+ ASSERT_FALSE(iterator->Valid());
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ ub_str = "z";
+ ub = Slice(ub_str);
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ iterator->Seek("b1");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("x1", iterator->key().ToString());
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ ub_str = "c";
+ ub = Slice(ub_str);
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ iterator->Seek("b1");
+ ASSERT_FALSE(iterator->Valid());
+ ASSERT_EQ(2, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ // The same queries without recreating iterator
+ {
+ ub_str = "b9";
+ ub = Slice(ub_str);
+ ro.iterate_upper_bound = &ub;
+
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ iterator->Seek("b1");
+ ASSERT_FALSE(iterator->Valid());
+ ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+
+ ub_str = "z";
+ ub = Slice(ub_str);
+
+ iterator->Seek("b1");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("x1", iterator->key().ToString());
+ ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+
+ ub_str = "c";
+ ub = Slice(ub_str);
+
+ iterator->Seek("b1");
+ ASSERT_FALSE(iterator->Valid());
+ ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+
+ ub_str = "b9";
+ ub = Slice(ub_str);
+ ro.iterate_upper_bound = &ub;
+ iterator->SeekForPrev("b1");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("a1", iterator->key().ToString());
+ ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+
+ ub_str = "zz";
+ ub = Slice(ub_str);
+ ro.iterate_upper_bound = &ub;
+ iterator->SeekToLast();
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("y1", iterator->key().ToString());
+
+ iterator->SeekToFirst();
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("a1", iterator->key().ToString());
+ }
+}
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
+
+#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+extern "C" {
+void RegisterCustomObjects(int argc, char** argv);
+}
+#else
+void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {}
+#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_test_util.cc b/src/rocksdb/db/db_test_util.cc
new file mode 100644
index 000000000..c73abde41
--- /dev/null
+++ b/src/rocksdb/db/db_test_util.cc
@@ -0,0 +1,1564 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "db/forward_iterator.h"
+#include "rocksdb/env_encryption.h"
+#include "rocksdb/utilities/object_registry.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Special Env used to delay background operations
+
+SpecialEnv::SpecialEnv(Env* base)
+ : EnvWrapper(base),
+ rnd_(301),
+ sleep_counter_(this),
+ addon_time_(0),
+ time_elapse_only_sleep_(false),
+ no_slowdown_(false) {
+ delay_sstable_sync_.store(false, std::memory_order_release);
+ drop_writes_.store(false, std::memory_order_release);
+ no_space_.store(false, std::memory_order_release);
+ non_writable_.store(false, std::memory_order_release);
+ count_random_reads_ = false;
+ count_sequential_reads_ = false;
+ manifest_sync_error_.store(false, std::memory_order_release);
+ manifest_write_error_.store(false, std::memory_order_release);
+ log_write_error_.store(false, std::memory_order_release);
+ random_file_open_counter_.store(0, std::memory_order_relaxed);
+ delete_count_.store(0, std::memory_order_relaxed);
+ num_open_wal_file_.store(0);
+ log_write_slowdown_ = 0;
+ bytes_written_ = 0;
+ sync_counter_ = 0;
+ non_writeable_rate_ = 0;
+ new_writable_count_ = 0;
+ non_writable_count_ = 0;
+ table_write_callback_ = nullptr;
+}
+#ifndef ROCKSDB_LITE
+ROT13BlockCipher rot13Cipher_(16);
+#endif // ROCKSDB_LITE
+
+DBTestBase::DBTestBase(const std::string path)
+ : mem_env_(nullptr), encrypted_env_(nullptr), option_config_(kDefault) {
+ Env* base_env = Env::Default();
+#ifndef ROCKSDB_LITE
+ const char* test_env_uri = getenv("TEST_ENV_URI");
+ if (test_env_uri) {
+ Env* test_env = nullptr;
+ Status s = Env::LoadEnv(test_env_uri, &test_env, &env_guard_);
+ base_env = test_env;
+ EXPECT_OK(s);
+ EXPECT_NE(Env::Default(), base_env);
+ }
+#endif // !ROCKSDB_LITE
+ EXPECT_NE(nullptr, base_env);
+ if (getenv("MEM_ENV")) {
+ mem_env_ = new MockEnv(base_env);
+ }
+#ifndef ROCKSDB_LITE
+ if (getenv("ENCRYPTED_ENV")) {
+ encrypted_env_ = NewEncryptedEnv(mem_env_ ? mem_env_ : base_env,
+ new CTREncryptionProvider(rot13Cipher_));
+ }
+#endif // !ROCKSDB_LITE
+ env_ = new SpecialEnv(encrypted_env_ ? encrypted_env_
+ : (mem_env_ ? mem_env_ : base_env));
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ dbname_ = test::PerThreadDBPath(env_, path);
+ alternative_wal_dir_ = dbname_ + "/wal";
+ alternative_db_log_dir_ = dbname_ + "/db_log_dir";
+ auto options = CurrentOptions();
+ options.env = env_;
+ auto delete_options = options;
+ delete_options.wal_dir = alternative_wal_dir_;
+ EXPECT_OK(DestroyDB(dbname_, delete_options));
+  // Destroy it again in case the alternative WAL dir is not used.
+ EXPECT_OK(DestroyDB(dbname_, options));
+ db_ = nullptr;
+ Reopen(options);
+ Random::GetTLSInstance()->Reset(0xdeadbeef);
+}
+
+DBTestBase::~DBTestBase() {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ Close();
+ Options options;
+ options.db_paths.emplace_back(dbname_, 0);
+ options.db_paths.emplace_back(dbname_ + "_2", 0);
+ options.db_paths.emplace_back(dbname_ + "_3", 0);
+ options.db_paths.emplace_back(dbname_ + "_4", 0);
+ options.env = env_;
+
+ if (getenv("KEEP_DB")) {
+ printf("DB is still at %s\n", dbname_.c_str());
+ } else {
+ EXPECT_OK(DestroyDB(dbname_, options));
+ }
+ delete env_;
+}
+
+bool DBTestBase::ShouldSkipOptions(int option_config, int skip_mask) {
+#ifdef ROCKSDB_LITE
+ // These options are not supported in ROCKSDB_LITE
+ if (option_config == kHashSkipList ||
+ option_config == kPlainTableFirstBytePrefix ||
+ option_config == kPlainTableCappedPrefix ||
+ option_config == kPlainTableCappedPrefixNonMmap ||
+ option_config == kPlainTableAllBytesPrefix ||
+ option_config == kVectorRep || option_config == kHashLinkList ||
+ option_config == kUniversalCompaction ||
+ option_config == kUniversalCompactionMultiLevel ||
+ option_config == kUniversalSubcompactions ||
+ option_config == kFIFOCompaction ||
+ option_config == kConcurrentSkipList) {
+ return true;
+ }
+#endif
+
+ if ((skip_mask & kSkipUniversalCompaction) &&
+ (option_config == kUniversalCompaction ||
+ option_config == kUniversalCompactionMultiLevel ||
+ option_config == kUniversalSubcompactions)) {
+ return true;
+ }
+ if ((skip_mask & kSkipMergePut) && option_config == kMergePut) {
+ return true;
+ }
+ if ((skip_mask & kSkipNoSeekToLast) &&
+ (option_config == kHashLinkList || option_config == kHashSkipList)) {
+ return true;
+ }
+ if ((skip_mask & kSkipPlainTable) &&
+ (option_config == kPlainTableAllBytesPrefix ||
+ option_config == kPlainTableFirstBytePrefix ||
+ option_config == kPlainTableCappedPrefix ||
+ option_config == kPlainTableCappedPrefixNonMmap)) {
+ return true;
+ }
+ if ((skip_mask & kSkipHashIndex) &&
+ (option_config == kBlockBasedTableWithPrefixHashIndex ||
+ option_config == kBlockBasedTableWithWholeKeyHashIndex)) {
+ return true;
+ }
+ if ((skip_mask & kSkipFIFOCompaction) && option_config == kFIFOCompaction) {
+ return true;
+ }
+ if ((skip_mask & kSkipMmapReads) && option_config == kWalDirAndMmapReads) {
+ return true;
+ }
+ return false;
+}
+
+// Switch to a fresh database with the next option configuration to
+// test. Return false if there are no more configurations to test.
+bool DBTestBase::ChangeOptions(int skip_mask) {
+ for (option_config_++; option_config_ < kEnd; option_config_++) {
+ if (ShouldSkipOptions(option_config_, skip_mask)) {
+ continue;
+ }
+ break;
+ }
+
+ if (option_config_ >= kEnd) {
+ Destroy(last_options_);
+ return false;
+ } else {
+ auto options = CurrentOptions();
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ return true;
+ }
+}
+
+// Switch between different compaction styles.
+bool DBTestBase::ChangeCompactOptions() {
+ if (option_config_ == kDefault) {
+ option_config_ = kUniversalCompaction;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ options.create_if_missing = true;
+ TryReopen(options);
+ return true;
+ } else if (option_config_ == kUniversalCompaction) {
+ option_config_ = kUniversalCompactionMultiLevel;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ options.create_if_missing = true;
+ TryReopen(options);
+ return true;
+ } else if (option_config_ == kUniversalCompactionMultiLevel) {
+ option_config_ = kLevelSubcompactions;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ assert(options.max_subcompactions > 1);
+ TryReopen(options);
+ return true;
+ } else if (option_config_ == kLevelSubcompactions) {
+ option_config_ = kUniversalSubcompactions;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ assert(options.max_subcompactions > 1);
+ TryReopen(options);
+ return true;
+ } else {
+ return false;
+ }
+}
+
+// Switch between different WAL settings
+bool DBTestBase::ChangeWalOptions() {
+ if (option_config_ == kDefault) {
+ option_config_ = kDBLogDir;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ Destroy(options);
+ options.create_if_missing = true;
+ TryReopen(options);
+ return true;
+ } else if (option_config_ == kDBLogDir) {
+ option_config_ = kWalDirAndMmapReads;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ Destroy(options);
+ options.create_if_missing = true;
+ TryReopen(options);
+ return true;
+ } else if (option_config_ == kWalDirAndMmapReads) {
+ option_config_ = kRecycleLogFiles;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ Destroy(options);
+ TryReopen(options);
+ return true;
+ } else {
+ return false;
+ }
+}
+
+// Switch between different filter policies.
+// Jump from kDefault to kFilter, then to the full-filter and
+// partitioned-filter configurations.
+bool DBTestBase::ChangeFilterOptions() {
+ if (option_config_ == kDefault) {
+ option_config_ = kFilter;
+ } else if (option_config_ == kFilter) {
+ option_config_ = kFullFilterWithNewTableReaderForCompactions;
+ } else if (option_config_ == kFullFilterWithNewTableReaderForCompactions) {
+ option_config_ = kPartitionedFilterWithNewTableReaderForCompactions;
+ } else {
+ return false;
+ }
+ Destroy(last_options_);
+
+ auto options = CurrentOptions();
+ options.create_if_missing = true;
+ TryReopen(options);
+ return true;
+}
+
+// Switch between different DB options for file ingestion tests.
+bool DBTestBase::ChangeOptionsForFileIngestionTest() {
+ if (option_config_ == kDefault) {
+ option_config_ = kUniversalCompaction;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ options.create_if_missing = true;
+ TryReopen(options);
+ return true;
+ } else if (option_config_ == kUniversalCompaction) {
+ option_config_ = kUniversalCompactionMultiLevel;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ options.create_if_missing = true;
+ TryReopen(options);
+ return true;
+ } else if (option_config_ == kUniversalCompactionMultiLevel) {
+ option_config_ = kLevelSubcompactions;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ assert(options.max_subcompactions > 1);
+ TryReopen(options);
+ return true;
+ } else if (option_config_ == kLevelSubcompactions) {
+ option_config_ = kUniversalSubcompactions;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ assert(options.max_subcompactions > 1);
+ TryReopen(options);
+ return true;
+ } else if (option_config_ == kUniversalSubcompactions) {
+ option_config_ = kDirectIO;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ TryReopen(options);
+ return true;
+ } else {
+ return false;
+ }
+}
+
+// Return the current option configuration.
+Options DBTestBase::CurrentOptions(
+ const anon::OptionsOverride& options_override) const {
+ return GetOptions(option_config_, GetDefaultOptions(), options_override);
+}
+
+Options DBTestBase::CurrentOptions(
+ const Options& default_options,
+ const anon::OptionsOverride& options_override) const {
+ return GetOptions(option_config_, default_options, options_override);
+}
+
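+// Baseline options shared by all configurations; GetOptions() overrides
+// these per option config.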
+Options DBTestBase::GetDefaultOptions() {
+ Options options;
+ options.write_buffer_size = 4090 * 4096;
+ options.target_file_size_base = 2 * 1024 * 1024;
+ options.max_bytes_for_level_base = 10 * 1024 * 1024;
+ options.max_open_files = 5000;
+ options.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords;
+ options.compaction_pri = CompactionPri::kByCompensatedSize;
+ return options;
+}
+
+Options DBTestBase::GetOptions(
+ int option_config, const Options& default_options,
+ const anon::OptionsOverride& options_override) const {
+  // This redundant copy is to minimize code changes without causing a lint
+  // error.
+ Options options = default_options;
+ BlockBasedTableOptions table_options;
+ bool set_block_based_table_factory = true;
+#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \
+ !defined(OS_AIX)
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "NewRandomAccessFile:O_DIRECT");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "NewWritableFile:O_DIRECT");
+#endif
+
+ bool can_allow_mmap = IsMemoryMappedAccessSupported();
+ switch (option_config) {
+#ifndef ROCKSDB_LITE
+ case kHashSkipList:
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.memtable_factory.reset(NewHashSkipListRepFactory(16));
+ options.allow_concurrent_memtable_write = false;
+ options.unordered_write = false;
+ break;
+ case kPlainTableFirstBytePrefix:
+ options.table_factory.reset(new PlainTableFactory());
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.allow_mmap_reads = can_allow_mmap;
+ options.max_sequential_skip_in_iterations = 999999;
+ set_block_based_table_factory = false;
+ break;
+ case kPlainTableCappedPrefix:
+ options.table_factory.reset(new PlainTableFactory());
+ options.prefix_extractor.reset(NewCappedPrefixTransform(8));
+ options.allow_mmap_reads = can_allow_mmap;
+ options.max_sequential_skip_in_iterations = 999999;
+ set_block_based_table_factory = false;
+ break;
+ case kPlainTableCappedPrefixNonMmap:
+ options.table_factory.reset(new PlainTableFactory());
+ options.prefix_extractor.reset(NewCappedPrefixTransform(8));
+ options.allow_mmap_reads = false;
+ options.max_sequential_skip_in_iterations = 999999;
+ set_block_based_table_factory = false;
+ break;
+ case kPlainTableAllBytesPrefix:
+ options.table_factory.reset(new PlainTableFactory());
+ options.prefix_extractor.reset(NewNoopTransform());
+ options.allow_mmap_reads = can_allow_mmap;
+ options.max_sequential_skip_in_iterations = 999999;
+ set_block_based_table_factory = false;
+ break;
+ case kVectorRep:
+ options.memtable_factory.reset(new VectorRepFactory(100));
+ options.allow_concurrent_memtable_write = false;
+ options.unordered_write = false;
+ break;
+ case kHashLinkList:
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.memtable_factory.reset(
+ NewHashLinkListRepFactory(4, 0, 3, true, 4));
+ options.allow_concurrent_memtable_write = false;
+ options.unordered_write = false;
+ break;
+ case kDirectIO: {
+ options.use_direct_reads = true;
+ options.use_direct_io_for_flush_and_compaction = true;
+ options.compaction_readahead_size = 2 * 1024 * 1024;
+ #if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \
+ !defined(OS_AIX) && !defined(OS_OPENBSD)
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "NewWritableFile:O_DIRECT", [&](void* arg) {
+ int* val = static_cast<int*>(arg);
+ *val &= ~O_DIRECT;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "NewRandomAccessFile:O_DIRECT", [&](void* arg) {
+ int* val = static_cast<int*>(arg);
+ *val &= ~O_DIRECT;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+#endif
+ break;
+ }
+#endif // ROCKSDB_LITE
+ case kMergePut:
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ break;
+ case kFilter:
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+ break;
+ case kFullFilterWithNewTableReaderForCompactions:
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ options.new_table_reader_for_compaction_inputs = true;
+ options.compaction_readahead_size = 10 * 1024 * 1024;
+ break;
+ case kPartitionedFilterWithNewTableReaderForCompactions:
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ table_options.partition_filters = true;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ options.new_table_reader_for_compaction_inputs = true;
+ options.compaction_readahead_size = 10 * 1024 * 1024;
+ break;
+ case kUncompressed:
+ options.compression = kNoCompression;
+ break;
+ case kNumLevel_3:
+ options.num_levels = 3;
+ break;
+ case kDBLogDir:
+ options.db_log_dir = alternative_db_log_dir_;
+ break;
+ case kWalDirAndMmapReads:
+ options.wal_dir = alternative_wal_dir_;
+      // mmap reads should be orthogonal to the WAL dir setting, so we
+      // piggyback on this option config to test mmap reads as well.
+ options.allow_mmap_reads = can_allow_mmap;
+ break;
+ case kManifestFileSize:
+ options.max_manifest_file_size = 50; // 50 bytes
+ break;
+ case kPerfOptions:
+ options.soft_rate_limit = 2.0;
+ options.delayed_write_rate = 8 * 1024 * 1024;
+ options.report_bg_io_stats = true;
+ // TODO(3.13) -- test more options
+ break;
+ case kUniversalCompaction:
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ break;
+ case kUniversalCompactionMultiLevel:
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 8;
+ break;
+ case kCompressedBlockCache:
+ options.allow_mmap_writes = can_allow_mmap;
+ table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024);
+ break;
+ case kInfiniteMaxOpenFiles:
+ options.max_open_files = -1;
+ break;
+ case kxxHashChecksum: {
+ table_options.checksum = kxxHash;
+ break;
+ }
+ case kxxHash64Checksum: {
+ table_options.checksum = kxxHash64;
+ break;
+ }
+ case kFIFOCompaction: {
+ options.compaction_style = kCompactionStyleFIFO;
+ break;
+ }
+ case kBlockBasedTableWithPrefixHashIndex: {
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ break;
+ }
+ case kBlockBasedTableWithWholeKeyHashIndex: {
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ options.prefix_extractor.reset(NewNoopTransform());
+ break;
+ }
+ case kBlockBasedTableWithPartitionedIndex: {
+ table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+ options.prefix_extractor.reset(NewNoopTransform());
+ break;
+ }
+ case kBlockBasedTableWithPartitionedIndexFormat4: {
+ table_options.format_version = 4;
+      // Format 4 changes the binary index format. Since a partitioned index
+      // is a super-set of simple indexes, we also use kTwoLevelIndexSearch to
+      // test this format.
+ table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+      // The top-level index in partitioned filters is also affected by format 4.
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ table_options.partition_filters = true;
+ table_options.index_block_restart_interval = 8;
+ break;
+ }
+ case kBlockBasedTableWithIndexRestartInterval: {
+ table_options.index_block_restart_interval = 8;
+ break;
+ }
+ case kOptimizeFiltersForHits: {
+ options.optimize_filters_for_hits = true;
+ set_block_based_table_factory = true;
+ break;
+ }
+ case kRowCache: {
+ options.row_cache = NewLRUCache(1024 * 1024);
+ break;
+ }
+ case kRecycleLogFiles: {
+ options.recycle_log_file_num = 2;
+ break;
+ }
+ case kLevelSubcompactions: {
+ options.max_subcompactions = 4;
+ break;
+ }
+ case kUniversalSubcompactions: {
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 8;
+ options.max_subcompactions = 4;
+ break;
+ }
+ case kConcurrentSkipList: {
+ options.allow_concurrent_memtable_write = true;
+ options.enable_write_thread_adaptive_yield = true;
+ break;
+ }
+ case kPipelinedWrite: {
+ options.enable_pipelined_write = true;
+ break;
+ }
+ case kConcurrentWALWrites: {
+      // These options optimize the 2PC commit path.
+ options.two_write_queues = true;
+ options.manual_wal_flush = true;
+ break;
+ }
+ case kUnorderedWrite: {
+ options.allow_concurrent_memtable_write = false;
+ options.unordered_write = false;
+ break;
+ }
+
+ default:
+ break;
+ }
+
+ if (options_override.filter_policy) {
+ table_options.filter_policy = options_override.filter_policy;
+ table_options.partition_filters = options_override.partition_filters;
+ table_options.metadata_block_size = options_override.metadata_block_size;
+ }
+ if (set_block_based_table_factory) {
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ }
+ options.env = env_;
+ options.create_if_missing = true;
+ options.fail_if_options_file_error = true;
+ return options;
+}
+
+void DBTestBase::CreateColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ ColumnFamilyOptions cf_opts(options);
+ size_t cfi = handles_.size();
+ handles_.resize(cfi + cfs.size());
+ for (auto cf : cfs) {
+ Status s = db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]);
+ ASSERT_OK(s);
+ }
+}
+
+void DBTestBase::CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+ const Options& options) {
+ CreateColumnFamilies(cfs, options);
+ std::vector<std::string> cfs_plus_default = cfs;
+ cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
+ ReopenWithColumnFamilies(cfs_plus_default, options);
+}
+
+void DBTestBase::ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const std::vector<Options>& options) {
+ ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+}
+
+void DBTestBase::ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+}
+
+Status DBTestBase::TryReopenWithColumnFamilies(
+ const std::vector<std::string>& cfs, const std::vector<Options>& options) {
+ Close();
+ EXPECT_EQ(cfs.size(), options.size());
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (size_t i = 0; i < cfs.size(); ++i) {
+ column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i]));
+ }
+ DBOptions db_opts = DBOptions(options[0]);
+ last_options_ = options[0];
+ return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+}
+
+Status DBTestBase::TryReopenWithColumnFamilies(
+ const std::vector<std::string>& cfs, const Options& options) {
+ Close();
+ std::vector<Options> v_opts(cfs.size(), options);
+ return TryReopenWithColumnFamilies(cfs, v_opts);
+}
+
+void DBTestBase::Reopen(const Options& options) {
+ ASSERT_OK(TryReopen(options));
+}
+
+void DBTestBase::Close() {
+ for (auto h : handles_) {
+ db_->DestroyColumnFamilyHandle(h);
+ }
+ handles_.clear();
+ delete db_;
+ db_ = nullptr;
+}
+
+void DBTestBase::DestroyAndReopen(const Options& options) {
+ // Destroy using last options
+ Destroy(last_options_);
+ ASSERT_OK(TryReopen(options));
+}
+
+void DBTestBase::Destroy(const Options& options, bool delete_cf_paths) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ if (delete_cf_paths) {
+ for (size_t i = 0; i < handles_.size(); ++i) {
+ ColumnFamilyDescriptor cfdescriptor;
+ handles_[i]->GetDescriptor(&cfdescriptor);
+ column_families.push_back(cfdescriptor);
+ }
+ }
+ Close();
+ ASSERT_OK(DestroyDB(dbname_, options, column_families));
+}
+
+Status DBTestBase::ReadOnlyReopen(const Options& options) {
+ return DB::OpenForReadOnly(options, dbname_, &db_);
+}
+
+Status DBTestBase::TryReopen(const Options& options) {
+ Close();
+ last_options_.table_factory.reset();
+  // Note: operator= is an unsafe approach here since it destructs the
+  // std::shared_ptr members in the order of their creation, in contrast to
+  // destructors, which destruct them in the reverse order of creation. One
+  // particular problem is that the cache destructor might invoke callback
+  // functions that use Option members such as statistics. To work around this
+  // problem, we manually reset table_factory, which eventually clears the
+  // block cache.
+ last_options_ = options;
+ return DB::Open(options, dbname_, &db_);
+}
+
+bool DBTestBase::IsDirectIOSupported() {
+ return test::IsDirectIOSupported(env_, dbname_);
+}
+
+bool DBTestBase::IsMemoryMappedAccessSupported() const {
+ return (!encrypted_env_);
+}
+
+Status DBTestBase::Flush(int cf) {
+ if (cf == 0) {
+ return db_->Flush(FlushOptions());
+ } else {
+ return db_->Flush(FlushOptions(), handles_[cf]);
+ }
+}
+
+Status DBTestBase::Flush(const std::vector<int>& cf_ids) {
+ std::vector<ColumnFamilyHandle*> cfhs;
+ std::for_each(cf_ids.begin(), cf_ids.end(),
+ [&cfhs, this](int id) { cfhs.emplace_back(handles_[id]); });
+ return db_->Flush(FlushOptions(), cfhs);
+}
+
+Status DBTestBase::Put(const Slice& k, const Slice& v, WriteOptions wo) {
+ if (kMergePut == option_config_) {
+ return db_->Merge(wo, k, v);
+ } else {
+ return db_->Put(wo, k, v);
+ }
+}
+
+Status DBTestBase::Put(int cf, const Slice& k, const Slice& v,
+ WriteOptions wo) {
+ if (kMergePut == option_config_) {
+ return db_->Merge(wo, handles_[cf], k, v);
+ } else {
+ return db_->Put(wo, handles_[cf], k, v);
+ }
+}
+
+Status DBTestBase::Merge(const Slice& k, const Slice& v, WriteOptions wo) {
+ return db_->Merge(wo, k, v);
+}
+
+Status DBTestBase::Merge(int cf, const Slice& k, const Slice& v,
+ WriteOptions wo) {
+ return db_->Merge(wo, handles_[cf], k, v);
+}
+
+Status DBTestBase::Delete(const std::string& k) {
+ return db_->Delete(WriteOptions(), k);
+}
+
+Status DBTestBase::Delete(int cf, const std::string& k) {
+ return db_->Delete(WriteOptions(), handles_[cf], k);
+}
+
+Status DBTestBase::SingleDelete(const std::string& k) {
+ return db_->SingleDelete(WriteOptions(), k);
+}
+
+Status DBTestBase::SingleDelete(int cf, const std::string& k) {
+ return db_->SingleDelete(WriteOptions(), handles_[cf], k);
+}
+
+bool DBTestBase::SetPreserveDeletesSequenceNumber(SequenceNumber sn) {
+ return db_->SetPreserveDeletesSequenceNumber(sn);
+}
+
+std::string DBTestBase::Get(const std::string& k, const Snapshot* snapshot) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ std::string result;
+ Status s = db_->Get(options, k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+}
+
+std::string DBTestBase::Get(int cf, const std::string& k,
+ const Snapshot* snapshot) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ std::string result;
+ Status s = db_->Get(options, handles_[cf], k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+}
+
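+// MultiGet across the given column families; uses the batched MultiGet API
+// when `batched` is true and the status-vector overload otherwise.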
+std::vector<std::string> DBTestBase::MultiGet(std::vector<int> cfs,
+ const std::vector<std::string>& k,
+ const Snapshot* snapshot,
+ const bool batched) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ std::vector<ColumnFamilyHandle*> handles;
+ std::vector<Slice> keys;
+ std::vector<std::string> result;
+
+ for (unsigned int i = 0; i < cfs.size(); ++i) {
+ handles.push_back(handles_[cfs[i]]);
+ keys.push_back(k[i]);
+ }
+ std::vector<Status> s;
+ if (!batched) {
+ s = db_->MultiGet(options, handles, keys, &result);
+ for (unsigned int i = 0; i < s.size(); ++i) {
+ if (s[i].IsNotFound()) {
+ result[i] = "NOT_FOUND";
+ } else if (!s[i].ok()) {
+ result[i] = s[i].ToString();
+ }
+ }
+ } else {
+ std::vector<PinnableSlice> pin_values(cfs.size());
+ result.resize(cfs.size());
+ s.resize(cfs.size());
+ db_->MultiGet(options, cfs.size(), handles.data(), keys.data(),
+ pin_values.data(), s.data());
+ for (unsigned int i = 0; i < s.size(); ++i) {
+ if (s[i].IsNotFound()) {
+ result[i] = "NOT_FOUND";
+ } else if (!s[i].ok()) {
+ result[i] = s[i].ToString();
+ } else {
+ result[i].assign(pin_values[i].data(), pin_values[i].size());
+ }
+ }
+ }
+ return result;
+}
+
+std::vector<std::string> DBTestBase::MultiGet(const std::vector<std::string>& k,
+ const Snapshot* snapshot) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ std::vector<Slice> keys;
+ std::vector<std::string> result;
+ std::vector<Status> statuses(k.size());
+ std::vector<PinnableSlice> pin_values(k.size());
+
+ for (unsigned int i = 0; i < k.size(); ++i) {
+ keys.push_back(k[i]);
+ }
+ db_->MultiGet(options, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), pin_values.data(), statuses.data());
+ result.resize(k.size());
+ for (auto iter = result.begin(); iter != result.end(); ++iter) {
+ iter->assign(pin_values[iter - result.begin()].data(),
+ pin_values[iter - result.begin()].size());
+ }
+ for (unsigned int i = 0; i < statuses.size(); ++i) {
+ if (statuses[i].IsNotFound()) {
+ result[i] = "NOT_FOUND";
+ }
+ }
+ return result;
+}
+
+Status DBTestBase::Get(const std::string& k, PinnableSlice* v) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ Status s = dbfull()->Get(options, dbfull()->DefaultColumnFamily(), k, v);
+ return s;
+}
+
+uint64_t DBTestBase::GetNumSnapshots() {
+ uint64_t int_num;
+ EXPECT_TRUE(dbfull()->GetIntProperty("rocksdb.num-snapshots", &int_num));
+ return int_num;
+}
+
+uint64_t DBTestBase::GetTimeOldestSnapshots() {
+ uint64_t int_num;
+ EXPECT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.oldest-snapshot-time", &int_num));
+ return int_num;
+}
+
+uint64_t DBTestBase::GetSequenceOldestSnapshots() {
+ uint64_t int_num;
+ EXPECT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.oldest-snapshot-sequence", &int_num));
+ return int_num;
+}
+
+// Return a string that contains all key,value pairs in order,
+// formatted like "(k1->v1)(k2->v2)".
+std::string DBTestBase::Contents(int cf) {
+ std::vector<std::string> forward;
+ std::string result;
+ Iterator* iter = (cf == 0) ? db_->NewIterator(ReadOptions())
+ : db_->NewIterator(ReadOptions(), handles_[cf]);
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ std::string s = IterStatus(iter);
+ result.push_back('(');
+ result.append(s);
+ result.push_back(')');
+ forward.push_back(s);
+ }
+
+ // Check reverse iteration results are the reverse of forward results
+ unsigned int matched = 0;
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ EXPECT_LT(matched, forward.size());
+ EXPECT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]);
+ matched++;
+ }
+ EXPECT_EQ(matched, forward.size());
+
+ delete iter;
+ return result;
+}
+
+std::string DBTestBase::AllEntriesFor(const Slice& user_key, int cf) {
+ Arena arena;
+ auto options = CurrentOptions();
+ InternalKeyComparator icmp(options.comparator);
+ ReadRangeDelAggregator range_del_agg(&icmp,
+ kMaxSequenceNumber /* upper_bound */);
+ ScopedArenaIterator iter;
+ if (cf == 0) {
+ iter.set(dbfull()->NewInternalIterator(&arena, &range_del_agg,
+ kMaxSequenceNumber));
+ } else {
+ iter.set(dbfull()->NewInternalIterator(&arena, &range_del_agg,
+ kMaxSequenceNumber, handles_[cf]));
+ }
+ InternalKey target(user_key, kMaxSequenceNumber, kTypeValue);
+ iter->Seek(target.Encode());
+ std::string result;
+ if (!iter->status().ok()) {
+ result = iter->status().ToString();
+ } else {
+ result = "[ ";
+ bool first = true;
+ while (iter->Valid()) {
+ ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+ if (!ParseInternalKey(iter->key(), &ikey)) {
+ result += "CORRUPTED";
+ } else {
+ if (!last_options_.comparator->Equal(ikey.user_key, user_key)) {
+ break;
+ }
+ if (!first) {
+ result += ", ";
+ }
+ first = false;
+ switch (ikey.type) {
+ case kTypeValue:
+ result += iter->value().ToString();
+ break;
+ case kTypeMerge:
+ // keep it the same as kTypeValue for testing kMergePut
+ result += iter->value().ToString();
+ break;
+ case kTypeDeletion:
+ result += "DEL";
+ break;
+ case kTypeSingleDeletion:
+ result += "SDEL";
+ break;
+ default:
+ assert(false);
+ break;
+ }
+ }
+ iter->Next();
+ }
+ if (!first) {
+ result += " ";
+ }
+ result += "]";
+ }
+ return result;
+}
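+
+// Usage sketch (illustrative, with hypothetical keys/values): after
+//   ASSERT_OK(Put("foo", "v1"));
+//   ASSERT_OK(Delete("foo"));
+//   ASSERT_OK(Flush());
+//   ASSERT_OK(Put("foo", "v2"));
+// AllEntriesFor("foo") would typically return "[ v2, DEL, v1 ]", i.e. every
+// internal entry for the user key, newest first.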
+
+#ifndef ROCKSDB_LITE
+int DBTestBase::NumSortedRuns(int cf) {
+ ColumnFamilyMetaData cf_meta;
+ if (cf == 0) {
+ db_->GetColumnFamilyMetaData(&cf_meta);
+ } else {
+ db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
+ }
+ int num_sr = static_cast<int>(cf_meta.levels[0].files.size());
+ for (size_t i = 1U; i < cf_meta.levels.size(); i++) {
+ if (cf_meta.levels[i].files.size() > 0) {
+ num_sr++;
+ }
+ }
+ return num_sr;
+}
+
+uint64_t DBTestBase::TotalSize(int cf) {
+ ColumnFamilyMetaData cf_meta;
+ if (cf == 0) {
+ db_->GetColumnFamilyMetaData(&cf_meta);
+ } else {
+ db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
+ }
+ return cf_meta.size;
+}
+
+uint64_t DBTestBase::SizeAtLevel(int level) {
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ uint64_t sum = 0;
+ for (const auto& m : metadata) {
+ if (m.level == level) {
+ sum += m.size;
+ }
+ }
+ return sum;
+}
+
+size_t DBTestBase::TotalLiveFiles(int cf) {
+ ColumnFamilyMetaData cf_meta;
+ if (cf == 0) {
+ db_->GetColumnFamilyMetaData(&cf_meta);
+ } else {
+ db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
+ }
+ size_t num_files = 0;
+ for (auto& level : cf_meta.levels) {
+ num_files += level.files.size();
+ }
+ return num_files;
+}
+
+size_t DBTestBase::CountLiveFiles() {
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ return metadata.size();
+}
+
+int DBTestBase::NumTableFilesAtLevel(int level, int cf) {
+ std::string property;
+ if (cf == 0) {
+ // default cfd
+ EXPECT_TRUE(db_->GetProperty(
+ "rocksdb.num-files-at-level" + NumberToString(level), &property));
+ } else {
+ EXPECT_TRUE(db_->GetProperty(
+ handles_[cf], "rocksdb.num-files-at-level" + NumberToString(level),
+ &property));
+ }
+ return atoi(property.c_str());
+}
+
+double DBTestBase::CompressionRatioAtLevel(int level, int cf) {
+ std::string property;
+ if (cf == 0) {
+ // default cfd
+ EXPECT_TRUE(db_->GetProperty(
+ "rocksdb.compression-ratio-at-level" + NumberToString(level),
+ &property));
+ } else {
+ EXPECT_TRUE(db_->GetProperty(
+ handles_[cf],
+ "rocksdb.compression-ratio-at-level" + NumberToString(level),
+ &property));
+ }
+ return std::stod(property);
+}
+
+int DBTestBase::TotalTableFiles(int cf, int levels) {
+ if (levels == -1) {
+ levels = (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]);
+ }
+ int result = 0;
+ for (int level = 0; level < levels; level++) {
+ result += NumTableFilesAtLevel(level, cf);
+ }
+ return result;
+}
+
+// Return spread of files per level
+std::string DBTestBase::FilesPerLevel(int cf) {
+ int num_levels =
+ (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]);
+ std::string result;
+ size_t last_non_zero_offset = 0;
+ for (int level = 0; level < num_levels; level++) {
+ int f = NumTableFilesAtLevel(level, cf);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+ result += buf;
+ if (f > 0) {
+ last_non_zero_offset = result.size();
+ }
+ }
+ result.resize(last_non_zero_offset);
+ return result;
+}
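+
+// Example of the format produced above: with 2 files at L0, none at L1 and
+// one file at L2, FilesPerLevel() returns "2,0,1"; trailing levels that hold
+// no files are trimmed from the string.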
+#endif // !ROCKSDB_LITE
+
+size_t DBTestBase::CountFiles() {
+ std::vector<std::string> files;
+ env_->GetChildren(dbname_, &files);
+
+ std::vector<std::string> logfiles;
+ if (dbname_ != last_options_.wal_dir) {
+ env_->GetChildren(last_options_.wal_dir, &logfiles);
+ }
+
+ return files.size() + logfiles.size();
+}
+
+uint64_t DBTestBase::Size(const Slice& start, const Slice& limit, int cf) {
+ Range r(start, limit);
+ uint64_t size;
+ if (cf == 0) {
+ db_->GetApproximateSizes(&r, 1, &size);
+ } else {
+ db_->GetApproximateSizes(handles_[1], &r, 1, &size);
+ }
+ return size;
+}
+
+void DBTestBase::Compact(int cf, const Slice& start, const Slice& limit,
+ uint32_t target_path_id) {
+ CompactRangeOptions compact_options;
+ compact_options.target_path_id = target_path_id;
+ ASSERT_OK(db_->CompactRange(compact_options, handles_[cf], &start, &limit));
+}
+
+void DBTestBase::Compact(int cf, const Slice& start, const Slice& limit) {
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), handles_[cf], &start, &limit));
+}
+
+void DBTestBase::Compact(const Slice& start, const Slice& limit) {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &limit));
+}
+
+// Do n memtable compactions, each of which produces an sstable
+// covering the range [small,large].
+void DBTestBase::MakeTables(int n, const std::string& small,
+ const std::string& large, int cf) {
+ for (int i = 0; i < n; i++) {
+ ASSERT_OK(Put(cf, small, "begin"));
+ ASSERT_OK(Put(cf, large, "end"));
+ ASSERT_OK(Flush(cf));
+ MoveFilesToLevel(n - i - 1, cf);
+ }
+}
+
+// Prevent pushing of new sstables into deeper levels by adding
+// tables that cover a specified range to all levels.
+void DBTestBase::FillLevels(const std::string& smallest,
+ const std::string& largest, int cf) {
+ MakeTables(db_->NumberLevels(handles_[cf]), smallest, largest, cf);
+}
+
+void DBTestBase::MoveFilesToLevel(int level, int cf) {
+ for (int l = 0; l < level; ++l) {
+ if (cf > 0) {
+ dbfull()->TEST_CompactRange(l, nullptr, nullptr, handles_[cf]);
+ } else {
+ dbfull()->TEST_CompactRange(l, nullptr, nullptr);
+ }
+ }
+}
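+
+// Usage sketch (illustrative): the helpers above are typically combined to
+// shape the LSM tree before exercising a compaction path, e.g.
+//
+//   ASSERT_OK(Put("key", "value"));
+//   ASSERT_OK(Flush());
+//   MoveFilesToLevel(2);  // push the freshly flushed L0 file down to L2
+//
+// FillLevels(smallest, largest, cf) instead seeds every level of the given
+// column family with one covering file, which pins later sstables at L0.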
+
+#ifndef ROCKSDB_LITE
+void DBTestBase::DumpFileCounts(const char* label) {
+ fprintf(stderr, "---\n%s:\n", label);
+ fprintf(stderr, "maxoverlap: %" PRIu64 "\n",
+ dbfull()->TEST_MaxNextLevelOverlappingBytes());
+ for (int level = 0; level < db_->NumberLevels(); level++) {
+ int num = NumTableFilesAtLevel(level);
+ if (num > 0) {
+ fprintf(stderr, " level %3d : %d files\n", level, num);
+ }
+ }
+}
+#endif // !ROCKSDB_LITE
+
+std::string DBTestBase::DumpSSTableList() {
+ std::string property;
+ db_->GetProperty("rocksdb.sstables", &property);
+ return property;
+}
+
+void DBTestBase::GetSstFiles(Env* env, std::string path,
+ std::vector<std::string>* files) {
+ env->GetChildren(path, files);
+
+ files->erase(
+ std::remove_if(files->begin(), files->end(), [](std::string name) {
+ uint64_t number;
+ FileType type;
+ return !(ParseFileName(name, &number, &type) && type == kTableFile);
+ }), files->end());
+}
+
+int DBTestBase::GetSstFileCount(std::string path) {
+ std::vector<std::string> files;
+ DBTestBase::GetSstFiles(env_, path, &files);
+ return static_cast<int>(files.size());
+}
+
+// this will generate non-overlapping files since it keeps increasing key_idx
+void DBTestBase::GenerateNewFile(int cf, Random* rnd, int* key_idx,
+ bool nowait) {
+ for (int i = 0; i < KNumKeysByGenerateNewFile; i++) {
+ ASSERT_OK(Put(cf, Key(*key_idx), RandomString(rnd, (i == 99) ? 1 : 990)));
+ (*key_idx)++;
+ }
+ if (!nowait) {
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ }
+}
+
+// this will generate non-overlapping files since it keeps increasing key_idx
+void DBTestBase::GenerateNewFile(Random* rnd, int* key_idx, bool nowait) {
+ for (int i = 0; i < KNumKeysByGenerateNewFile; i++) {
+ ASSERT_OK(Put(Key(*key_idx), RandomString(rnd, (i == 99) ? 1 : 990)));
+ (*key_idx)++;
+ }
+ if (!nowait) {
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ }
+}
+
+const int DBTestBase::kNumKeysByGenerateNewRandomFile = 51;
+
+void DBTestBase::GenerateNewRandomFile(Random* rnd, bool nowait) {
+ for (int i = 0; i < kNumKeysByGenerateNewRandomFile; i++) {
+ ASSERT_OK(Put("key" + RandomString(rnd, 7), RandomString(rnd, 2000)));
+ }
+ ASSERT_OK(Put("key" + RandomString(rnd, 7), RandomString(rnd, 200)));
+ if (!nowait) {
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ }
+}
+
+std::string DBTestBase::IterStatus(Iterator* iter) {
+ std::string result;
+ if (iter->Valid()) {
+ result = iter->key().ToString() + "->" + iter->value().ToString();
+ } else {
+ result = "(invalid)";
+ }
+ return result;
+}
+
+Options DBTestBase::OptionsForLogIterTest() {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.WAL_ttl_seconds = 1000;
+ return options;
+}
+
+std::string DBTestBase::DummyString(size_t len, char c) {
+ return std::string(len, c);
+}
+
+void DBTestBase::VerifyIterLast(std::string expected_key, int cf) {
+ Iterator* iter;
+ ReadOptions ro;
+ if (cf == 0) {
+ iter = db_->NewIterator(ro);
+ } else {
+ iter = db_->NewIterator(ro, handles_[cf]);
+ }
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), expected_key);
+ delete iter;
+}
+
+// Used to test InplaceUpdate
+
+// If the previous value is nullptr, sets newValue to a string of delta's
+// size (filled with 'c') and returns UPDATED.
+// Otherwise, shrinks the previous value in place to a string of 'b's that is
+// one byte shorter than before and returns UPDATED_INPLACE.
+UpdateStatus DBTestBase::updateInPlaceSmallerSize(char* prevValue,
+ uint32_t* prevSize,
+ Slice delta,
+ std::string* newValue) {
+ if (prevValue == nullptr) {
+ *newValue = std::string(delta.size(), 'c');
+ return UpdateStatus::UPDATED;
+ } else {
+ *prevSize = *prevSize - 1;
+ std::string str_b = std::string(*prevSize, 'b');
+ memcpy(prevValue, str_b.c_str(), str_b.size());
+ return UpdateStatus::UPDATED_INPLACE;
+ }
+}
+
+UpdateStatus DBTestBase::updateInPlaceSmallerVarintSize(char* prevValue,
+ uint32_t* prevSize,
+ Slice delta,
+ std::string* newValue) {
+ if (prevValue == nullptr) {
+ *newValue = std::string(delta.size(), 'c');
+ return UpdateStatus::UPDATED;
+ } else {
+ *prevSize = 1;
+ std::string str_b = std::string(*prevSize, 'b');
+ memcpy(prevValue, str_b.c_str(), str_b.size());
+ return UpdateStatus::UPDATED_INPLACE;
+ }
+}
+
+UpdateStatus DBTestBase::updateInPlaceLargerSize(char* /*prevValue*/,
+ uint32_t* /*prevSize*/,
+ Slice delta,
+ std::string* newValue) {
+ *newValue = std::string(delta.size(), 'c');
+ return UpdateStatus::UPDATED;
+}
+
+UpdateStatus DBTestBase::updateInPlaceNoAction(char* /*prevValue*/,
+ uint32_t* /*prevSize*/,
+ Slice /*delta*/,
+ std::string* /*newValue*/) {
+ return UpdateStatus::UPDATE_FAILED;
+}
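+
+// Wiring sketch (a minimal example, assuming the standard
+// Options::inplace_callback hook): tests install one of the callbacks above
+// roughly like this.
+//
+//   Options options = CurrentOptions();
+//   options.inplace_update_support = true;
+//   // in-place updates require the non-concurrent memtable write path
+//   options.allow_concurrent_memtable_write = false;
+//   options.inplace_callback = DBTestBase::updateInPlaceSmallerSize;
+//   DestroyAndReopen(options);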
+
+// Utility method to test InplaceUpdate
+void DBTestBase::validateNumberOfEntries(int numValues, int cf) {
+ Arena arena;
+ auto options = CurrentOptions();
+ InternalKeyComparator icmp(options.comparator);
+ ReadRangeDelAggregator range_del_agg(&icmp,
+ kMaxSequenceNumber /* upper_bound */);
+ // This should be defined after range_del_agg so that the assigned iterator
+ // is destructed before range_del_agg is destructed.
+ ScopedArenaIterator iter;
+ if (cf != 0) {
+ iter.set(dbfull()->NewInternalIterator(&arena, &range_del_agg,
+ kMaxSequenceNumber, handles_[cf]));
+ } else {
+ iter.set(dbfull()->NewInternalIterator(&arena, &range_del_agg,
+ kMaxSequenceNumber));
+ }
+ iter->SeekToFirst();
+ ASSERT_EQ(iter->status().ok(), true);
+ int seq = numValues;
+ while (iter->Valid()) {
+ ParsedInternalKey ikey;
+ ikey.clear();
+ ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+
+ // checks sequence number for updates
+ ASSERT_EQ(ikey.sequence, (unsigned)seq--);
+ iter->Next();
+ }
+ ASSERT_EQ(0, seq);
+}
+
+void DBTestBase::CopyFile(const std::string& source,
+ const std::string& destination, uint64_t size) {
+ const EnvOptions soptions;
+ std::unique_ptr<SequentialFile> srcfile;
+ ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions));
+ std::unique_ptr<WritableFile> destfile;
+ ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions));
+
+ if (size == 0) {
+ // default argument means copy everything
+ ASSERT_OK(env_->GetFileSize(source, &size));
+ }
+
+ char buffer[4096];
+ Slice slice;
+ while (size > 0) {
+ uint64_t one = std::min(uint64_t(sizeof(buffer)), size);
+ ASSERT_OK(srcfile->Read(one, &slice, buffer));
+ ASSERT_OK(destfile->Append(slice));
+ size -= slice.size();
+ }
+ ASSERT_OK(destfile->Close());
+}
+
+std::unordered_map<std::string, uint64_t> DBTestBase::GetAllSSTFiles(
+ uint64_t* total_size) {
+ std::unordered_map<std::string, uint64_t> res;
+
+ if (total_size) {
+ *total_size = 0;
+ }
+ std::vector<std::string> files;
+ env_->GetChildren(dbname_, &files);
+ for (auto& file_name : files) {
+ uint64_t number;
+ FileType type;
+ std::string file_path = dbname_ + "/" + file_name;
+ if (ParseFileName(file_name, &number, &type) && type == kTableFile) {
+ uint64_t file_size = 0;
+ env_->GetFileSize(file_path, &file_size);
+ res[file_path] = file_size;
+ if (total_size) {
+ *total_size += file_size;
+ }
+ }
+ }
+ return res;
+}
+
+std::vector<std::uint64_t> DBTestBase::ListTableFiles(Env* env,
+ const std::string& path) {
+ std::vector<std::string> files;
+ std::vector<uint64_t> file_numbers;
+ env->GetChildren(path, &files);
+ uint64_t number;
+ FileType type;
+ for (size_t i = 0; i < files.size(); ++i) {
+ if (ParseFileName(files[i], &number, &type)) {
+ if (type == kTableFile) {
+ file_numbers.push_back(number);
+ }
+ }
+ }
+ return file_numbers;
+}
+
+void DBTestBase::VerifyDBFromMap(std::map<std::string, std::string> true_data,
+ size_t* total_reads_res, bool tailing_iter,
+ std::map<std::string, Status> status) {
+ size_t total_reads = 0;
+
+ for (auto& kv : true_data) {
+ Status s = status[kv.first];
+ if (s.ok()) {
+ ASSERT_EQ(Get(kv.first), kv.second);
+ } else {
+ std::string value;
+ ASSERT_EQ(s, db_->Get(ReadOptions(), kv.first, &value));
+ }
+ total_reads++;
+ }
+
+ // Normal Iterator
+ {
+ int iter_cnt = 0;
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ Iterator* iter = db_->NewIterator(ro);
+ // Verify Iterator::Next()
+ iter_cnt = 0;
+ auto data_iter = true_data.begin();
+ Status s;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next(), data_iter++) {
+ ASSERT_EQ(iter->key().ToString(), data_iter->first);
+ Status current_status = status[data_iter->first];
+ if (!current_status.ok()) {
+ s = current_status;
+ }
+ ASSERT_EQ(iter->status(), s);
+ if (current_status.ok()) {
+ ASSERT_EQ(iter->value().ToString(), data_iter->second);
+ }
+ iter_cnt++;
+ total_reads++;
+ }
+ ASSERT_EQ(data_iter, true_data.end()) << iter_cnt << " / "
+ << true_data.size();
+ delete iter;
+
+ // Verify Iterator::Prev()
+ // Use a new iterator to make sure its status is clean.
+ iter = db_->NewIterator(ro);
+ iter_cnt = 0;
+ s = Status::OK();
+ auto data_rev = true_data.rbegin();
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev(), data_rev++) {
+ ASSERT_EQ(iter->key().ToString(), data_rev->first);
+ Status current_status = status[data_rev->first];
+ if (!current_status.ok()) {
+ s = current_status;
+ }
+ ASSERT_EQ(iter->status(), s);
+ if (current_status.ok()) {
+ ASSERT_EQ(iter->value().ToString(), data_rev->second);
+ }
+ iter_cnt++;
+ total_reads++;
+ }
+ ASSERT_EQ(data_rev, true_data.rend()) << iter_cnt << " / "
+ << true_data.size();
+
+ // Verify Iterator::Seek()
+ for (auto kv : true_data) {
+ iter->Seek(kv.first);
+ ASSERT_EQ(kv.first, iter->key().ToString());
+ ASSERT_EQ(kv.second, iter->value().ToString());
+ total_reads++;
+ }
+ delete iter;
+ }
+
+ if (tailing_iter) {
+#ifndef ROCKSDB_LITE
+ // Tailing iterator
+ int iter_cnt = 0;
+ ReadOptions ro;
+ ro.tailing = true;
+ ro.total_order_seek = true;
+ Iterator* iter = db_->NewIterator(ro);
+
+ // Verify ForwardIterator::Next()
+ iter_cnt = 0;
+ auto data_iter = true_data.begin();
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next(), data_iter++) {
+ ASSERT_EQ(iter->key().ToString(), data_iter->first);
+ ASSERT_EQ(iter->value().ToString(), data_iter->second);
+ iter_cnt++;
+ total_reads++;
+ }
+ ASSERT_EQ(data_iter, true_data.end()) << iter_cnt << " / "
+ << true_data.size();
+
+ // Verify ForwardIterator::Seek()
+ for (auto kv : true_data) {
+ iter->Seek(kv.first);
+ ASSERT_EQ(kv.first, iter->key().ToString());
+ ASSERT_EQ(kv.second, iter->value().ToString());
+ total_reads++;
+ }
+
+ delete iter;
+#endif // ROCKSDB_LITE
+ }
+
+ if (total_reads_res) {
+ *total_reads_res = total_reads;
+ }
+}
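+
+// Usage sketch (illustrative): tests build the expected view alongside their
+// writes and then let VerifyDBFromMap() cross-check Get(), forward/backward
+// iteration and Seek() in a single call.
+//
+//   Random rnd(301);
+//   std::map<std::string, std::string> true_data;
+//   for (int i = 0; i < 100; i++) {
+//     std::string k = Key(i);
+//     std::string v = RandomString(&rnd, 100);
+//     ASSERT_OK(Put(k, v));
+//     true_data[k] = v;
+//   }
+//   size_t total_reads;
+//   VerifyDBFromMap(true_data, &total_reads);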
+
+void DBTestBase::VerifyDBInternal(
+ std::vector<std::pair<std::string, std::string>> true_data) {
+ Arena arena;
+ InternalKeyComparator icmp(last_options_.comparator);
+ ReadRangeDelAggregator range_del_agg(&icmp,
+ kMaxSequenceNumber /* upper_bound */);
+ auto iter =
+ dbfull()->NewInternalIterator(&arena, &range_del_agg, kMaxSequenceNumber);
+ iter->SeekToFirst();
+ for (auto p : true_data) {
+ ASSERT_TRUE(iter->Valid());
+ ParsedInternalKey ikey;
+ ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey));
+ ASSERT_EQ(p.first, ikey.user_key);
+ ASSERT_EQ(p.second, iter->value());
+ iter->Next();
+ };
+ ASSERT_FALSE(iter->Valid());
+ iter->~InternalIterator();
+}
+
+#ifndef ROCKSDB_LITE
+
+uint64_t DBTestBase::GetNumberOfSstFilesForColumnFamily(
+ DB* db, std::string column_family_name) {
+ std::vector<LiveFileMetaData> metadata;
+ db->GetLiveFilesMetaData(&metadata);
+ uint64_t result = 0;
+ for (auto& fileMetadata : metadata) {
+ result += (fileMetadata.column_family_name == column_family_name);
+ }
+ return result;
+}
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_test_util.h b/src/rocksdb/db/db_test_util.h
new file mode 100644
index 000000000..eeabea9bd
--- /dev/null
+++ b/src/rocksdb/db/db_test_util.h
@@ -0,0 +1,1000 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <fcntl.h>
+#include <cinttypes>
+
+#include <algorithm>
+#include <map>
+#include <set>
+#include <string>
+#include <thread>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "memtable/hash_linklist_rep.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/sst_file_writer.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/mock_table.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/scoped_arena_iterator.h"
+#include "test_util/mock_time_env.h"
+#include "util/compression.h"
+#include "util/mutexlock.h"
+
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace anon {
+class AtomicCounter {
+ public:
+ explicit AtomicCounter(Env* env = NULL)
+ : env_(env), cond_count_(&mu_), count_(0) {}
+
+ void Increment() {
+ MutexLock l(&mu_);
+ count_++;
+ cond_count_.SignalAll();
+ }
+
+ int Read() {
+ MutexLock l(&mu_);
+ return count_;
+ }
+
+ bool WaitFor(int count) {
+ MutexLock l(&mu_);
+
+ uint64_t start = env_->NowMicros();
+ while (count_ < count) {
+ uint64_t now = env_->NowMicros();
+ cond_count_.TimedWait(now + /*1s*/ 1 * 1000 * 1000);
+ if (env_->NowMicros() - start > /*10s*/ 10 * 1000 * 1000) {
+ return false;
+ }
+ if (count_ < count) {
+ GTEST_LOG_(WARNING) << "WaitFor is taking more time than usual";
+ }
+ }
+
+ return true;
+ }
+
+ void Reset() {
+ MutexLock l(&mu_);
+ count_ = 0;
+ cond_count_.SignalAll();
+ }
+
+ private:
+ Env* env_;
+ port::Mutex mu_;
+ port::CondVar cond_count_;
+ int count_;
+};
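+
+// Usage sketch (illustrative): SpecialEnv below exposes several of these
+// counters; a test typically enables counting before (re)opening the DB,
+// resets the counter, runs the operation under test and reads it back.
+//
+//   env_->count_random_reads_ = true;
+//   Reopen(CurrentOptions());
+//   env_->random_read_counter_.Reset();
+//   ASSERT_EQ("value", Get("key"));  // hypothetical key/value
+//   int reads = env_->random_read_counter_.Read();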
+
+struct OptionsOverride {
+ std::shared_ptr<const FilterPolicy> filter_policy = nullptr;
+ // These will be used only if filter_policy is set
+ bool partition_filters = false;
+ uint64_t metadata_block_size = 1024;
+
+ // Used as a bit mask of SkipPolicy values indicating which XF test points
+ // to skip.
+ int skip_policy = 0;
+};
+
+} // namespace anon
+
+enum SkipPolicy { kSkipNone = 0, kSkipNoSnapshot = 1, kSkipNoPrefix = 2 };
+
+// A hacky skip list mem table that triggers flush after a given number of
+// entries.
+class SpecialMemTableRep : public MemTableRep {
+ public:
+ explicit SpecialMemTableRep(Allocator* allocator, MemTableRep* memtable,
+ int num_entries_flush)
+ : MemTableRep(allocator),
+ memtable_(memtable),
+ num_entries_flush_(num_entries_flush),
+ num_entries_(0) {}
+
+ virtual KeyHandle Allocate(const size_t len, char** buf) override {
+ return memtable_->Allocate(len, buf);
+ }
+
+ // Insert key into the list.
+ // REQUIRES: nothing that compares equal to key is currently in the list.
+ virtual void Insert(KeyHandle handle) override {
+ num_entries_++;
+ memtable_->Insert(handle);
+ }
+
+ void InsertConcurrently(KeyHandle handle) override {
+ num_entries_++;
+ memtable_->Insert(handle);
+ }
+
+ // Returns true iff an entry that compares equal to key is in the list.
+ virtual bool Contains(const char* key) const override {
+ return memtable_->Contains(key);
+ }
+
+ virtual size_t ApproximateMemoryUsage() override {
+ // Return a high memory usage when number of entries exceeds the threshold
+ // to trigger a flush.
+ return (num_entries_ < num_entries_flush_) ? 0 : 1024 * 1024 * 1024;
+ }
+
+ virtual void Get(const LookupKey& k, void* callback_args,
+ bool (*callback_func)(void* arg,
+ const char* entry)) override {
+ memtable_->Get(k, callback_args, callback_func);
+ }
+
+ uint64_t ApproximateNumEntries(const Slice& start_ikey,
+ const Slice& end_ikey) override {
+ return memtable_->ApproximateNumEntries(start_ikey, end_ikey);
+ }
+
+ virtual MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override {
+ return memtable_->GetIterator(arena);
+ }
+
+ virtual ~SpecialMemTableRep() override {}
+
+ private:
+ std::unique_ptr<MemTableRep> memtable_;
+ int num_entries_flush_;
+ int num_entries_;
+};
+
+// The factory for the hacky skip list mem table that triggers flush after the
+// number of entries exceeds a threshold.
+class SpecialSkipListFactory : public MemTableRepFactory {
+ public:
+ // After the number of inserts in a mem table exceeds `num_entries_flush`,
+ // trigger a flush.
+ explicit SpecialSkipListFactory(int num_entries_flush)
+ : num_entries_flush_(num_entries_flush) {}
+
+ using MemTableRepFactory::CreateMemTableRep;
+ virtual MemTableRep* CreateMemTableRep(
+ const MemTableRep::KeyComparator& compare, Allocator* allocator,
+ const SliceTransform* transform, Logger* /*logger*/) override {
+ return new SpecialMemTableRep(
+ allocator, factory_.CreateMemTableRep(compare, allocator, transform, 0),
+ num_entries_flush_);
+ }
+ virtual const char* Name() const override { return "SkipListFactory"; }
+
+ bool IsInsertConcurrentlySupported() const override {
+ return factory_.IsInsertConcurrentlySupported();
+ }
+
+ private:
+ SkipListFactory factory_;
+ int num_entries_flush_;
+};
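+
+// Usage sketch (illustrative): installing this factory makes every memtable
+// report a huge ApproximateMemoryUsage() once it holds N entries, so a flush
+// is triggered once a memtable holds N entries.
+//
+//   Options options = CurrentOptions();
+//   options.memtable_factory.reset(new SpecialSkipListFactory(3));
+//   DestroyAndReopen(options);
+//   // every third Put() now produces a new L0 file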
+
+// Special Env used to delay background operations
+class SpecialEnv : public EnvWrapper {
+ public:
+ explicit SpecialEnv(Env* base);
+
+ Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r,
+ const EnvOptions& soptions) override {
+ class SSTableFile : public WritableFile {
+ private:
+ SpecialEnv* env_;
+ std::unique_ptr<WritableFile> base_;
+
+ public:
+ SSTableFile(SpecialEnv* env, std::unique_ptr<WritableFile>&& base)
+ : env_(env), base_(std::move(base)) {}
+ Status Append(const Slice& data) override {
+ if (env_->table_write_callback_) {
+ (*env_->table_write_callback_)();
+ }
+ if (env_->drop_writes_.load(std::memory_order_acquire)) {
+ // Drop writes on the floor
+ return Status::OK();
+ } else if (env_->no_space_.load(std::memory_order_acquire)) {
+ return Status::NoSpace("No space left on device");
+ } else {
+ env_->bytes_written_ += data.size();
+ return base_->Append(data);
+ }
+ }
+ Status PositionedAppend(const Slice& data, uint64_t offset) override {
+ if (env_->table_write_callback_) {
+ (*env_->table_write_callback_)();
+ }
+ if (env_->drop_writes_.load(std::memory_order_acquire)) {
+ // Drop writes on the floor
+ return Status::OK();
+ } else if (env_->no_space_.load(std::memory_order_acquire)) {
+ return Status::NoSpace("No space left on device");
+ } else {
+ env_->bytes_written_ += data.size();
+ return base_->PositionedAppend(data, offset);
+ }
+ }
+ Status Truncate(uint64_t size) override { return base_->Truncate(size); }
+ Status RangeSync(uint64_t offset, uint64_t nbytes) override {
+ Status s = base_->RangeSync(offset, nbytes);
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ TEST_SYNC_POINT_CALLBACK("SpecialEnv::SStableFile::RangeSync", &s);
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+ return s;
+ }
+ Status Close() override {
+// SyncPoint is not supported in Released Windows Mode.
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ // Check preallocation size
+ // preallocation size is never passed to base file.
+ size_t preallocation_size = preallocation_block_size();
+ TEST_SYNC_POINT_CALLBACK("DBTestWritableFile.GetPreallocationStatus",
+ &preallocation_size);
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+ Status s = base_->Close();
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ TEST_SYNC_POINT_CALLBACK("SpecialEnv::SStableFile::Close", &s);
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+ return s;
+ }
+ Status Flush() override { return base_->Flush(); }
+ Status Sync() override {
+ ++env_->sync_counter_;
+ while (env_->delay_sstable_sync_.load(std::memory_order_acquire)) {
+ env_->SleepForMicroseconds(100000);
+ }
+ Status s = base_->Sync();
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ TEST_SYNC_POINT_CALLBACK("SpecialEnv::SStableFile::Sync", &s);
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+ return s;
+ }
+ void SetIOPriority(Env::IOPriority pri) override {
+ base_->SetIOPriority(pri);
+ }
+ Env::IOPriority GetIOPriority() override {
+ return base_->GetIOPriority();
+ }
+ bool use_direct_io() const override {
+ return base_->use_direct_io();
+ }
+ Status Allocate(uint64_t offset, uint64_t len) override {
+ return base_->Allocate(offset, len);
+ }
+ };
+ class ManifestFile : public WritableFile {
+ public:
+ ManifestFile(SpecialEnv* env, std::unique_ptr<WritableFile>&& b)
+ : env_(env), base_(std::move(b)) {}
+ Status Append(const Slice& data) override {
+ if (env_->manifest_write_error_.load(std::memory_order_acquire)) {
+ return Status::IOError("simulated writer error");
+ } else {
+ return base_->Append(data);
+ }
+ }
+ Status Truncate(uint64_t size) override { return base_->Truncate(size); }
+ Status Close() override { return base_->Close(); }
+ Status Flush() override { return base_->Flush(); }
+ Status Sync() override {
+ ++env_->sync_counter_;
+ if (env_->manifest_sync_error_.load(std::memory_order_acquire)) {
+ return Status::IOError("simulated sync error");
+ } else {
+ return base_->Sync();
+ }
+ }
+ uint64_t GetFileSize() override { return base_->GetFileSize(); }
+ Status Allocate(uint64_t offset, uint64_t len) override {
+ return base_->Allocate(offset, len);
+ }
+
+ private:
+ SpecialEnv* env_;
+ std::unique_ptr<WritableFile> base_;
+ };
+ class WalFile : public WritableFile {
+ public:
+ WalFile(SpecialEnv* env, std::unique_ptr<WritableFile>&& b)
+ : env_(env), base_(std::move(b)) {
+ env_->num_open_wal_file_.fetch_add(1);
+ }
+ virtual ~WalFile() { env_->num_open_wal_file_.fetch_add(-1); }
+ Status Append(const Slice& data) override {
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ TEST_SYNC_POINT("SpecialEnv::WalFile::Append:1");
+#endif
+ Status s;
+ if (env_->log_write_error_.load(std::memory_order_acquire)) {
+ s = Status::IOError("simulated writer error");
+ } else {
+ int slowdown =
+ env_->log_write_slowdown_.load(std::memory_order_acquire);
+ if (slowdown > 0) {
+ env_->SleepForMicroseconds(slowdown);
+ }
+ s = base_->Append(data);
+ }
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ TEST_SYNC_POINT("SpecialEnv::WalFile::Append:2");
+#endif
+ return s;
+ }
+ Status Truncate(uint64_t size) override { return base_->Truncate(size); }
+ Status Close() override {
+// SyncPoint is not supported in Released Windows Mode.
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ // Check preallocation size
+ // preallocation size is never passed to base file.
+ size_t preallocation_size = preallocation_block_size();
+ TEST_SYNC_POINT_CALLBACK("DBTestWalFile.GetPreallocationStatus",
+ &preallocation_size);
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+
+ return base_->Close();
+ }
+ Status Flush() override { return base_->Flush(); }
+ Status Sync() override {
+ ++env_->sync_counter_;
+ return base_->Sync();
+ }
+ bool IsSyncThreadSafe() const override {
+ return env_->is_wal_sync_thread_safe_.load();
+ }
+ Status Allocate(uint64_t offset, uint64_t len) override {
+ return base_->Allocate(offset, len);
+ }
+
+ private:
+ SpecialEnv* env_;
+ std::unique_ptr<WritableFile> base_;
+ };
+
+ if (non_writeable_rate_.load(std::memory_order_acquire) > 0) {
+ uint32_t random_number;
+ {
+ MutexLock l(&rnd_mutex_);
+ random_number = rnd_.Uniform(100);
+ }
+ if (random_number < non_writeable_rate_.load()) {
+ return Status::IOError("simulated random write error");
+ }
+ }
+
+ new_writable_count_++;
+
+ if (non_writable_count_.load() > 0) {
+ non_writable_count_--;
+ return Status::IOError("simulated write error");
+ }
+
+ EnvOptions optimized = soptions;
+ if (strstr(f.c_str(), "MANIFEST") != nullptr ||
+ strstr(f.c_str(), "log") != nullptr) {
+ optimized.use_mmap_writes = false;
+ optimized.use_direct_writes = false;
+ }
+
+ Status s = target()->NewWritableFile(f, r, optimized);
+ if (s.ok()) {
+ if (strstr(f.c_str(), ".sst") != nullptr) {
+ r->reset(new SSTableFile(this, std::move(*r)));
+ } else if (strstr(f.c_str(), "MANIFEST") != nullptr) {
+ r->reset(new ManifestFile(this, std::move(*r)));
+ } else if (strstr(f.c_str(), "log") != nullptr) {
+ r->reset(new WalFile(this, std::move(*r)));
+ }
+ }
+ return s;
+ }
+
+ Status NewRandomAccessFile(const std::string& f,
+ std::unique_ptr<RandomAccessFile>* r,
+ const EnvOptions& soptions) override {
+ class CountingFile : public RandomAccessFile {
+ public:
+ CountingFile(std::unique_ptr<RandomAccessFile>&& target,
+ anon::AtomicCounter* counter,
+ std::atomic<size_t>* bytes_read)
+ : target_(std::move(target)),
+ counter_(counter),
+ bytes_read_(bytes_read) {}
+ virtual Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const override {
+ counter_->Increment();
+ Status s = target_->Read(offset, n, result, scratch);
+ *bytes_read_ += result->size();
+ return s;
+ }
+
+ virtual Status Prefetch(uint64_t offset, size_t n) override {
+ Status s = target_->Prefetch(offset, n);
+ *bytes_read_ += n;
+ return s;
+ }
+
+ private:
+ std::unique_ptr<RandomAccessFile> target_;
+ anon::AtomicCounter* counter_;
+ std::atomic<size_t>* bytes_read_;
+ };
+
+ Status s = target()->NewRandomAccessFile(f, r, soptions);
+ random_file_open_counter_++;
+ if (s.ok() && count_random_reads_) {
+ r->reset(new CountingFile(std::move(*r), &random_read_counter_,
+ &random_read_bytes_counter_));
+ }
+ if (s.ok() && soptions.compaction_readahead_size > 0) {
+ compaction_readahead_size_ = soptions.compaction_readahead_size;
+ }
+ return s;
+ }
+
+ virtual Status NewSequentialFile(const std::string& f,
+ std::unique_ptr<SequentialFile>* r,
+ const EnvOptions& soptions) override {
+ class CountingFile : public SequentialFile {
+ public:
+ CountingFile(std::unique_ptr<SequentialFile>&& target,
+ anon::AtomicCounter* counter)
+ : target_(std::move(target)), counter_(counter) {}
+ virtual Status Read(size_t n, Slice* result, char* scratch) override {
+ counter_->Increment();
+ return target_->Read(n, result, scratch);
+ }
+ virtual Status Skip(uint64_t n) override { return target_->Skip(n); }
+
+ private:
+ std::unique_ptr<SequentialFile> target_;
+ anon::AtomicCounter* counter_;
+ };
+
+ Status s = target()->NewSequentialFile(f, r, soptions);
+ if (s.ok() && count_sequential_reads_) {
+ r->reset(new CountingFile(std::move(*r), &sequential_read_counter_));
+ }
+ return s;
+ }
+
+ virtual void SleepForMicroseconds(int micros) override {
+ sleep_counter_.Increment();
+ if (no_slowdown_ || time_elapse_only_sleep_) {
+ addon_time_.fetch_add(micros);
+ }
+ if (!no_slowdown_) {
+ target()->SleepForMicroseconds(micros);
+ }
+ }
+
+ virtual Status GetCurrentTime(int64_t* unix_time) override {
+ Status s;
+ if (!time_elapse_only_sleep_) {
+ s = target()->GetCurrentTime(unix_time);
+ }
+ if (s.ok()) {
+ *unix_time += addon_time_.load();
+ }
+ return s;
+ }
+
+ virtual uint64_t NowCPUNanos() override {
+ now_cpu_count_.fetch_add(1);
+ return target()->NowCPUNanos();
+ }
+
+ virtual uint64_t NowNanos() override {
+ return (time_elapse_only_sleep_ ? 0 : target()->NowNanos()) +
+ addon_time_.load() * 1000;
+ }
+
+ virtual uint64_t NowMicros() override {
+ return (time_elapse_only_sleep_ ? 0 : target()->NowMicros()) +
+ addon_time_.load();
+ }
+
+ virtual Status DeleteFile(const std::string& fname) override {
+ delete_count_.fetch_add(1);
+ return target()->DeleteFile(fname);
+ }
+
+ Random rnd_;
+ port::Mutex rnd_mutex_;  // Lock to protect rnd_
+
+ // sstable Sync() calls are blocked while this is true.
+ std::atomic<bool> delay_sstable_sync_;
+
+ // Drop writes on the floor while this is true.
+ std::atomic<bool> drop_writes_;
+
+ // Simulate no-space errors while this is true.
+ std::atomic<bool> no_space_;
+
+ // Simulate a non-writable file system while this is true.
+ std::atomic<bool> non_writable_;
+
+ // Force sync of manifest files to fail while this is true.
+ std::atomic<bool> manifest_sync_error_;
+
+ // Force writes to manifest files to fail while this is true.
+ std::atomic<bool> manifest_write_error_;
+
+ // Force writes to log files to fail while this is true.
+ std::atomic<bool> log_write_error_;
+
+ // Slow down every log write, in micro-seconds.
+ std::atomic<int> log_write_slowdown_;
+
+ // Number of WAL files that are still open for write.
+ std::atomic<int> num_open_wal_file_;
+
+ bool count_random_reads_;
+ anon::AtomicCounter random_read_counter_;
+ std::atomic<size_t> random_read_bytes_counter_;
+ std::atomic<int> random_file_open_counter_;
+
+ bool count_sequential_reads_;
+ anon::AtomicCounter sequential_read_counter_;
+
+ anon::AtomicCounter sleep_counter_;
+
+ std::atomic<int64_t> bytes_written_;
+
+ std::atomic<int> sync_counter_;
+
+ std::atomic<uint32_t> non_writeable_rate_;
+
+ std::atomic<uint32_t> new_writable_count_;
+
+ std::atomic<uint32_t> non_writable_count_;
+
+ std::function<void()>* table_write_callback_;
+
+ std::atomic<int64_t> addon_time_;
+
+ std::atomic<int> now_cpu_count_;
+
+ std::atomic<int> delete_count_;
+
+ std::atomic<bool> time_elapse_only_sleep_;
+
+ bool no_slowdown_;
+
+ std::atomic<bool> is_wal_sync_thread_safe_{true};
+
+ std::atomic<size_t> compaction_readahead_size_{};
+};
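+
+// Usage sketch (illustrative): the atomics above are toggled directly from
+// tests to inject faults around a specific operation, e.g.
+//
+//   env_->drop_writes_.store(true, std::memory_order_release);
+//   // ... trigger the flush/compaction whose output should be dropped ...
+//   env_->drop_writes_.store(false, std::memory_order_release);
+//
+//   env_->non_writable_count_.store(1);  // fail exactly one NewWritableFile()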
+
+#ifndef ROCKSDB_LITE
+class OnFileDeletionListener : public EventListener {
+ public:
+ OnFileDeletionListener() : matched_count_(0), expected_file_name_("") {}
+
+ void SetExpectedFileName(const std::string file_name) {
+ expected_file_name_ = file_name;
+ }
+
+ void VerifyMatchedCount(size_t expected_value) {
+ ASSERT_EQ(matched_count_, expected_value);
+ }
+
+ void OnTableFileDeleted(const TableFileDeletionInfo& info) override {
+ if (expected_file_name_ != "") {
+ ASSERT_EQ(expected_file_name_, info.file_path);
+ expected_file_name_ = "";
+ matched_count_++;
+ }
+ }
+
+ private:
+ size_t matched_count_;
+ std::string expected_file_name_;
+};
+#endif
+
+// A test merge operator that mimics put but fails if one of the merge
+// operands (or the existing value) is "corrupted".
+class TestPutOperator : public MergeOperator {
+ public:
+ virtual bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override {
+ if (merge_in.existing_value != nullptr &&
+ *(merge_in.existing_value) == "corrupted") {
+ return false;
+ }
+ for (auto value : merge_in.operand_list) {
+ if (value == "corrupted") {
+ return false;
+ }
+ }
+ merge_out->existing_operand = merge_in.operand_list.back();
+ return true;
+ }
+
+ virtual const char* Name() const override { return "TestPutOperator"; }
+};
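+
+// Usage sketch (illustrative): the operator is installed through
+// Options::merge_operator, after which Merge() behaves like Put() unless a
+// "corrupted" operand or base value is encountered.
+//
+//   Options options = CurrentOptions();
+//   options.merge_operator = std::make_shared<TestPutOperator>();
+//   DestroyAndReopen(options);
+//   ASSERT_OK(Merge("key", "value"));
+//   ASSERT_EQ("value", Get("key"));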
+
+class DBTestBase : public testing::Test {
+ public:
+ // Sequence of option configurations to try
+ enum OptionConfig : int {
+ kDefault = 0,
+ kBlockBasedTableWithPrefixHashIndex = 1,
+ kBlockBasedTableWithWholeKeyHashIndex = 2,
+ kPlainTableFirstBytePrefix = 3,
+ kPlainTableCappedPrefix = 4,
+ kPlainTableCappedPrefixNonMmap = 5,
+ kPlainTableAllBytesPrefix = 6,
+ kVectorRep = 7,
+ kHashLinkList = 8,
+ kMergePut = 9,
+ kFilter = 10,
+ kFullFilterWithNewTableReaderForCompactions = 11,
+ kUncompressed = 12,
+ kNumLevel_3 = 13,
+ kDBLogDir = 14,
+ kWalDirAndMmapReads = 15,
+ kManifestFileSize = 16,
+ kPerfOptions = 17,
+ kHashSkipList = 18,
+ kUniversalCompaction = 19,
+ kUniversalCompactionMultiLevel = 20,
+ kCompressedBlockCache = 21,
+ kInfiniteMaxOpenFiles = 22,
+ kxxHashChecksum = 23,
+ kFIFOCompaction = 24,
+ kOptimizeFiltersForHits = 25,
+ kRowCache = 26,
+ kRecycleLogFiles = 27,
+ kConcurrentSkipList = 28,
+ kPipelinedWrite = 29,
+ kConcurrentWALWrites = 30,
+ kDirectIO,
+ kLevelSubcompactions,
+ kBlockBasedTableWithIndexRestartInterval,
+ kBlockBasedTableWithPartitionedIndex,
+ kBlockBasedTableWithPartitionedIndexFormat4,
+ kPartitionedFilterWithNewTableReaderForCompactions,
+ kUniversalSubcompactions,
+ kxxHash64Checksum,
+ kUnorderedWrite,
+ // This must be the last line
+ kEnd,
+ };
+
+ public:
+ std::string dbname_;
+ std::string alternative_wal_dir_;
+ std::string alternative_db_log_dir_;
+ MockEnv* mem_env_;
+ Env* encrypted_env_;
+ SpecialEnv* env_;
+ std::shared_ptr<Env> env_guard_;
+ DB* db_;
+ std::vector<ColumnFamilyHandle*> handles_;
+
+ int option_config_;
+ Options last_options_;
+
+ // Skip some options, as they may not be applicable to a specific test.
+ // To add more skip constants, use values 4, 8, 16, etc.
+ enum OptionSkip {
+ kNoSkip = 0,
+ kSkipDeletesFilterFirst = 1,
+ kSkipUniversalCompaction = 2,
+ kSkipMergePut = 4,
+ kSkipPlainTable = 8,
+ kSkipHashIndex = 16,
+ kSkipNoSeekToLast = 32,
+ kSkipFIFOCompaction = 128,
+ kSkipMmapReads = 256,
+ };
+
+ const int kRangeDelSkipConfigs =
+ // Plain tables do not support range deletions.
+ kSkipPlainTable |
+ // MmapReads disables the iterator pinning that RangeDelAggregator
+ // requires.
+ kSkipMmapReads;
+
+ explicit DBTestBase(const std::string path);
+
+ ~DBTestBase();
+
+ static std::string RandomString(Random* rnd, int len) {
+ std::string r;
+ test::RandomString(rnd, len, &r);
+ return r;
+ }
+
+ static std::string Key(int i) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "key%06d", i);
+ return std::string(buf);
+ }
+
+ static bool ShouldSkipOptions(int option_config, int skip_mask = kNoSkip);
+
+ // Switch to a fresh database with the next option configuration to
+ // test. Return false if there are no more configurations to test.
+ bool ChangeOptions(int skip_mask = kNoSkip);
+
+ // Switch between different compaction styles.
+ bool ChangeCompactOptions();
+
+ // Switch between different WAL-related options.
+ bool ChangeWalOptions();
+
+ // Switch between different filter policies.
+ // Jump from kDefault to kFilter to kFullFilter.
+ bool ChangeFilterOptions();
+
+ // Switch between different DB options for file ingestion tests.
+ bool ChangeOptionsForFileIngestionTest();
+
+ // Return the current option configuration.
+ Options CurrentOptions(const anon::OptionsOverride& options_override =
+ anon::OptionsOverride()) const;
+
+ Options CurrentOptions(const Options& default_options,
+ const anon::OptionsOverride& options_override =
+ anon::OptionsOverride()) const;
+
+ static Options GetDefaultOptions();
+
+ Options GetOptions(int option_config,
+ const Options& default_options = GetDefaultOptions(),
+ const anon::OptionsOverride& options_override =
+ anon::OptionsOverride()) const;
+
+ DBImpl* dbfull() { return reinterpret_cast<DBImpl*>(db_); }
+
+ void CreateColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options);
+
+ void CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+ const Options& options);
+
+ void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const std::vector<Options>& options);
+
+ void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options);
+
+ Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const std::vector<Options>& options);
+
+ Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options);
+
+ void Reopen(const Options& options);
+
+ void Close();
+
+ void DestroyAndReopen(const Options& options);
+
+ void Destroy(const Options& options, bool delete_cf_paths = false);
+
+ Status ReadOnlyReopen(const Options& options);
+
+ Status TryReopen(const Options& options);
+
+ bool IsDirectIOSupported();
+
+ bool IsMemoryMappedAccessSupported() const;
+
+ Status Flush(int cf = 0);
+
+ Status Flush(const std::vector<int>& cf_ids);
+
+ Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions());
+
+ Status Put(int cf, const Slice& k, const Slice& v,
+ WriteOptions wo = WriteOptions());
+
+ Status Merge(const Slice& k, const Slice& v,
+ WriteOptions wo = WriteOptions());
+
+ Status Merge(int cf, const Slice& k, const Slice& v,
+ WriteOptions wo = WriteOptions());
+
+ Status Delete(const std::string& k);
+
+ Status Delete(int cf, const std::string& k);
+
+ Status SingleDelete(const std::string& k);
+
+ Status SingleDelete(int cf, const std::string& k);
+
+ bool SetPreserveDeletesSequenceNumber(SequenceNumber sn);
+
+ std::string Get(const std::string& k, const Snapshot* snapshot = nullptr);
+
+ std::string Get(int cf, const std::string& k,
+ const Snapshot* snapshot = nullptr);
+
+ Status Get(const std::string& k, PinnableSlice* v);
+
+ std::vector<std::string> MultiGet(std::vector<int> cfs,
+ const std::vector<std::string>& k,
+ const Snapshot* snapshot,
+ const bool batched);
+
+ std::vector<std::string> MultiGet(const std::vector<std::string>& k,
+ const Snapshot* snapshot = nullptr);
+
+ uint64_t GetNumSnapshots();
+
+ uint64_t GetTimeOldestSnapshots();
+
+ uint64_t GetSequenceOldestSnapshots();
+
+ // Return a string that contains all key,value pairs in order,
+ // formatted like "(k1->v1)(k2->v2)".
+ std::string Contents(int cf = 0);
+
+ std::string AllEntriesFor(const Slice& user_key, int cf = 0);
+
+#ifndef ROCKSDB_LITE
+ int NumSortedRuns(int cf = 0);
+
+ uint64_t TotalSize(int cf = 0);
+
+ uint64_t SizeAtLevel(int level);
+
+ size_t TotalLiveFiles(int cf = 0);
+
+ size_t CountLiveFiles();
+
+ int NumTableFilesAtLevel(int level, int cf = 0);
+
+ double CompressionRatioAtLevel(int level, int cf = 0);
+
+ int TotalTableFiles(int cf = 0, int levels = -1);
+#endif // ROCKSDB_LITE
+
+ // Return spread of files per level
+ std::string FilesPerLevel(int cf = 0);
+
+ size_t CountFiles();
+
+ uint64_t Size(const Slice& start, const Slice& limit, int cf = 0);
+
+ void Compact(int cf, const Slice& start, const Slice& limit,
+ uint32_t target_path_id);
+
+ void Compact(int cf, const Slice& start, const Slice& limit);
+
+ void Compact(const Slice& start, const Slice& limit);
+
+ // Do n memtable compactions, each of which produces an sstable
+ // covering the range [small,large].
+ void MakeTables(int n, const std::string& small, const std::string& large,
+ int cf = 0);
+
+ // Prevent pushing of new sstables into deeper levels by adding
+ // tables that cover a specified range to all levels.
+ void FillLevels(const std::string& smallest, const std::string& largest,
+ int cf);
+
+ void MoveFilesToLevel(int level, int cf = 0);
+
+#ifndef ROCKSDB_LITE
+ void DumpFileCounts(const char* label);
+#endif // ROCKSDB_LITE
+
+ std::string DumpSSTableList();
+
+ static void GetSstFiles(Env* env, std::string path,
+ std::vector<std::string>* files);
+
+ int GetSstFileCount(std::string path);
+
+ // this will generate non-overlapping files since it keeps increasing key_idx
+ void GenerateNewFile(Random* rnd, int* key_idx, bool nowait = false);
+
+ void GenerateNewFile(int cf, Random* rnd, int* key_idx, bool nowait = false);
+
+ static const int kNumKeysByGenerateNewRandomFile;
+ static const int KNumKeysByGenerateNewFile = 100;
+
+ void GenerateNewRandomFile(Random* rnd, bool nowait = false);
+
+ std::string IterStatus(Iterator* iter);
+
+ Options OptionsForLogIterTest();
+
+ std::string DummyString(size_t len, char c = 'a');
+
+ void VerifyIterLast(std::string expected_key, int cf = 0);
+
+ // Used to test InplaceUpdate
+
+ // If the previous value is nullptr, sets newValue to a string of delta's
+ // size (filled with 'c') and returns UPDATED.
+ // Otherwise, shrinks the previous value in place to a string of 'b's that
+ // is one byte shorter than before and returns UPDATED_INPLACE.
+ static UpdateStatus updateInPlaceSmallerSize(char* prevValue,
+ uint32_t* prevSize, Slice delta,
+ std::string* newValue);
+
+ static UpdateStatus updateInPlaceSmallerVarintSize(char* prevValue,
+ uint32_t* prevSize,
+ Slice delta,
+ std::string* newValue);
+
+ static UpdateStatus updateInPlaceLargerSize(char* prevValue,
+ uint32_t* prevSize, Slice delta,
+ std::string* newValue);
+
+ static UpdateStatus updateInPlaceNoAction(char* prevValue, uint32_t* prevSize,
+ Slice delta, std::string* newValue);
+
+ // Utility method to test InplaceUpdate
+ void validateNumberOfEntries(int numValues, int cf = 0);
+
+ void CopyFile(const std::string& source, const std::string& destination,
+ uint64_t size = 0);
+
+ std::unordered_map<std::string, uint64_t> GetAllSSTFiles(
+ uint64_t* total_size = nullptr);
+
+ std::vector<std::uint64_t> ListTableFiles(Env* env, const std::string& path);
+
+ void VerifyDBFromMap(
+ std::map<std::string, std::string> true_data,
+ size_t* total_reads_res = nullptr, bool tailing_iter = false,
+ std::map<std::string, Status> status = std::map<std::string, Status>());
+
+ void VerifyDBInternal(
+ std::vector<std::pair<std::string, std::string>> true_data);
+
+#ifndef ROCKSDB_LITE
+ uint64_t GetNumberOfSstFilesForColumnFamily(DB* db,
+ std::string column_family_name);
+#endif // ROCKSDB_LITE
+
+ uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) {
+ return options.statistics->getTickerCount(ticker_type);
+ }
+
+ uint64_t TestGetAndResetTickerCount(const Options& options,
+ Tickers ticker_type) {
+ return options.statistics->getAndResetTickerCount(ticker_type);
+ }
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_universal_compaction_test.cc b/src/rocksdb/db/db_universal_compaction_test.cc
new file mode 100644
index 000000000..61531ae16
--- /dev/null
+++ b/src/rocksdb/db/db_universal_compaction_test.cc
@@ -0,0 +1,2254 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#if !defined(ROCKSDB_LITE)
+#include "rocksdb/utilities/table_properties_collectors.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static std::string CompressibleString(Random* rnd, int len) {
+ std::string r;
+ test::CompressibleString(rnd, 0.8, len, &r);
+ return r;
+}
+
+class DBTestUniversalCompactionBase
+ : public DBTestBase,
+ public ::testing::WithParamInterface<std::tuple<int, bool>> {
+ public:
+ explicit DBTestUniversalCompactionBase(
+ const std::string& path) : DBTestBase(path) {}
+ void SetUp() override {
+ num_levels_ = std::get<0>(GetParam());
+ exclusive_manual_compaction_ = std::get<1>(GetParam());
+ }
+ int num_levels_;
+ bool exclusive_manual_compaction_;
+};
+
+class DBTestUniversalCompaction : public DBTestUniversalCompactionBase {
+ public:
+ DBTestUniversalCompaction() :
+ DBTestUniversalCompactionBase("/db_universal_compaction_test") {}
+};
+
+class DBTestUniversalCompaction2 : public DBTestBase {
+ public:
+ DBTestUniversalCompaction2() : DBTestBase("/db_universal_compaction_test2") {}
+};
+
+namespace {
+void VerifyCompactionResult(
+ const ColumnFamilyMetaData& cf_meta,
+ const std::set<std::string>& overlapping_file_numbers) {
+#ifndef NDEBUG
+ for (auto& level : cf_meta.levels) {
+ for (auto& file : level.files) {
+ assert(overlapping_file_numbers.find(file.name) ==
+ overlapping_file_numbers.end());
+ }
+ }
+#endif
+}
+
+class KeepFilter : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ return false;
+ }
+
+ const char* Name() const override { return "KeepFilter"; }
+};
+
+class KeepFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit KeepFilterFactory(bool check_context = false)
+ : check_context_(check_context) {}
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ if (check_context_) {
+ EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction);
+ EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction);
+ }
+ return std::unique_ptr<CompactionFilter>(new KeepFilter());
+ }
+
+ const char* Name() const override { return "KeepFilterFactory"; }
+ bool check_context_;
+ std::atomic_bool expect_full_compaction_;
+ std::atomic_bool expect_manual_compaction_;
+};
+
+class DelayFilter : public CompactionFilter {
+ public:
+ explicit DelayFilter(DBTestBase* d) : db_test(d) {}
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ db_test->env_->addon_time_.fetch_add(1000);
+ return true;
+ }
+
+ const char* Name() const override { return "DelayFilter"; }
+
+ private:
+ DBTestBase* db_test;
+};
+
+class DelayFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {}
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& /*context*/) override {
+ return std::unique_ptr<CompactionFilter>(new DelayFilter(db_test));
+ }
+
+ const char* Name() const override { return "DelayFilterFactory"; }
+
+ private:
+ DBTestBase* db_test;
+};
+} // namespace
+
+// Make sure we don't trigger a problem if the trigger condition is given
+// as 0, which is invalid.
+TEST_P(DBTestUniversalCompaction, UniversalCompactionSingleSortedRun) {
+ Options options = CurrentOptions();
+
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = num_levels_;
+ // Configure universal compaction to always compact to a single sorted run.
+ options.level0_file_num_compaction_trigger = 0;
+ options.compaction_options_universal.size_ratio = 10;
+ options.compaction_options_universal.min_merge_width = 2;
+ options.compaction_options_universal.max_size_amplification_percent = 0;
+
+ options.write_buffer_size = 105 << 10; // 105KB
+ options.arena_block_size = 4 << 10;
+ options.target_file_size_base = 32 << 10; // 32KB
+ KeepFilterFactory* filter = new KeepFilterFactory(true);
+ filter->expect_manual_compaction_.store(false);
+ options.compaction_filter_factory.reset(filter);
+
+ DestroyAndReopen(options);
+ ASSERT_EQ(1, db_->GetOptions().level0_file_num_compaction_trigger);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ filter->expect_full_compaction_.store(true);
+
+ for (int num = 0; num < 16; num++) {
+ // Write a 100KB file. It should immediately be compacted into a single file.
+ GenerateNewFile(&rnd, &key_idx);
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(NumSortedRuns(0), 1);
+ }
+ ASSERT_OK(Put(Key(key_idx), ""));
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(NumSortedRuns(0), 1);
+}
+
+TEST_P(DBTestUniversalCompaction, OptimizeFiltersForHits) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.size_ratio = 5;
+ options.num_levels = num_levels_;
+ options.write_buffer_size = 105 << 10; // 105KB
+ options.arena_block_size = 4 << 10;
+ options.target_file_size_base = 32 << 10; // 32KB
+ // trigger compaction if there are >= 4 files
+ options.level0_file_num_compaction_trigger = 4;
+ BlockBasedTableOptions bbto;
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.optimize_filters_for_hits = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.memtable_factory.reset(new SpecialSkipListFactory(3));
+
+ DestroyAndReopen(options);
+
+ // block compaction from happening
+ env_->SetBackgroundThreads(1, Env::LOW);
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ Put(Key(num * 10), "val");
+ if (num) {
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ Put(Key(30 + num * 10), "val");
+ Put(Key(60 + num * 10), "val");
+ }
+ Put("", "");
+ dbfull()->TEST_WaitForFlushMemTable();
+
+ // Query a set of non-existent keys.
+ for (int i = 5; i < 90; i += 10) {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+
+ // Make sure bloom filter is used at least once.
+ ASSERT_GT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ auto prev_counter = TestGetTickerCount(options, BLOOM_FILTER_USEFUL);
+
+ // Make sure bloom filter is used for all but the last L0 file when looking
+ // up a non-existent key that's in the range of all L0 files.
+ ASSERT_EQ(Get(Key(35)), "NOT_FOUND");
+ ASSERT_EQ(prev_counter + NumTableFilesAtLevel(0) - 1,
+ TestGetTickerCount(options, BLOOM_FILTER_USEFUL));
+ prev_counter = TestGetTickerCount(options, BLOOM_FILTER_USEFUL);
+
+ // Unblock compaction and wait for it to happen.
+ sleeping_task_low.WakeUp();
+ dbfull()->TEST_WaitForCompact();
+
+ // The same queries will not trigger bloom filter
+ for (int i = 5; i < 90; i += 10) {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ ASSERT_EQ(prev_counter, TestGetTickerCount(options, BLOOM_FILTER_USEFUL));
+}
+
+ // TODO(kailiu) The tests on UniversalCompaction have some issues:
+ // 1. A lot of magic numbers ("11" or "12").
+ // 2. They make assumptions about the memtable flush conditions, which may
+ //    change from time to time.
+TEST_P(DBTestUniversalCompaction, UniversalCompactionTrigger) {
+ Options options;
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.size_ratio = 5;
+ options.num_levels = num_levels_;
+ options.write_buffer_size = 105 << 10; // 105KB
+ options.arena_block_size = 4 << 10;
+ options.target_file_size_base = 32 << 10; // 32KB
+ // trigger compaction if there are >= 4 files
+ options.level0_file_num_compaction_trigger = 4;
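+ // KeepFilterFactory is a test helper that keeps every key; constructed with
+ // check_context=true, it asserts that each compaction's context (full /
+ // manual) matches the expect_* flags toggled throughout this test.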
+ KeepFilterFactory* filter = new KeepFilterFactory(true);
+ filter->expect_manual_compaction_.store(false);
+ options.compaction_filter_factory.reset(filter);
+
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBTestWritableFile.GetPreallocationStatus", [&](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ size_t preallocation_size = *(static_cast<size_t*>(arg));
+ if (num_levels_ > 3) {
+ ASSERT_LE(preallocation_size, options.target_file_size_base * 1.1);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ filter->expect_full_compaction_.store(true);
+ // Stage 1:
+ // Generate a set of files at level 0, but don't trigger level-0
+ // compaction.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+ num++) {
+ // Write 100KB
+ GenerateNewFile(1, &rnd, &key_idx);
+ }
+
+ // Generate one more file at level-0, which should trigger level-0
+ // compaction.
+ GenerateNewFile(1, &rnd, &key_idx);
+ // Suppose each file flushed from the memtable has size 1. Now we compact
+ // level0_file_num_compaction_trigger (= 4) files and should end up with one
+ // big file of size 4.
+ ASSERT_EQ(NumSortedRuns(1), 1);
+
+ // Stage 2:
+ // Now we have one file at level 0, with size 4. We also have some data in
+ // mem table. Let's continue generating new files at level 0, but don't
+ // trigger level-0 compaction.
+ // First, clean up memtable before inserting new data. This will generate
+ // a level-0 file, with size around 0.4 (according to previously written
+ // data amount).
+ filter->expect_full_compaction_.store(false);
+ ASSERT_OK(Flush(1));
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
+ num++) {
+ GenerateNewFile(1, &rnd, &key_idx);
+ ASSERT_EQ(NumSortedRuns(1), num + 3);
+ }
+
+ // Generate one more file at level-0, which should trigger level-0
+ // compaction.
+ GenerateNewFile(1, &rnd, &key_idx);
+ // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1.
+ // After compaction, we should have 2 files, with size 4, 2.4.
+ ASSERT_EQ(NumSortedRuns(1), 2);
+
+ // Stage 3:
+ // Now we have 2 files at level 0, with size 4 and 2.4. Continue
+ // generating new files at level 0.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
+ num++) {
+ GenerateNewFile(1, &rnd, &key_idx);
+ ASSERT_EQ(NumSortedRuns(1), num + 3);
+ }
+
+ // Generate one more file at level-0, which should trigger level-0
+ // compaction.
+ GenerateNewFile(1, &rnd, &key_idx);
+ // Before compaction, we have 4 files at level 0, with size 4, 2.4, 1, 1.
+ // After compaction, we should have 3 files, with size 4, 2.4, 2.
+ ASSERT_EQ(NumSortedRuns(1), 3);
+
+ // Stage 4:
+ // Now we have 3 files at level 0, with size 4, 2.4, 2. Let's generate a
+ // new file of size 1.
+ GenerateNewFile(1, &rnd, &key_idx);
+ dbfull()->TEST_WaitForCompact();
+ // Level-0 compaction is triggered, but no file will be picked up.
+ ASSERT_EQ(NumSortedRuns(1), 4);
+
+ // Stage 5:
+ // Now we have 4 files at level 0, with size 4, 2.4, 2, 1. Let's generate
+ // a new file of size 1.
+ filter->expect_full_compaction_.store(true);
+ GenerateNewFile(1, &rnd, &key_idx);
+ dbfull()->TEST_WaitForCompact();
+ // All files at level 0 will be compacted into a single one.
+ ASSERT_EQ(NumSortedRuns(1), 1);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionSizeAmplification) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = num_levels_;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = 3;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // Generate two files in Level 0. Both files are approx the same size.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+ num++) {
+ // Write 110KB (11 values, each 10K)
+ for (int i = 0; i < 11; i++) {
+ ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
+ key_idx++;
+ }
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ ASSERT_EQ(NumSortedRuns(1), num + 1);
+ }
+ ASSERT_EQ(NumSortedRuns(1), 2);
+
+ // Flush whatever is remaining in the memtable. This is typically small,
+ // which should not trigger size-ratio-based compaction but will instead
+ // trigger size-amplification-based compaction.
+ ASSERT_OK(Flush(1));
+
+ dbfull()->TEST_WaitForCompact();
+
+ // Verify that size amplification did occur
+ ASSERT_EQ(NumSortedRuns(1), 1);
+}
+
+TEST_P(DBTestUniversalCompaction, DynamicUniversalCompactionSizeAmplification) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = 3;
+ // Initial setup of compaction_options_universal will prevent universal
+ // compaction from happening
+ options.compaction_options_universal.size_ratio = 100;
+ options.compaction_options_universal.min_merge_width = 100;
+ DestroyAndReopen(options);
+
+ int total_picked_compactions = 0;
+ int total_size_amp_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+ if (arg) {
+ total_picked_compactions++;
+ Compaction* c = static_cast<Compaction*>(arg);
+ if (c->compaction_reason() ==
+ CompactionReason::kUniversalSizeAmplification) {
+ total_size_amp_compactions++;
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ MutableCFOptions mutable_cf_options;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // Generate two files in Level 0. Both files are approx the same size.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+ num++) {
+ // Write 110KB (11 values, each 10K)
+ for (int i = 0; i < 11; i++) {
+ ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
+ key_idx++;
+ }
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ ASSERT_EQ(NumSortedRuns(1), num + 1);
+ }
+ ASSERT_EQ(NumSortedRuns(1), 2);
+
+ // Flush whatever is remaining in the memtable. This is typically small,
+ // which should not trigger size-ratio-based compaction but could instead
+ // trigger size-amplification-based compaction if the threshold were set
+ // to 110.
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+ // Verify compaction did not happen
+ ASSERT_EQ(NumSortedRuns(1), 3);
+
+ // Trigger compaction if size amplification exceeds 110% without reopening DB
+ ASSERT_EQ(dbfull()
+ ->GetOptions(handles_[1])
+ .compaction_options_universal.max_size_amplification_percent,
+ 200U);
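+ // SetOptions() applies mutable column-family options without reopening the
+ // DB; the "{...}" syntax assigns fields nested inside
+ // compaction_options_universal.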
+ ASSERT_OK(dbfull()->SetOptions(handles_[1],
+ {{"compaction_options_universal",
+ "{max_size_amplification_percent=110;}"}}));
+ ASSERT_EQ(dbfull()
+ ->GetOptions(handles_[1])
+ .compaction_options_universal.max_size_amplification_percent,
+ 110u);
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+ &mutable_cf_options));
+ ASSERT_EQ(110u, mutable_cf_options.compaction_options_universal
+ .max_size_amplification_percent);
+
+ dbfull()->TEST_WaitForCompact();
+ // Verify that size amplification did happen
+ ASSERT_EQ(NumSortedRuns(1), 1);
+ ASSERT_EQ(total_picked_compactions, 1);
+ ASSERT_EQ(total_size_amp_compactions, 1);
+}
+
+TEST_P(DBTestUniversalCompaction, DynamicUniversalCompactionReadAmplification) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = 3;
+ // Initial setup of compaction_options_universal will prevent universal
+ // compaction from happening
+ options.compaction_options_universal.max_size_amplification_percent = 2000;
+ options.compaction_options_universal.size_ratio = 0;
+ options.compaction_options_universal.min_merge_width = 100;
+ DestroyAndReopen(options);
+
+ int total_picked_compactions = 0;
+ int total_size_ratio_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+ if (arg) {
+ total_picked_compactions++;
+ Compaction* c = static_cast<Compaction*>(arg);
+ if (c->compaction_reason() == CompactionReason::kUniversalSizeRatio) {
+ total_size_ratio_compactions++;
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ MutableCFOptions mutable_cf_options;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // Generate three files in Level 0. All files are approx the same size.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ // Write 110KB (11 values, each 10K)
+ for (int i = 0; i < 11; i++) {
+ ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
+ key_idx++;
+ }
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ ASSERT_EQ(NumSortedRuns(1), num + 1);
+ }
+ ASSERT_EQ(NumSortedRuns(1), options.level0_file_num_compaction_trigger);
+
+ // Flush whatever is remaining in the memtable. This is typically small,
+ // about 30KB.
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+ // Verify compaction did not happen
+ ASSERT_EQ(NumSortedRuns(1), options.level0_file_num_compaction_trigger + 1);
+ ASSERT_EQ(total_picked_compactions, 0);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ handles_[1],
+ {{"compaction_options_universal",
+ "{min_merge_width=2;max_merge_width=2;size_ratio=100;}"}}));
+ ASSERT_EQ(dbfull()
+ ->GetOptions(handles_[1])
+ .compaction_options_universal.min_merge_width,
+ 2u);
+ ASSERT_EQ(dbfull()
+ ->GetOptions(handles_[1])
+ .compaction_options_universal.max_merge_width,
+ 2u);
+ ASSERT_EQ(
+ dbfull()->GetOptions(handles_[1]).compaction_options_universal.size_ratio,
+ 100u);
+
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+ &mutable_cf_options));
+ ASSERT_EQ(mutable_cf_options.compaction_options_universal.size_ratio, 100u);
+ ASSERT_EQ(mutable_cf_options.compaction_options_universal.min_merge_width,
+ 2u);
+ ASSERT_EQ(mutable_cf_options.compaction_options_universal.max_merge_width,
+ 2u);
+
+ dbfull()->TEST_WaitForCompact();
+
+ // Files in L0 are approx: 0.3 (30KB), 1, 1, 1.
+ // On compaction: the files are below the size amp threshold, so we fall
+ // through to checking the read amp conditions. The configured size ratio is
+ // not big enough to take 0.3 into consideration. So the next two files of
+ // size 1 are compacted together first, as they satisfy the size ratio and
+ // (min_merge_width, max_merge_width) conditions, yielding a file of size 2.
+ // Next, the newly generated 2 and the last file of size 1 are compacted
+ // together. So at the end: #sorted_runs = 2, #picked_compactions = 2, and
+ // all the picked ones are size-ratio-based compactions.
+ ASSERT_EQ(NumSortedRuns(1), 2);
+ // If max_merge_width had not been changed dynamically above, and if it had
+ // remained at its default value of UINT_MAX, total_picked_compactions
+ // would have been 1.
+ ASSERT_EQ(total_picked_compactions, 2);
+ ASSERT_EQ(total_size_ratio_compactions, 2);
+}
+
+TEST_P(DBTestUniversalCompaction, CompactFilesOnUniversalCompaction) {
+ const int kTestKeySize = 16;
+ const int kTestValueSize = 984;
+ const int kEntrySize = kTestKeySize + kTestValueSize;
+ const int kEntriesPerBuffer = 10;
+
+ ChangeCompactOptions();
+ Options options;
+ options.create_if_missing = true;
+ options.compaction_style = kCompactionStyleLevel;
+ options.num_levels = 1;
+ options.target_file_size_base = options.write_buffer_size;
+ options.compression = kNoCompression;
+ options = CurrentOptions(options);
+ options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_EQ(options.compaction_style, kCompactionStyleUniversal);
+ Random rnd(301);
+ for (int key = 1024 * kEntriesPerBuffer; key >= 0; --key) {
+ ASSERT_OK(Put(1, ToString(key), RandomString(&rnd, kTestValueSize)));
+ }
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ dbfull()->TEST_WaitForCompact();
+ ColumnFamilyMetaData cf_meta;
+ dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+ std::vector<std::string> compaction_input_file_names;
+ for (auto file : cf_meta.levels[0].files) {
+ if (rnd.OneIn(2)) {
+ compaction_input_file_names.push_back(file.name);
+ }
+ }
+
+ if (compaction_input_file_names.size() == 0) {
+ compaction_input_file_names.push_back(
+ cf_meta.levels[0].files[0].name);
+ }
+
+ // Expect failure, since universal compaction only allows L0 output.
+ ASSERT_FALSE(dbfull()
+ ->CompactFiles(CompactionOptions(), handles_[1],
+ compaction_input_file_names, 1)
+ .ok());
+
+ // expect ok and verify the compacted files no longer exist.
+ ASSERT_OK(dbfull()->CompactFiles(
+ CompactionOptions(), handles_[1],
+ compaction_input_file_names, 0));
+
+ dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+ VerifyCompactionResult(
+ cf_meta,
+ std::set<std::string>(compaction_input_file_names.begin(),
+ compaction_input_file_names.end()));
+
+ compaction_input_file_names.clear();
+
+ // Pick the first and the last file, expect everything is
+ // compacted into one single file.
+ compaction_input_file_names.push_back(
+ cf_meta.levels[0].files[0].name);
+ compaction_input_file_names.push_back(
+ cf_meta.levels[0].files[
+ cf_meta.levels[0].files.size() - 1].name);
+ ASSERT_OK(dbfull()->CompactFiles(
+ CompactionOptions(), handles_[1],
+ compaction_input_file_names, 0));
+
+ dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+ ASSERT_EQ(cf_meta.levels[0].files.size(), 1U);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionTargetLevel) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.num_levels = 7;
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+ // Generate 3 overlapping files
+ Random rnd(301);
+ for (int i = 0; i < 210; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 100)));
+ }
+ ASSERT_OK(Flush());
+
+ for (int i = 200; i < 300; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 100)));
+ }
+ ASSERT_OK(Flush());
+
+ for (int i = 250; i < 260; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 100)));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("3", FilesPerLevel(0));
+ // Compact all files into 1 file and put it in L4
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 4;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ db_->CompactRange(compact_options, nullptr, nullptr);
+ ASSERT_EQ("0,0,0,0,1", FilesPerLevel(0));
+}
+
+#ifndef ROCKSDB_VALGRIND_RUN
+class DBTestUniversalCompactionMultiLevels
+ : public DBTestUniversalCompactionBase {
+ public:
+ DBTestUniversalCompactionMultiLevels() :
+ DBTestUniversalCompactionBase(
+ "/db_universal_compaction_multi_levels_test") {}
+};
+
+TEST_P(DBTestUniversalCompactionMultiLevels, UniversalCompactionMultiLevels) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = num_levels_;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.level0_file_num_compaction_trigger = 8;
+ options.max_background_compactions = 3;
+ options.target_file_size_base = 32 * 1024;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ Random rnd(301);
+ int num_keys = 100000;
+ for (int i = 0; i < num_keys * 2; i++) {
+ ASSERT_OK(Put(1, Key(i % num_keys), Key(i)));
+ }
+
+ dbfull()->TEST_WaitForCompact();
+
+ for (int i = num_keys; i < num_keys * 2; i++) {
+ ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i));
+ }
+}
+
+// Tests universal compaction with trivial move enabled
+TEST_P(DBTestUniversalCompactionMultiLevels, UniversalCompactionTrivialMove) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial", [&](void* arg) {
+ non_trivial_move++;
+ ASSERT_TRUE(arg != nullptr);
+ int output_level = *(static_cast<int*>(arg));
+ ASSERT_EQ(output_level, 0);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.allow_trivial_move = true;
+ options.num_levels = 3;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_background_compactions = 2;
+ options.target_file_size_base = 32 * 1024;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ Random rnd(301);
+ int num_keys = 150000;
+ for (int i = 0; i < num_keys; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ std::vector<std::string> values;
+
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_GT(trivial_move, 0);
+ ASSERT_GT(non_trivial_move, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(MultiLevels, DBTestUniversalCompactionMultiLevels,
+ ::testing::Combine(::testing::Values(3, 20),
+ ::testing::Bool()));
+
+class DBTestUniversalCompactionParallel :
+ public DBTestUniversalCompactionBase {
+ public:
+ DBTestUniversalCompactionParallel() :
+ DBTestUniversalCompactionBase(
+ "/db_universal_compaction_prallel_test") {}
+};
+
+TEST_P(DBTestUniversalCompactionParallel, UniversalCompactionParallel) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = num_levels_;
+ options.write_buffer_size = 1 << 10; // 1KB
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_background_compactions = 3;
+ options.max_background_flushes = 3;
+ options.target_file_size_base = 1 * 1024;
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Delay every compaction so multiple compactions will happen.
+ std::atomic<int> num_compactions_running(0);
+ std::atomic<bool> has_parallel(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():Start", [&](void* /*arg*/) {
+ if (num_compactions_running.fetch_add(1) > 0) {
+ has_parallel.store(true);
+ return;
+ }
+ for (int nwait = 0; nwait < 20000; nwait++) {
+ if (has_parallel.load() || num_compactions_running.load() > 1) {
+ has_parallel.store(true);
+ break;
+ }
+ env_->SleepForMicroseconds(1000);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():End",
+ [&](void* /*arg*/) { num_compactions_running.fetch_add(-1); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ Random rnd(301);
+ int num_keys = 30000;
+ for (int i = 0; i < num_keys * 2; i++) {
+ ASSERT_OK(Put(1, Key(i % num_keys), Key(i)));
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(num_compactions_running.load(), 0);
+ ASSERT_TRUE(has_parallel.load());
+
+ for (int i = num_keys; i < num_keys * 2; i++) {
+ ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i));
+ }
+
+ // Reopen and check.
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ for (int i = num_keys; i < num_keys * 2; i++) {
+ ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i));
+ }
+}
+
+TEST_P(DBTestUniversalCompactionParallel, PickByFileNumberBug) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = num_levels_;
+ options.write_buffer_size = 1 * 1024; // 1KB
+ options.level0_file_num_compaction_trigger = 7;
+ options.max_background_compactions = 2;
+ options.target_file_size_base = 1024 * 1024; // 1MB
+
+ // Disable size-amplification-triggered compaction
+ options.compaction_options_universal.max_size_amplification_percent =
+ UINT_MAX;
+ DestroyAndReopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBTestUniversalCompactionParallel::PickByFileNumberBug:0",
+ "BackgroundCallCompaction:0"},
+ {"UniversalCompactionBuilder::PickCompaction:Return",
+ "DBTestUniversalCompactionParallel::PickByFileNumberBug:1"},
+ {"DBTestUniversalCompactionParallel::PickByFileNumberBug:2",
+ "CompactionJob::Run():Start"}});
+
+ int total_picked_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+ if (arg) {
+ total_picked_compactions++;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Write 7 files to trigger compaction
+ int key_idx = 1;
+ for (int i = 1; i <= 70; i++) {
+ std::string k = Key(key_idx++);
+ ASSERT_OK(Put(k, k));
+ if (i % 10 == 0) {
+ ASSERT_OK(Flush());
+ }
+ }
+
+ // Wait for the 1st background compaction process to start
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:0");
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:1");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+
+ // Write 3 files while 1st compaction is held
+ // These 3 files have different sizes to avoid compacting based on size_ratio
+ int num_keys = 1000;
+ for (int i = 0; i < 3; i++) {
+ for (int j = 1; j <= num_keys; j++) {
+ std::string k = Key(key_idx++);
+ ASSERT_OK(Put(k, k));
+ }
+ ASSERT_OK(Flush());
+ num_keys -= 100;
+ }
+
+ // Hold the 1st compaction from finishing
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:2");
+ dbfull()->TEST_WaitForCompact();
+
+ // There should only be one picked compaction as the score drops below one
+ // after the first one is picked.
+ EXPECT_EQ(total_picked_compactions, 1);
+ EXPECT_EQ(TotalTableFiles(), 4);
+
+ // Stop SyncPoint and destroy the DB and reopen it again
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ key_idx = 1;
+ total_picked_compactions = 0;
+ DestroyAndReopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Write 7 files to trigger compaction
+ for (int i = 1; i <= 70; i++) {
+ std::string k = Key(key_idx++);
+ ASSERT_OK(Put(k, k));
+ if (i % 10 == 0) {
+ ASSERT_OK(Flush());
+ }
+ }
+
+ // Wait for the 1st background compaction process to start
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:0");
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:1");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+
+ // Write 8 files while 1st compaction is held
+ // These 8 files have different sizes to avoid compacting based on size_ratio
+ num_keys = 1000;
+ for (int i = 0; i < 8; i++) {
+ for (int j = 1; j <= num_keys; j++) {
+ std::string k = Key(key_idx++);
+ ASSERT_OK(Put(k, k));
+ }
+ ASSERT_OK(Flush());
+ num_keys -= 100;
+ }
+
+ // Wait for the 2nd background compaction process to start
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:0");
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:1");
+
+ // Hold the 1st and 2nd compaction from finishing
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:2");
+ dbfull()->TEST_WaitForCompact();
+
+ // This time we will trigger one compaction because of the size ratio and
+ // another compaction because the number of files not yet compacted is
+ // greater than 7.
+ EXPECT_GE(total_picked_compactions, 2);
+}
+
+INSTANTIATE_TEST_CASE_P(Parallel, DBTestUniversalCompactionParallel,
+ ::testing::Combine(::testing::Values(1, 10),
+ ::testing::Values(false)));
+#endif // ROCKSDB_VALGRIND_RUN
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionOptions) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.write_buffer_size = 105 << 10; // 105KB
+ options.arena_block_size = 4 << 10; // 4KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = 4;
+ options.num_levels = num_levels_;
+ options.compaction_options_universal.compression_size_percent = -1;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ // Write 100KB (100 values, each 1K)
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 990)));
+ key_idx++;
+ }
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+
+ if (num < options.level0_file_num_compaction_trigger - 1) {
+ ASSERT_EQ(NumSortedRuns(1), num + 1);
+ }
+ }
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(NumSortedRuns(1), 1);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionStopStyleSimilarSize) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.write_buffer_size = 105 << 10; // 105KB
+ options.arena_block_size = 4 << 10; // 4KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ // trigger compaction if there are >= 4 files
+ options.level0_file_num_compaction_trigger = 4;
+ options.compaction_options_universal.size_ratio = 10;
+ options.compaction_options_universal.stop_style =
+ kCompactionStopStyleSimilarSize;
+ options.num_levels = num_levels_;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // Stage 1:
+ // Generate a set of files at level 0, but don't trigger level-0
+ // compaction.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+ num++) {
+ // Write 100KB (100 values, each 1K)
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990)));
+ key_idx++;
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(NumSortedRuns(), num + 1);
+ }
+
+ // Generate one more file at level-0, which should trigger level-0
+ // compaction.
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990)));
+ key_idx++;
+ }
+ dbfull()->TEST_WaitForCompact();
+ // Suppose each file flushed from the memtable has size 1. Now we compact
+ // level0_file_num_compaction_trigger (= 4) files and should end up with one
+ // big file of size 4.
+ ASSERT_EQ(NumSortedRuns(), 1);
+
+ // Stage 2:
+ // Now we have one file at level 0, with size 4. We also have some data in
+ // mem table. Let's continue generating new files at level 0, but don't
+ // trigger level-0 compaction.
+ // First, clean up memtable before inserting new data. This will generate
+ // a level-0 file, with size around 0.4 (according to previously written
+ // data amount).
+ dbfull()->Flush(FlushOptions());
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
+ num++) {
+ // Write 100KB (100 values, each ~1K)
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990)));
+ key_idx++;
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(NumSortedRuns(), num + 3);
+ }
+
+ // Generate one more file at level-0, which should trigger level-0
+ // compaction.
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990)));
+ key_idx++;
+ }
+ dbfull()->TEST_WaitForCompact();
+ // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1.
+ // After compaction, we should have 3 files, with size 4, 0.4, 2.
+ ASSERT_EQ(NumSortedRuns(), 3);
+ // Stage 3:
+ // Now we have 3 files at level 0, with size 4, 0.4, 2. Generate one
+ // more file at level-0, which should trigger level-0 compaction.
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990)));
+ key_idx++;
+ }
+ dbfull()->TEST_WaitForCompact();
+ // Level-0 compaction is triggered, but no file will be picked up.
+ ASSERT_EQ(NumSortedRuns(), 4);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionCompressRatio1) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = num_levels_;
+ options.compaction_options_universal.compression_size_percent = 70;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // The first compaction (2) is compressed.
+ for (int num = 0; num < 2; num++) {
+ // Write 110KB (11 values, each 10K)
+ for (int i = 0; i < 11; i++) {
+ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+ key_idx++;
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ }
+ ASSERT_LT(TotalSize(), 110000U * 2 * 0.9);
+
+ // The second compaction (4) is compressed
+ for (int num = 0; num < 2; num++) {
+ // Write 110KB (11 values, each 10K)
+ for (int i = 0; i < 11; i++) {
+ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+ key_idx++;
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ }
+ ASSERT_LT(TotalSize(), 110000 * 4 * 0.9);
+
+ // The third compaction (2 4) is compressed since this time it is
+ // (1 1 3.2) and 3.2/5.2 doesn't reach ratio.
+ for (int num = 0; num < 2; num++) {
+ // Write 110KB (11 values, each 10K)
+ for (int i = 0; i < 11; i++) {
+ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+ key_idx++;
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ }
+ ASSERT_LT(TotalSize(), 110000 * 6 * 0.9);
+
+ // When we start the compaction up to (2 4 8), the latest compaction
+ // output is not compressed.
+ for (int num = 0; num < 8; num++) {
+ // Write 110KB (11 values, each 10K)
+ for (int i = 0; i < 11; i++) {
+ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+ key_idx++;
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ }
+ ASSERT_GT(TotalSize(), 110000 * 11 * 0.8 + 110000 * 2);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionCompressRatio2) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = num_levels_;
+ options.compaction_options_universal.compression_size_percent = 95;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // When we start the compaction up to (2 4 8), the latest compaction
+ // output is still compressed, given the configured
+ // compression_size_percent of 95.
+ for (int num = 0; num < 14; num++) {
+ // Write 120KB (12 values, each 10K)
+ for (int i = 0; i < 12; i++) {
+ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+ key_idx++;
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ }
+ ASSERT_LT(TotalSize(), 120000U * 12 * 0.82 + 120000 * 2);
+}
+
+#ifndef ROCKSDB_VALGRIND_RUN
+// Test that checks trivial move in universal compaction
+TEST_P(DBTestUniversalCompaction, UniversalCompactionTrivialMoveTest1) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial", [&](void* arg) {
+ non_trivial_move++;
+ ASSERT_TRUE(arg != nullptr);
+ int output_level = *(static_cast<int*>(arg));
+ ASSERT_EQ(output_level, 0);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.allow_trivial_move = true;
+ options.num_levels = 2;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_background_compactions = 1;
+ options.target_file_size_base = 32 * 1024;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ Random rnd(301);
+ int num_keys = 250000;
+ for (int i = 0; i < num_keys; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ std::vector<std::string> values;
+
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_GT(trivial_move, 0);
+ ASSERT_GT(non_trivial_move, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// Test that checks trivial move in universal compaction
+TEST_P(DBTestUniversalCompaction, UniversalCompactionTrivialMoveTest2) {
+ int32_t trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial", [&](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ int output_level = *(static_cast<int*>(arg));
+ ASSERT_EQ(output_level, 0);
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.allow_trivial_move = true;
+ options.num_levels = 15;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.level0_file_num_compaction_trigger = 8;
+ options.max_background_compactions = 2;
+ options.target_file_size_base = 64 * 1024;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ Random rnd(301);
+ int num_keys = 500000;
+ for (int i = 0; i < num_keys; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ std::vector<std::string> values;
+
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_GT(trivial_move, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif // ROCKSDB_VALGRIND_RUN
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionFourPaths) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 300 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_2", 300 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_3", 500 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_4", 1024 * 1024 * 1024);
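+ // The memtable is sized so that each GenerateNewFile() call below should
+ // produce a single ~100KB L0 file.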
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.size_ratio = 5;
+ options.write_buffer_size = 111 << 10; // 114KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 1;
+
+ std::vector<std::string> filenames;
+ env_->GetChildren(options.db_paths[1].path, &filenames);
+ // Delete archival files.
+ for (size_t i = 0; i < filenames.size(); ++i) {
+ env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]);
+ }
+ env_->DeleteDir(options.db_paths[1].path);
+ Reopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // First three 110KB files are not going to second path.
+ // After that, (100K, 200K)
+ for (int num = 0; num < 3; num++) {
+ GenerateNewFile(&rnd, &key_idx);
+ }
+
+ // Another 110KB file triggers a compaction; the ~400K output goes to the
+ // third path (db_paths[2]).
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+
+ // (1, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1,1,4) -> (2, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // (1, 2, 4) -> (3, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // (1, 3, 4) -> (8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+
+ // (1, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 1, 8) -> (2, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+ // (1, 2, 8) -> (3, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // (1, 3, 8) -> (4, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+
+ // (1, 4, 8) -> (5, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Reopen(options);
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Destroy(options);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionCFPathUse) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 300 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_2", 300 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_3", 500 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_4", 1024 * 1024 * 1024);
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.size_ratio = 10;
+ options.write_buffer_size = 111 << 10; // 114KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 1;
+
+ std::vector<Options> option_vector;
+ option_vector.emplace_back(options);
+ ColumnFamilyOptions cf_opt1(options), cf_opt2(options);
+ // Configure CF1 specific paths.
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1", 300 * 1024);
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_2", 300 * 1024);
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_3", 500 * 1024);
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_4", 1024 * 1024 * 1024);
+ option_vector.emplace_back(DBOptions(options), cf_opt1);
+ CreateColumnFamilies({"one"}, option_vector[1]);
+
+ // Configure CF2 specific paths.
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2", 300 * 1024);
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_2", 300 * 1024);
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_3", 500 * 1024);
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_4", 1024 * 1024 * 1024);
+ option_vector.emplace_back(DBOptions(options), cf_opt2);
+ CreateColumnFamilies({"two"}, option_vector[2]);
+
+ ReopenWithColumnFamilies({"default", "one", "two"}, option_vector);
+
+ Random rnd(301);
+ int key_idx = 0;
+ int key_idx1 = 0;
+ int key_idx2 = 0;
+
+ auto generate_file = [&]() {
+ GenerateNewFile(0, &rnd, &key_idx);
+ GenerateNewFile(1, &rnd, &key_idx1);
+ GenerateNewFile(2, &rnd, &key_idx2);
+ };
+
+ auto check_sstfilecount = [&](int path_id, int expected) {
+ ASSERT_EQ(expected, GetSstFileCount(options.db_paths[path_id].path));
+ ASSERT_EQ(expected, GetSstFileCount(cf_opt1.cf_paths[path_id].path));
+ ASSERT_EQ(expected, GetSstFileCount(cf_opt2.cf_paths[path_id].path));
+ };
+
+ auto check_getvalues = [&]() {
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(0, Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ for (int i = 0; i < key_idx1; i++) {
+ auto v = Get(1, Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ for (int i = 0; i < key_idx2; i++) {
+ auto v = Get(2, Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+ };
+
+ // First three 110KB files are not going to second path.
+ // After that, (100K, 200K)
+ for (int num = 0; num < 3; num++) {
+ generate_file();
+ }
+
+ // Another 110KB file triggers a compaction; the ~400K output goes to the
+ // third configured path (index 2).
+ generate_file();
+ check_sstfilecount(2, 1);
+
+ // (1, 4)
+ generate_file();
+ check_sstfilecount(2, 1);
+ check_sstfilecount(0, 1);
+
+ // (1,1,4) -> (2, 4)
+ generate_file();
+ check_sstfilecount(2, 1);
+ check_sstfilecount(1, 1);
+ check_sstfilecount(0, 0);
+
+ // (1, 2, 4) -> (3, 4)
+ generate_file();
+ check_sstfilecount(2, 1);
+ check_sstfilecount(1, 1);
+ check_sstfilecount(0, 0);
+
+ // (1, 3, 4) -> (8)
+ generate_file();
+ check_sstfilecount(3, 1);
+
+ // (1, 8)
+ generate_file();
+ check_sstfilecount(3, 1);
+ check_sstfilecount(0, 1);
+
+ // (1, 1, 8) -> (2, 8)
+ generate_file();
+ check_sstfilecount(3, 1);
+ check_sstfilecount(1, 1);
+
+ // (1, 2, 8) -> (3, 8)
+ generate_file();
+ check_sstfilecount(3, 1);
+ check_sstfilecount(1, 1);
+ check_sstfilecount(0, 0);
+
+ // (1, 3, 8) -> (4, 8)
+ generate_file();
+ check_sstfilecount(2, 1);
+ check_sstfilecount(3, 1);
+
+ // (1, 4, 8) -> (5, 8)
+ generate_file();
+ check_sstfilecount(3, 1);
+ check_sstfilecount(2, 1);
+ check_sstfilecount(0, 0);
+
+ check_getvalues();
+
+ ReopenWithColumnFamilies({"default", "one", "two"}, option_vector);
+
+ check_getvalues();
+
+ Destroy(options, true);
+}
+
+TEST_P(DBTestUniversalCompaction, IncreaseUniversalCompactionNumLevels) {
+ std::function<void(int)> verify_func = [&](int num_keys_in_db) {
+ std::string keys_in_db;
+ Iterator* iter = dbfull()->NewIterator(ReadOptions(), handles_[1]);
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ keys_in_db.append(iter->key().ToString());
+ keys_in_db.push_back(',');
+ }
+ delete iter;
+
+ std::string expected_keys;
+ for (int i = 0; i <= num_keys_in_db; i++) {
+ expected_keys.append(Key(i));
+ expected_keys.push_back(',');
+ }
+
+ ASSERT_EQ(keys_in_db, expected_keys);
+ };
+
+ Random rnd(301);
+ int max_key1 = 200;
+ int max_key2 = 600;
+ int max_key3 = 800;
+ const int KNumKeysPerFile = 10;
+
+ // Stage 1: open a DB with universal compaction, num_levels=1
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ options.write_buffer_size = 200 << 10; // 200KB
+ options.level0_file_num_compaction_trigger = 3;
+ options.memtable_factory.reset(new SpecialSkipListFactory(KNumKeysPerFile));
+ options = CurrentOptions(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ for (int i = 0; i <= max_key1; i++) {
+ // each value is 10K
+ ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ dbfull()->TEST_WaitForCompact();
+ }
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+
+ // Stage 2: reopen with universal compaction, num_levels=4
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 4;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ verify_func(max_key1);
+
+ // Insert more keys
+ for (int i = max_key1 + 1; i <= max_key2; i++) {
+ // each value is 10K
+ ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ dbfull()->TEST_WaitForCompact();
+ }
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+
+ verify_func(max_key2);
+ // Compaction to non-L0 has happened.
+ ASSERT_GT(NumTableFilesAtLevel(options.num_levels - 1, 1), 0);
+
+ // Stage 3: Compact everything back down to L0, then revert to num_levels=1.
+ options.num_levels = 4;
+ options.target_file_size_base = INT_MAX;
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ // Compact all to level 0
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 0;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr);
+ // Need to restart it once to remove higher level records in manifest.
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ // Final reopen
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ // Insert more keys
+ for (int i = max_key2 + 1; i <= max_key3; i++) {
+ // each value is 10K
+ ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ dbfull()->TEST_WaitForCompact();
+ }
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+ verify_func(max_key3);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionSecondPathRatio) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 500 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_2", 1024 * 1024 * 1024);
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.size_ratio = 5;
+ options.write_buffer_size = 111 << 10; // 114KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 1;
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+
+ std::vector<std::string> filenames;
+ env_->GetChildren(options.db_paths[1].path, &filenames);
+ // Delete archival files.
+ for (size_t i = 0; i < filenames.size(); ++i) {
+ env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]);
+ }
+ env_->DeleteDir(options.db_paths[1].path);
+ Reopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // First three 110KB files are not going to second path.
+ // After that, (100K, 200K)
+ for (int num = 0; num < 3; num++) {
+ GenerateNewFile(&rnd, &key_idx);
+ }
+
+ // Another 110KB file triggers a compaction; the ~400K output goes to the
+ // second path.
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+ // (1, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1,1,4) -> (2, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 2, 4) -> (3, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // (1, 3, 4) -> (8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // (1, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 1, 8) -> (2, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 2, 8) -> (3, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // (1, 3, 8) -> (4, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // (1, 4, 8) -> (5, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Reopen(options);
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Destroy(options);
+}
+
+TEST_P(DBTestUniversalCompaction, ConcurrentBottomPriLowPriCompactions) {
+ if (num_levels_ == 1) {
+ // for single-level universal, everything's bottom level so nothing should
+ // be executed in bottom-pri thread pool.
+ return;
+ }
+ const int kNumFilesTrigger = 3;
+ Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM);
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = num_levels_;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = kNumFilesTrigger;
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ DestroyAndReopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {// wait for the full compaction to be picked before adding files intended
+ // for the second one.
+ {"DBImpl::BackgroundCompaction:ForwardToBottomPriPool",
+ "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0"},
+ // the full (bottom-pri) compaction waits until a partial (low-pri)
+ // compaction has started to verify they can run in parallel.
+ {"DBImpl::BackgroundCompaction:NonTrivial",
+ "DBImpl::BGWorkBottomCompaction"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int i = 0; i < 2; ++i) {
+ for (int num = 0; num < kNumFilesTrigger; num++) {
+ int key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx, true /* no_wait */);
+ // Pass no_wait above because otherwise GenerateNewFile() waits for both
+ // flush and compaction. We don't want to wait for compaction here because
+ // the full compaction is intentionally blocked while more files are flushed.
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ if (i == 0) {
+ TEST_SYNC_POINT(
+ "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0");
+ }
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ // First compaction should output to bottom level. Second should output to L0
+ // since older L0 files pending compaction prevent it from being placed lower.
+ ASSERT_EQ(NumSortedRuns(), 2);
+ ASSERT_GT(NumTableFilesAtLevel(0), 0);
+ ASSERT_GT(NumTableFilesAtLevel(num_levels_ - 1), 0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM);
+}
+
+TEST_P(DBTestUniversalCompaction, RecalculateScoreAfterPicking) {
+ // Regression test for extra compactions scheduled. Once enough compactions
+ // have been scheduled to bring the score below one, we should stop
+ // scheduling more; otherwise, other CFs/DBs may be delayed unnecessarily.
+ const int kNumFilesTrigger = 8;
+ Options options = CurrentOptions();
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_options_universal.max_merge_width = kNumFilesTrigger / 2;
+ options.compaction_options_universal.max_size_amplification_percent =
+ static_cast<unsigned int>(-1);
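+ // UINT_MAX effectively disables size-amplification-triggered compactions,
+ // leaving only the size-ratio / file-count triggers for this test.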
+ options.compaction_style = kCompactionStyleUniversal;
+ options.level0_file_num_compaction_trigger = kNumFilesTrigger;
+ options.num_levels = num_levels_;
+ Reopen(options);
+
+ std::atomic<int> num_compactions_attempted(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:Start",
+ [&](void* /*arg*/) { ++num_compactions_attempted; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int num = 0; num < kNumFilesTrigger; num++) {
+ ASSERT_EQ(NumSortedRuns(), num);
+ int key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ }
+ dbfull()->TEST_WaitForCompact();
+ // Compacting the first four files was enough to bring the score below one so
+ // there's no need to schedule any more compactions.
+ ASSERT_EQ(1, num_compactions_attempted);
+ ASSERT_EQ(NumSortedRuns(), 5);
+}
+
+TEST_P(DBTestUniversalCompaction, FinalSortedRunCompactFilesConflict) {
+ // Regression test for conflict between:
+ // (1) Running CompactFiles including file in the final sorted run; and
+ // (2) Picking universal size-amp-triggered compaction, which always includes
+ // the final sorted run.
+ if (exclusive_manual_compaction_) {
+ return;
+ }
+
+ Options opts = CurrentOptions();
+ opts.compaction_style = kCompactionStyleUniversal;
+ opts.compaction_options_universal.max_size_amplification_percent = 50;
+ opts.compaction_options_universal.min_merge_width = 2;
+ opts.compression = kNoCompression;
+ opts.level0_file_num_compaction_trigger = 2;
+ opts.max_background_compactions = 2;
+ opts.num_levels = num_levels_;
+ Reopen(opts);
+
+ // make sure compaction jobs can be parallelized
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ Put("key", "val");
+ Flush();
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(NumTableFilesAtLevel(num_levels_ - 1), 1);
+ ColumnFamilyMetaData cf_meta;
+ ColumnFamilyHandle* default_cfh = db_->DefaultColumnFamily();
+ dbfull()->GetColumnFamilyMetaData(default_cfh, &cf_meta);
+ ASSERT_EQ(1, cf_meta.levels[num_levels_ - 1].files.size());
+ std::string first_sst_filename =
+ cf_meta.levels[num_levels_ - 1].files[0].name;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"CompactFilesImpl:0",
+ "DBTestUniversalCompaction:FinalSortedRunCompactFilesConflict:0"},
+ {"DBImpl::BackgroundCompaction():AfterPickCompaction",
+ "CompactFilesImpl:1"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ port::Thread compact_files_thread([&]() {
+ ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), default_cfh,
+ {first_sst_filename}, num_levels_ - 1));
+ });
+
+ TEST_SYNC_POINT(
+ "DBTestUniversalCompaction:FinalSortedRunCompactFilesConflict:0");
+ for (int i = 0; i < 2; ++i) {
+ Put("key", "val");
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ compact_files_thread.join();
+}
+
+INSTANTIATE_TEST_CASE_P(NumLevels, DBTestUniversalCompaction,
+ ::testing::Combine(::testing::Values(1, 3, 5),
+ ::testing::Bool()));
+
+class DBTestUniversalManualCompactionOutputPathId
+ : public DBTestUniversalCompactionBase {
+ public:
+ DBTestUniversalManualCompactionOutputPathId() :
+ DBTestUniversalCompactionBase(
+ "/db_universal_compaction_manual_pid_test") {}
+};
+
+TEST_P(DBTestUniversalManualCompactionOutputPathId,
+ ManualCompactionOutputPathId) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.db_paths.emplace_back(dbname_, 1000000000);
+ options.db_paths.emplace_back(dbname_ + "_2", 1000000000);
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = num_levels_;
+ options.target_file_size_base = 1 << 30; // Big size
+ options.level0_file_num_compaction_trigger = 10;
+ Destroy(options);
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ MakeTables(3, "p", "q", 1);
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(2, TotalLiveFiles(1));
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[1].path));
+
+  // Full compaction to DB path 1
+ CompactRangeOptions compact_options;
+ compact_options.target_path_id = 1;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr);
+ ASSERT_EQ(1, TotalLiveFiles(1));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+ ASSERT_EQ(1, TotalLiveFiles(1));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+ MakeTables(1, "p", "q", 1);
+ ASSERT_EQ(2, TotalLiveFiles(1));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+ ASSERT_EQ(2, TotalLiveFiles(1));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+ // Full compaction to DB path 0
+ compact_options.target_path_id = 0;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr);
+ ASSERT_EQ(1, TotalLiveFiles(1));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[1].path));
+
+ // Fail when compacting to an invalid path ID
+ compact_options.target_path_id = 2;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_TRUE(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)
+ .IsInvalidArgument());
+}
+
+INSTANTIATE_TEST_CASE_P(OutputPathId,
+ DBTestUniversalManualCompactionOutputPathId,
+ ::testing::Combine(::testing::Values(1, 8),
+ ::testing::Bool()));
+
+TEST_F(DBTestUniversalCompaction2, BasicL0toL1) {
+ const int kNumKeys = 3000;
+ const int kWindowSize = 100;
+ const int kNumDelsTrigger = 90;
+
+ Options opts = CurrentOptions();
+ opts.table_properties_collector_factories.emplace_back(
+ NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
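+  // The deletion collector marks an SST for compaction when at least
+  // kNumDelsTrigger deletions fall within any sliding window of kWindowSize
+  // entries, which is what drives the extra compactions in this test.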
+ opts.compaction_style = kCompactionStyleUniversal;
+ opts.level0_file_num_compaction_trigger = 2;
+ opts.compression = kNoCompression;
+ opts.compaction_options_universal.size_ratio = 10;
+ opts.compaction_options_universal.min_merge_width = 2;
+ opts.compaction_options_universal.max_size_amplification_percent = 200;
+ Reopen(opts);
+
+ // add an L1 file to prevent tombstones from dropping due to obsolescence
+ // during flush
+ int i;
+ for (i = 0; i < 2000; ++i) {
+ Put(Key(i), "val");
+ }
+ Flush();
+ // MoveFilesToLevel(6);
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ for (i = 1999; i < kNumKeys; ++i) {
+ if (i >= kNumKeys - kWindowSize &&
+ i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+ Delete(Key(i));
+ } else {
+ Put(Key(i), "val");
+ }
+ }
+ Flush();
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(6), 0);
+}
+
+TEST_F(DBTestUniversalCompaction2, SingleLevel) {
+ const int kNumKeys = 3000;
+ const int kWindowSize = 100;
+ const int kNumDelsTrigger = 90;
+
+ Options opts = CurrentOptions();
+ opts.table_properties_collector_factories.emplace_back(
+ NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+ opts.compaction_style = kCompactionStyleUniversal;
+ opts.level0_file_num_compaction_trigger = 2;
+ opts.compression = kNoCompression;
+ opts.num_levels = 1;
+ opts.compaction_options_universal.size_ratio = 10;
+ opts.compaction_options_universal.min_merge_width = 2;
+ opts.compaction_options_universal.max_size_amplification_percent = 200;
+ Reopen(opts);
+
+ // add an L1 file to prevent tombstones from dropping due to obsolescence
+ // during flush
+ int i;
+ for (i = 0; i < 2000; ++i) {
+ Put(Key(i), "val");
+ }
+ Flush();
+
+ for (i = 1999; i < kNumKeys; ++i) {
+ if (i >= kNumKeys - kWindowSize &&
+ i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+ Delete(Key(i));
+ } else {
+ Put(Key(i), "val");
+ }
+ }
+ Flush();
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+}
+
+TEST_F(DBTestUniversalCompaction2, MultipleLevels) {
+ const int kWindowSize = 100;
+ const int kNumDelsTrigger = 90;
+
+ Options opts = CurrentOptions();
+ opts.table_properties_collector_factories.emplace_back(
+ NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+ opts.compaction_style = kCompactionStyleUniversal;
+ opts.level0_file_num_compaction_trigger = 4;
+ opts.compression = kNoCompression;
+ opts.compaction_options_universal.size_ratio = 10;
+ opts.compaction_options_universal.min_merge_width = 2;
+ opts.compaction_options_universal.max_size_amplification_percent = 200;
+ Reopen(opts);
+
+ // add an L1 file to prevent tombstones from dropping due to obsolescence
+ // during flush
+ int i;
+ for (i = 0; i < 500; ++i) {
+ Put(Key(i), "val");
+ }
+ Flush();
+ for (i = 500; i < 1000; ++i) {
+ Put(Key(i), "val");
+ }
+ Flush();
+ for (i = 1000; i < 1500; ++i) {
+ Put(Key(i), "val");
+ }
+ Flush();
+ for (i = 1500; i < 2000; ++i) {
+ Put(Key(i), "val");
+ }
+ Flush();
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(6), 0);
+
+ for (i = 1999; i < 2333; ++i) {
+ Put(Key(i), "val");
+ }
+ Flush();
+ for (i = 2333; i < 2666; ++i) {
+ Put(Key(i), "val");
+ }
+ Flush();
+ for (i = 2666; i < 2999; ++i) {
+ Put(Key(i), "val");
+ }
+ Flush();
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(6), 0);
+ ASSERT_GT(NumTableFilesAtLevel(5), 0);
+
+ for (i = 1900; i < 2100; ++i) {
+ Delete(Key(i));
+ }
+ Flush();
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+ ASSERT_EQ(0, NumTableFilesAtLevel(2));
+ ASSERT_EQ(0, NumTableFilesAtLevel(3));
+ ASSERT_EQ(0, NumTableFilesAtLevel(4));
+ ASSERT_EQ(0, NumTableFilesAtLevel(5));
+ ASSERT_GT(NumTableFilesAtLevel(6), 0);
+}
+
+TEST_F(DBTestUniversalCompaction2, OverlappingL0) {
+ const int kWindowSize = 100;
+ const int kNumDelsTrigger = 90;
+
+ Options opts = CurrentOptions();
+ opts.table_properties_collector_factories.emplace_back(
+ NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+ opts.compaction_style = kCompactionStyleUniversal;
+ opts.level0_file_num_compaction_trigger = 5;
+ opts.compression = kNoCompression;
+ opts.compaction_options_universal.size_ratio = 10;
+ opts.compaction_options_universal.min_merge_width = 2;
+ opts.compaction_options_universal.max_size_amplification_percent = 200;
+ Reopen(opts);
+
+ // add an L1 file to prevent tombstones from dropping due to obsolescence
+ // during flush
+ int i;
+ for (i = 0; i < 2000; ++i) {
+ Put(Key(i), "val");
+ }
+ Flush();
+ for (i = 2000; i < 3000; ++i) {
+ Put(Key(i), "val");
+ }
+ Flush();
+ for (i = 3500; i < 4000; ++i) {
+ Put(Key(i), "val");
+ }
+ Flush();
+ for (i = 2900; i < 3100; ++i) {
+ Delete(Key(i));
+ }
+ Flush();
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(6), 0);
+}
+
+TEST_F(DBTestUniversalCompaction2, IngestBehind) {
+ const int kNumKeys = 3000;
+ const int kWindowSize = 100;
+ const int kNumDelsTrigger = 90;
+
+ Options opts = CurrentOptions();
+ opts.table_properties_collector_factories.emplace_back(
+ NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+ opts.compaction_style = kCompactionStyleUniversal;
+ opts.level0_file_num_compaction_trigger = 2;
+ opts.compression = kNoCompression;
+ opts.allow_ingest_behind = true;
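+  // With allow_ingest_behind, the bottommost level is reserved for files
+  // ingested behind the DB, so compactions are expected to stop at the
+  // second-to-last level (checked at the end of this test).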
+ opts.compaction_options_universal.size_ratio = 10;
+ opts.compaction_options_universal.min_merge_width = 2;
+ opts.compaction_options_universal.max_size_amplification_percent = 200;
+ Reopen(opts);
+
+ // add an L1 file to prevent tombstones from dropping due to obsolescence
+ // during flush
+ int i;
+ for (i = 0; i < 2000; ++i) {
+ Put(Key(i), "val");
+ }
+ Flush();
+ // MoveFilesToLevel(6);
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ for (i = 1999; i < kNumKeys; ++i) {
+ if (i >= kNumKeys - kWindowSize &&
+ i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+ Delete(Key(i));
+ } else {
+ Put(Key(i), "val");
+ }
+ }
+ Flush();
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(6));
+ ASSERT_GT(NumTableFilesAtLevel(5), 0);
+}
+
+TEST_F(DBTestUniversalCompaction2, PeriodicCompactionDefault) {
+ Options options;
+ options.compaction_style = kCompactionStyleUniversal;
+
+ KeepFilterFactory* filter = new KeepFilterFactory(true);
+ options.compaction_filter_factory.reset(filter);
+ Reopen(options);
+ ASSERT_EQ(30 * 24 * 60 * 60,
+ dbfull()->GetOptions().periodic_compaction_seconds);
+
+ KeepFilter df;
+ options.compaction_filter_factory.reset();
+ options.compaction_filter = &df;
+ Reopen(options);
+ ASSERT_EQ(30 * 24 * 60 * 60,
+ dbfull()->GetOptions().periodic_compaction_seconds);
+
+ options.ttl = 60 * 24 * 60 * 60;
+ options.compaction_filter = nullptr;
+ Reopen(options);
+ ASSERT_EQ(60 * 24 * 60 * 60,
+ dbfull()->GetOptions().periodic_compaction_seconds);
+}
+
+TEST_F(DBTestUniversalCompaction2, PeriodicCompaction) {
+ Options opts = CurrentOptions();
+ opts.env = env_;
+ opts.compaction_style = kCompactionStyleUniversal;
+ opts.level0_file_num_compaction_trigger = 10;
+ opts.max_open_files = -1;
+ opts.compaction_options_universal.size_ratio = 10;
+ opts.compaction_options_universal.min_merge_width = 2;
+ opts.compaction_options_universal.max_size_amplification_percent = 200;
+ opts.periodic_compaction_seconds = 48 * 60 * 60; // 2 days
+ opts.num_levels = 5;
+ env_->addon_time_.store(0);
+ Reopen(opts);
+
+ int periodic_compactions = 0;
+ int start_level = -1;
+ int output_level = -1;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "UniversalCompactionPicker::PickPeriodicCompaction:Return",
+ [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ ASSERT_TRUE(arg != nullptr);
+ ASSERT_TRUE(compaction->compaction_reason() ==
+ CompactionReason::kPeriodicCompaction);
+ start_level = compaction->start_level();
+ output_level = compaction->output_level();
+ periodic_compactions++;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Case 1: Oldest flushed file exceeds the periodic compaction threshold.
+ ASSERT_OK(Put("foo", "bar"));
+ Flush();
+ ASSERT_EQ(0, periodic_compactions);
+  // Move clock forward so that the flushed file qualifies for periodic
+  // compaction.
+ env_->addon_time_.store(48 * 60 * 60 + 100);
+
+  // Another flush would trigger compaction of the oldest file.
+ ASSERT_OK(Put("foo", "bar2"));
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ(1, periodic_compactions);
+ ASSERT_EQ(0, start_level);
+ ASSERT_EQ(4, output_level);
+
+  // Case 2: Oldest compacted file exceeds the periodic compaction threshold.
+ periodic_compactions = 0;
+  // A flush doesn't trigger a periodic compaction when the threshold is not
+  // hit
+ ASSERT_OK(Put("foo", "bar2"));
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(0, periodic_compactions);
+
+ // After periodic compaction threshold hits, a flush will trigger
+ // a compaction
+ ASSERT_OK(Put("foo", "bar2"));
+ env_->addon_time_.fetch_add(48 * 60 * 60 + 100);
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(1, periodic_compactions);
+ ASSERT_EQ(0, start_level);
+ ASSERT_EQ(4, output_level);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !defined(ROCKSDB_LITE)
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+#else
+ (void) argc;
+ (void) argv;
+ return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_wal_test.cc b/src/rocksdb/db/db_wal_test.cc
new file mode 100644
index 000000000..ef81de803
--- /dev/null
+++ b/src/rocksdb/db/db_wal_test.cc
@@ -0,0 +1,1586 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "env/composite_env_wrapper.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "test_util/fault_injection_test_env.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+class DBWALTest : public DBTestBase {
+ public:
+ DBWALTest() : DBTestBase("/db_wal_test") {}
+
+#if defined(ROCKSDB_PLATFORM_POSIX)
+ uint64_t GetAllocatedFileSize(std::string file_name) {
+ struct stat sbuf;
+ int err = stat(file_name.c_str(), &sbuf);
+ assert(err == 0);
+ return sbuf.st_blocks * 512;
+ }
+#endif
+};
+
+// A SpecialEnv enriched to give more insight about deleted files
+class EnrichedSpecialEnv : public SpecialEnv {
+ public:
+ explicit EnrichedSpecialEnv(Env* base) : SpecialEnv(base) {}
+ Status NewSequentialFile(const std::string& f,
+ std::unique_ptr<SequentialFile>* r,
+ const EnvOptions& soptions) override {
+ InstrumentedMutexLock l(&env_mutex_);
+ if (f == skipped_wal) {
+ deleted_wal_reopened = true;
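+      // If a WAL with a larger number was already deleted while this earlier
+      // one survived, recovery is observing a gap in the log sequence.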
+ if (IsWAL(f) && largetest_deleted_wal.size() != 0 &&
+ f.compare(largetest_deleted_wal) <= 0) {
+ gap_in_wals = true;
+ }
+ }
+ return SpecialEnv::NewSequentialFile(f, r, soptions);
+ }
+ Status DeleteFile(const std::string& fname) override {
+ if (IsWAL(fname)) {
+ deleted_wal_cnt++;
+ InstrumentedMutexLock l(&env_mutex_);
+ // If this is the first WAL, remember its name and skip deleting it. We
+ // remember its name partly because the application might attempt to
+ // delete the file again.
+ if (skipped_wal.size() != 0 && skipped_wal != fname) {
+ if (largetest_deleted_wal.size() == 0 ||
+ largetest_deleted_wal.compare(fname) < 0) {
+ largetest_deleted_wal = fname;
+ }
+ } else {
+ skipped_wal = fname;
+ return Status::OK();
+ }
+ }
+ return SpecialEnv::DeleteFile(fname);
+ }
+ bool IsWAL(const std::string& fname) {
+ // printf("iswal %s\n", fname.c_str());
+ return fname.compare(fname.size() - 3, 3, "log") == 0;
+ }
+
+ InstrumentedMutex env_mutex_;
+ // the wal whose actual delete was skipped by the env
+ std::string skipped_wal = "";
+ // the largest WAL that was requested to be deleted
+ std::string largetest_deleted_wal = "";
+ // number of WALs that were successfully deleted
+ std::atomic<size_t> deleted_wal_cnt = {0};
+ // the WAL whose delete from fs was skipped is reopened during recovery
+ std::atomic<bool> deleted_wal_reopened = {false};
+ // whether a gap in the WALs was detected during recovery
+ std::atomic<bool> gap_in_wals = {false};
+};
+
+class DBWALTestWithEnrichedEnv : public DBTestBase {
+ public:
+ DBWALTestWithEnrichedEnv() : DBTestBase("/db_wal_test") {
+ enriched_env_ = new EnrichedSpecialEnv(env_->target());
+ auto options = CurrentOptions();
+ options.env = enriched_env_;
+ options.allow_2pc = true;
+ Reopen(options);
+ delete env_;
+ // to be deleted by the parent class
+ env_ = enriched_env_;
+ }
+
+ protected:
+ EnrichedSpecialEnv* enriched_env_;
+};
+
+// Test that recovery successfully avoids gaps between the logs. One known
+// scenario that could cause such a gap is the application issuing WAL
+// deletions out of order. For the sake of simplicity in the test, here we
+// create the gap by manipulating the env to skip deletion of the first WAL but
+// not the ones after it.
+TEST_F(DBWALTestWithEnrichedEnv, SkipDeletedWALs) {
+ auto options = last_options_;
+ // To cause frequent WAL deletion
+ options.write_buffer_size = 128;
+ Reopen(options);
+
+ WriteOptions writeOpt = WriteOptions();
+ for (int i = 0; i < 128 * 5; i++) {
+ ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1"));
+ }
+ FlushOptions fo;
+ fo.wait = true;
+ ASSERT_OK(db_->Flush(fo));
+
+ // some wals are deleted
+ ASSERT_NE(0, enriched_env_->deleted_wal_cnt);
+ // but not the first one
+ ASSERT_NE(0, enriched_env_->skipped_wal.size());
+
+ // Test that the WAL that was not deleted will be skipped during recovery
+ options = last_options_;
+ Reopen(options);
+ ASSERT_FALSE(enriched_env_->deleted_wal_reopened);
+ ASSERT_FALSE(enriched_env_->gap_in_wals);
+}
+
+TEST_F(DBWALTest, WAL) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v1", Get(1, "bar"));
+
+ writeOpt.disableWAL = false;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2"));
+ writeOpt.disableWAL = true;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    // Both values should be present.
+ ASSERT_EQ("v2", Get(1, "bar"));
+ ASSERT_EQ("v2", Get(1, "foo"));
+
+ writeOpt.disableWAL = true;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3"));
+ writeOpt.disableWAL = false;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    // Again, both values should be present.
+ ASSERT_EQ("v3", Get(1, "foo"));
+ ASSERT_EQ("v3", Get(1, "bar"));
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RollLog) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Put(1, "baz", "v5"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ for (int i = 0; i < 10; i++) {
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ }
+ ASSERT_OK(Put(1, "foo", "v4"));
+ for (int i = 0; i < 10; i++) {
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ }
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, SyncWALNotBlockWrite) {
+ Options options = CurrentOptions();
+ options.max_write_buffer_number = 4;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo1", "bar1"));
+ ASSERT_OK(Put("foo5", "bar5"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"WritableFileWriter::SyncWithoutFlush:1",
+ "DBWALTest::SyncWALNotBlockWrite:1"},
+ {"DBWALTest::SyncWALNotBlockWrite:2",
+ "WritableFileWriter::SyncWithoutFlush:2"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
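+  // The dependencies above park SyncWAL inside SyncWithoutFlush: the test
+  // proceeds only once the sync has started, and the sync finishes only after
+  // the test has done its writes and flush, showing neither blocks on it.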
+
+ ROCKSDB_NAMESPACE::port::Thread thread([&]() { ASSERT_OK(db_->SyncWAL()); });
+
+ TEST_SYNC_POINT("DBWALTest::SyncWALNotBlockWrite:1");
+ ASSERT_OK(Put("foo2", "bar2"));
+ ASSERT_OK(Put("foo3", "bar3"));
+ FlushOptions fo;
+ fo.wait = false;
+ ASSERT_OK(db_->Flush(fo));
+ ASSERT_OK(Put("foo4", "bar4"));
+
+ TEST_SYNC_POINT("DBWALTest::SyncWALNotBlockWrite:2");
+
+ thread.join();
+
+ ASSERT_EQ(Get("foo1"), "bar1");
+ ASSERT_EQ(Get("foo2"), "bar2");
+ ASSERT_EQ(Get("foo3"), "bar3");
+ ASSERT_EQ(Get("foo4"), "bar4");
+ ASSERT_EQ(Get("foo5"), "bar5");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBWALTest, SyncWALNotWaitWrite) {
+ ASSERT_OK(Put("foo1", "bar1"));
+ ASSERT_OK(Put("foo3", "bar3"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"SpecialEnv::WalFile::Append:1", "DBWALTest::SyncWALNotWaitWrite:1"},
+ {"DBWALTest::SyncWALNotWaitWrite:2", "SpecialEnv::WalFile::Append:2"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread thread(
+ [&]() { ASSERT_OK(Put("foo2", "bar2")); });
+ // Moving this to SyncWAL before the actual fsync
+ // TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:1");
+ ASSERT_OK(db_->SyncWAL());
+ // Moving this to SyncWAL after actual fsync
+ // TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2");
+
+ thread.join();
+
+ ASSERT_EQ(Get("foo1"), "bar1");
+ ASSERT_EQ(Get("foo2"), "bar2");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBWALTest, Recover) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Put(1, "baz", "v5"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("v1", Get(1, "foo"));
+
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v5", Get(1, "baz"));
+ ASSERT_OK(Put(1, "bar", "v2"));
+ ASSERT_OK(Put(1, "foo", "v3"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("v3", Get(1, "foo"));
+ ASSERT_OK(Put(1, "foo", "v4"));
+ ASSERT_EQ("v4", Get(1, "foo"));
+ ASSERT_EQ("v2", Get(1, "bar"));
+ ASSERT_EQ("v5", Get(1, "baz"));
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RecoverWithTableHandle) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.avoid_flush_during_recovery = false;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Put(1, "bar", "v2"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "foo", "v3"));
+ ASSERT_OK(Put(1, "bar", "v4"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "big", std::string(100, 'a')));
+
+ options = CurrentOptions();
+ const int kSmallMaxOpenFiles = 13;
+ if (option_config_ == kDBLogDir) {
+ // Use this option to check not preloading files
+ // Set the max open files to be small enough so no preload will
+ // happen.
+ options.max_open_files = kSmallMaxOpenFiles;
+      // RocksDB sanitizes max_open_files to at least 20. Modify it back.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = static_cast<int*>(arg);
+ *max_open_files = kSmallMaxOpenFiles;
+ });
+
+ } else if (option_config_ == kWalDirAndMmapReads) {
+ // Use this option to check always loading all files.
+ options.max_open_files = 100;
+ } else {
+ options.max_open_files = -1;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ std::vector<std::vector<FileMetaData>> files;
+ dbfull()->TEST_GetFilesMetaData(handles_[1], &files);
+ size_t total_files = 0;
+ for (const auto& level : files) {
+ total_files += level.size();
+ }
+ ASSERT_EQ(total_files, 3);
+ for (const auto& level : files) {
+ for (const auto& file : level) {
+ if (options.max_open_files == kSmallMaxOpenFiles) {
+ ASSERT_TRUE(file.table_reader_handle == nullptr);
+ } else {
+ ASSERT_TRUE(file.table_reader_handle != nullptr);
+ }
+ }
+ }
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, IgnoreRecoveredLog) {
+ std::string backup_logs = dbname_ + "/backup_logs";
+
+ do {
+ // delete old files in backup_logs directory
+ env_->CreateDirIfMissing(backup_logs);
+ std::vector<std::string> old_files;
+ env_->GetChildren(backup_logs, &old_files);
+ for (auto& file : old_files) {
+ if (file != "." && file != "..") {
+ env_->DeleteFile(backup_logs + "/" + file);
+ }
+ }
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ options.wal_dir = dbname_ + "/logs";
+ DestroyAndReopen(options);
+
+ // fill up the DB
+ std::string one, two;
+ PutFixed64(&one, 1);
+ PutFixed64(&two, 2);
+ ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one)));
+ ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one)));
+ ASSERT_OK(db_->Merge(WriteOptions(), Slice("bar"), Slice(one)));
+
+ // copy the logs to backup
+ std::vector<std::string> logs;
+ env_->GetChildren(options.wal_dir, &logs);
+ for (auto& log : logs) {
+ if (log != ".." && log != ".") {
+ CopyFile(options.wal_dir + "/" + log, backup_logs + "/" + log);
+ }
+ }
+
+ // recover the DB
+ Reopen(options);
+ ASSERT_EQ(two, Get("foo"));
+ ASSERT_EQ(one, Get("bar"));
+ Close();
+
+ // copy the logs from backup back to wal dir
+ for (auto& log : logs) {
+ if (log != ".." && log != ".") {
+ CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
+ }
+ }
+    // This should ignore the log files; recovery should not happen again.
+    // If recovery did happen, the same merge operator would be applied twice,
+    // leading to incorrect results.
+ Reopen(options);
+ ASSERT_EQ(two, Get("foo"));
+ ASSERT_EQ(one, Get("bar"));
+ Close();
+ Destroy(options);
+ Reopen(options);
+ Close();
+
+ // copy the logs from backup back to wal dir
+ env_->CreateDirIfMissing(options.wal_dir);
+ for (auto& log : logs) {
+ if (log != ".." && log != ".") {
+ CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
+ }
+ }
+ // assert that we successfully recovered only from logs, even though we
+ // destroyed the DB
+ Reopen(options);
+ ASSERT_EQ(two, Get("foo"));
+ ASSERT_EQ(one, Get("bar"));
+
+ // Recovery will fail if DB directory doesn't exist.
+ Destroy(options);
+ // copy the logs from backup back to wal dir
+ env_->CreateDirIfMissing(options.wal_dir);
+ for (auto& log : logs) {
+ if (log != ".." && log != ".") {
+ CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
+        // we won't be needing this file anymore
+ env_->DeleteFile(backup_logs + "/" + log);
+ }
+ }
+ Status s = TryReopen(options);
+ ASSERT_TRUE(!s.ok());
+ Destroy(options);
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RecoveryWithEmptyLog) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Put(1, "foo", "v2"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v3"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("v3", Get(1, "foo"));
+ } while (ChangeWalOptions());
+}
+
+#if !(defined NDEBUG) || !defined(OS_WIN)
+TEST_F(DBWALTest, PreallocateBlock) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10 * 1000 * 1000;
+ options.max_total_wal_size = 0;
+
+ size_t expected_preallocation_size = static_cast<size_t>(
+ options.write_buffer_size + options.write_buffer_size / 10);
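+  // With no WAL size cap configured, the preallocation size is expected to be
+  // write_buffer_size plus a 10% margin; the later cases verify that
+  // max_total_wal_size, db_write_buffer_size and the write buffer manager
+  // each cap it instead.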
+
+ DestroyAndReopen(options);
+
+ std::atomic<int> called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBTestWalFile.GetPreallocationStatus", [&](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ size_t preallocation_size = *(static_cast<size_t*>(arg));
+ ASSERT_EQ(expected_preallocation_size, preallocation_size);
+ called.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ Put("", "");
+ Flush();
+ Put("", "");
+ Close();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(2, called.load());
+
+ options.max_total_wal_size = 1000 * 1000;
+ expected_preallocation_size = static_cast<size_t>(options.max_total_wal_size);
+ Reopen(options);
+ called.store(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBTestWalFile.GetPreallocationStatus", [&](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ size_t preallocation_size = *(static_cast<size_t*>(arg));
+ ASSERT_EQ(expected_preallocation_size, preallocation_size);
+ called.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ Put("", "");
+ Flush();
+ Put("", "");
+ Close();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(2, called.load());
+
+ options.db_write_buffer_size = 800 * 1000;
+ expected_preallocation_size =
+ static_cast<size_t>(options.db_write_buffer_size);
+ Reopen(options);
+ called.store(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBTestWalFile.GetPreallocationStatus", [&](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ size_t preallocation_size = *(static_cast<size_t*>(arg));
+ ASSERT_EQ(expected_preallocation_size, preallocation_size);
+ called.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ Put("", "");
+ Flush();
+ Put("", "");
+ Close();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(2, called.load());
+
+ expected_preallocation_size = 700 * 1000;
+ std::shared_ptr<WriteBufferManager> write_buffer_manager =
+ std::make_shared<WriteBufferManager>(static_cast<uint64_t>(700 * 1000));
+ options.write_buffer_manager = write_buffer_manager;
+ Reopen(options);
+ called.store(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBTestWalFile.GetPreallocationStatus", [&](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ size_t preallocation_size = *(static_cast<size_t*>(arg));
+ ASSERT_EQ(expected_preallocation_size, preallocation_size);
+ called.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ Put("", "");
+ Flush();
+ Put("", "");
+ Close();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(2, called.load());
+}
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBWALTest, FullPurgePreservesRecycledLog) {
+ // For github issue #1303
+ for (int i = 0; i < 2; ++i) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.recycle_log_file_num = 2;
+ if (i != 0) {
+ options.wal_dir = alternative_wal_dir_;
+ }
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "v1"));
+ VectorLogPtr log_files;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+ ASSERT_GT(log_files.size(), 0);
+ ASSERT_OK(Flush());
+
+ // Now the original WAL is in log_files[0] and should be marked for
+ // recycling.
+ // Verify full purge cannot remove this file.
+ JobContext job_context(0);
+ dbfull()->TEST_LockMutex();
+ dbfull()->FindObsoleteFiles(&job_context, true /* force */);
+ dbfull()->TEST_UnlockMutex();
+ dbfull()->PurgeObsoleteFiles(job_context);
+
+ if (i == 0) {
+ ASSERT_OK(
+ env_->FileExists(LogFileName(dbname_, log_files[0]->LogNumber())));
+ } else {
+ ASSERT_OK(env_->FileExists(
+ LogFileName(alternative_wal_dir_, log_files[0]->LogNumber())));
+ }
+ }
+}
+
+TEST_F(DBWALTest, FullPurgePreservesLogPendingReuse) {
+ // Ensures full purge cannot delete a WAL while it's in the process of being
+ // recycled. In particular, we force the full purge after a file has been
+ // chosen for reuse, but before it has been renamed.
+ for (int i = 0; i < 2; ++i) {
+ Options options = CurrentOptions();
+ options.recycle_log_file_num = 1;
+ if (i != 0) {
+ options.wal_dir = alternative_wal_dir_;
+ }
+ DestroyAndReopen(options);
+
+ // The first flush creates a second log so writes can continue before the
+ // flush finishes.
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+ // The second flush can recycle the first log. Sync points enforce the
+ // full purge happens after choosing the log to recycle and before it is
+ // renamed.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::CreateWAL:BeforeReuseWritableFile1",
+ "DBWALTest::FullPurgePreservesLogPendingReuse:PreFullPurge"},
+ {"DBWALTest::FullPurgePreservesLogPendingReuse:PostFullPurge",
+ "DBImpl::CreateWAL:BeforeReuseWritableFile2"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::port::Thread thread([&]() {
+ TEST_SYNC_POINT(
+ "DBWALTest::FullPurgePreservesLogPendingReuse:PreFullPurge");
+ ASSERT_OK(db_->EnableFileDeletions(true));
+ TEST_SYNC_POINT(
+ "DBWALTest::FullPurgePreservesLogPendingReuse:PostFullPurge");
+ });
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ thread.join();
+ }
+}
+
+TEST_F(DBWALTest, GetSortedWalFiles) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ VectorLogPtr log_files;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+ ASSERT_EQ(0, log_files.size());
+
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+ ASSERT_EQ(1, log_files.size());
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, GetCurrentWalFile) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+ std::unique_ptr<LogFile>* bad_log_file = nullptr;
+ ASSERT_NOK(dbfull()->GetCurrentWalFile(bad_log_file));
+
+ std::unique_ptr<LogFile> log_file;
+ ASSERT_OK(dbfull()->GetCurrentWalFile(&log_file));
+
+ // nothing has been written to the log yet
+ ASSERT_EQ(log_file->StartSequence(), 0);
+ ASSERT_EQ(log_file->SizeFileBytes(), 0);
+ ASSERT_EQ(log_file->Type(), kAliveLogFile);
+ ASSERT_GT(log_file->LogNumber(), 0);
+
+    // add some data and verify that the file size actually moves forward
+ ASSERT_OK(Put(0, "foo", "v1"));
+ ASSERT_OK(Put(0, "foo2", "v2"));
+ ASSERT_OK(Put(0, "foo3", "v3"));
+
+ ASSERT_OK(dbfull()->GetCurrentWalFile(&log_file));
+
+ ASSERT_EQ(log_file->StartSequence(), 0);
+ ASSERT_GT(log_file->SizeFileBytes(), 0);
+ ASSERT_EQ(log_file->Type(), kAliveLogFile);
+ ASSERT_GT(log_file->LogNumber(), 0);
+
+ // force log files to cycle and add some more data, then check if
+ // log number moves forward
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ for (int i = 0; i < 10; i++) {
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ }
+
+ ASSERT_OK(Put(0, "foo4", "v4"));
+ ASSERT_OK(Put(0, "foo5", "v5"));
+ ASSERT_OK(Put(0, "foo6", "v6"));
+
+ ASSERT_OK(dbfull()->GetCurrentWalFile(&log_file));
+
+ ASSERT_EQ(log_file->StartSequence(), 0);
+ ASSERT_GT(log_file->SizeFileBytes(), 0);
+ ASSERT_EQ(log_file->Type(), kAliveLogFile);
+ ASSERT_GT(log_file->LogNumber(), 0);
+
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RecoveryWithLogDataForSomeCFs) {
+  // Test for a regression where WAL cleanup missed files that don't contain
+  // data for every column family.
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Put(1, "foo", "v2"));
+ uint64_t earliest_log_nums[2];
+ for (int i = 0; i < 2; ++i) {
+ if (i > 0) {
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ }
+ VectorLogPtr log_files;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+ if (log_files.size() > 0) {
+ earliest_log_nums[i] = log_files[0]->LogNumber();
+ } else {
+ earliest_log_nums[i] = port::kMaxUint64;
+ }
+ }
+ // Check at least the first WAL was cleaned up during the recovery.
+ ASSERT_LT(earliest_log_nums[0], earliest_log_nums[1]);
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RecoverWithLargeLog) {
+ do {
+ {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(Put(1, "big1", std::string(200000, '1')));
+ ASSERT_OK(Put(1, "big2", std::string(200000, '2')));
+ ASSERT_OK(Put(1, "small3", std::string(10, '3')));
+ ASSERT_OK(Put(1, "small4", std::string(10, '4')));
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ }
+
+    // Make sure that if we re-open with a small write buffer size,
+    // we flush table files in the middle of a large log file.
+ Options options;
+ options.write_buffer_size = 100000;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 3);
+ ASSERT_EQ(std::string(200000, '1'), Get(1, "big1"));
+ ASSERT_EQ(std::string(200000, '2'), Get(1, "big2"));
+ ASSERT_EQ(std::string(10, '3'), Get(1, "small3"));
+ ASSERT_EQ(std::string(10, '4'), Get(1, "small4"));
+ ASSERT_GT(NumTableFilesAtLevel(0, 1), 1);
+ } while (ChangeWalOptions());
+}
+
+// In https://reviews.facebook.net/D20661 we changed the
+// recovery behavior: previously, for each log file, each column family's
+// memtable was flushed, even if it was empty. Now we try to create the
+// smallest number of table files by merging updates from multiple logs.
+TEST_F(DBWALTest, RecoverCheckFileAmountWithSmallWriteBuffer) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 5000000;
+ CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
+
+ // Since we will reopen DB with smaller write_buffer_size,
+ // each key will go to new SST file
+ ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+ ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+ ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+ ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+
+ ASSERT_OK(Put(3, Key(10), DummyString(1)));
+  // Force 'dobrynia' to be flushed and a new WAL file to be created
+ ASSERT_OK(Put(2, Key(10), DummyString(7500000)));
+ ASSERT_OK(Put(2, Key(1), DummyString(1)));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[2]);
+ {
+ auto tables = ListTableFiles(env_, dbname_);
+ ASSERT_EQ(tables.size(), static_cast<size_t>(1));
+    // Make sure 'dobrynia' was flushed: check the SST file count
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(1));
+ }
+ // New WAL file
+ ASSERT_OK(Put(1, Key(1), DummyString(1)));
+ ASSERT_OK(Put(1, Key(1), DummyString(1)));
+ ASSERT_OK(Put(3, Key(10), DummyString(1)));
+ ASSERT_OK(Put(3, Key(10), DummyString(1)));
+ ASSERT_OK(Put(3, Key(10), DummyString(1)));
+
+ options.write_buffer_size = 4096;
+ options.arena_block_size = 4096;
+ ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"},
+ options);
+ {
+ // No inserts => default is empty
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(0));
+    // First 4 keys go to separate SSTs + 1 more SST for 2 smaller keys
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+ static_cast<uint64_t>(5));
+ // 1 SST for big key + 1 SST for small one
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(2));
+ // 1 SST for all keys
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(1));
+ }
+}
+
+// In https://reviews.facebook.net/D20661 we changed the
+// recovery behavior: previously, for each log file, each column family's
+// memtable was flushed, even if it was empty. Now we try to create the
+// smallest number of table files by merging updates from multiple logs.
+TEST_F(DBWALTest, RecoverCheckFileAmount) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000;
+ options.arena_block_size = 4 * 1024;
+ options.avoid_flush_during_recovery = false;
+ CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
+
+ ASSERT_OK(Put(0, Key(1), DummyString(1)));
+ ASSERT_OK(Put(1, Key(1), DummyString(1)));
+ ASSERT_OK(Put(2, Key(1), DummyString(1)));
+
+  // Force the 'nikitich' memtable to be flushed
+ ASSERT_OK(Put(3, Key(10), DummyString(1002400)));
+ ASSERT_OK(Put(3, Key(1), DummyString(1)));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[3]);
+  // 4 memtables are not flushed, 1 SST file
+ {
+ auto tables = ListTableFiles(env_, dbname_);
+ ASSERT_EQ(tables.size(), static_cast<size_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(1));
+ }
+  // Memtable for 'nikitich' has been flushed and a new WAL file has been
+  // opened; 4 memtables are still not flushed
+
+ // Write to new WAL file
+ ASSERT_OK(Put(0, Key(1), DummyString(1)));
+ ASSERT_OK(Put(1, Key(1), DummyString(1)));
+ ASSERT_OK(Put(2, Key(1), DummyString(1)));
+
+ // Fill up 'nikitich' one more time
+ ASSERT_OK(Put(3, Key(10), DummyString(1002400)));
+ // make it flush
+ ASSERT_OK(Put(3, Key(1), DummyString(1)));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[3]);
+  // There are still 4 memtables not flushed, and 2 SST files
+ ASSERT_OK(Put(0, Key(1), DummyString(1)));
+ ASSERT_OK(Put(1, Key(1), DummyString(1)));
+ ASSERT_OK(Put(2, Key(1), DummyString(1)));
+
+ {
+ auto tables = ListTableFiles(env_, dbname_);
+ ASSERT_EQ(tables.size(), static_cast<size_t>(2));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(2));
+ }
+
+ ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"},
+ options);
+ {
+ std::vector<uint64_t> table_files = ListTableFiles(env_, dbname_);
+    // Check that records for 'default', 'dobrynia' and 'pikachu' from the
+    // first, second and third WALs went to the same SST.
+    // So there are 6 SSTs: three for 'nikitich', one for 'default', one for
+    // 'dobrynia' and one for 'pikachu'
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(3));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+ static_cast<uint64_t>(1));
+ }
+}
+
+TEST_F(DBWALTest, SyncMultipleLogs) {
+ const uint64_t kNumBatches = 2;
+ const int kBatchSize = 1000;
+
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.write_buffer_size = 4096;
+ Reopen(options);
+
+ WriteBatch batch;
+ WriteOptions wo;
+ wo.sync = true;
+
+ for (uint64_t b = 0; b < kNumBatches; b++) {
+ batch.Clear();
+ for (int i = 0; i < kBatchSize; i++) {
+ batch.Put(Key(i), DummyString(128));
+ }
+
+ dbfull()->Write(wo, &batch);
+ }
+
+ ASSERT_OK(dbfull()->SyncWAL());
+}
+
+// Github issue 1339. Prior to the fix, we read the sequence id from the first
+// log into a local variable, then kept increasing the variable as we replayed
+// logs, ignoring the actual sequence ids of the records. This is incorrect if
+// some writes come with WAL disabled.
+TEST_F(DBWALTest, PartOfWritesWithWALDisabled) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = CurrentOptions();
+ options.env = fault_env.get();
+ options.disable_auto_compactions = true;
+ WriteOptions wal_on, wal_off;
+ wal_on.sync = true;
+ wal_on.disableWAL = false;
+ wal_off.disableWAL = true;
+ CreateAndReopenWithCF({"dummy"}, options);
+ ASSERT_OK(Put(1, "dummy", "d1", wal_on)); // seq id 1
+ ASSERT_OK(Put(1, "dummy", "d2", wal_off));
+ ASSERT_OK(Put(1, "dummy", "d3", wal_off));
+ ASSERT_OK(Put(0, "key", "v4", wal_on)); // seq id 4
+ ASSERT_OK(Flush(0));
+ ASSERT_OK(Put(0, "key", "v5", wal_on)); // seq id 5
+ ASSERT_EQ("v5", Get(0, "key"));
+ dbfull()->FlushWAL(false);
+ // Simulate a crash.
+ fault_env->SetFilesystemActive(false);
+ Close();
+ fault_env->ResetState();
+ ReopenWithColumnFamilies({"default", "dummy"}, options);
+ // Prior to the fix, we may incorrectly recover "v5" with sequence id = 3.
+ ASSERT_EQ("v5", Get(0, "key"));
+  // Destroy DB before destructing fault_env.
+ Destroy(options);
+}
+
+//
+// Test WAL recovery for the various modes available
+//
+class RecoveryTestHelper {
+ public:
+ // Number of WAL files to generate
+ static const int kWALFilesCount = 10;
+ // Starting number for the WAL file name like 00010.log
+ static const int kWALFileOffset = 10;
+ // Keys to be written per WAL file
+ static const int kKeysPerWALFile = 133;
+ // Size of the value
+ static const int kValueSize = 96;
+
+ // Create WAL files with values filled in
+ static void FillData(DBWALTest* test, const Options& options,
+ const size_t wal_count, size_t* count) {
+ // Calling internal functions requires sanitized options.
+ Options sanitized_options = SanitizeOptions(test->dbname_, options);
+ const ImmutableDBOptions db_options(sanitized_options);
+
+ *count = 0;
+
+ std::shared_ptr<Cache> table_cache = NewLRUCache(50, 0);
+ EnvOptions env_options;
+ WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size);
+
+ std::unique_ptr<VersionSet> versions;
+ std::unique_ptr<WalManager> wal_manager;
+ WriteController write_controller;
+
+ versions.reset(new VersionSet(test->dbname_, &db_options, env_options,
+ table_cache.get(), &write_buffer_manager,
+ &write_controller,
+ /*block_cache_tracer=*/nullptr));
+
+ wal_manager.reset(new WalManager(db_options, env_options));
+
+ std::unique_ptr<log::Writer> current_log_writer;
+
+ for (size_t j = kWALFileOffset; j < wal_count + kWALFileOffset; j++) {
+ uint64_t current_log_number = j;
+ std::string fname = LogFileName(test->dbname_, current_log_number);
+ std::unique_ptr<WritableFile> file;
+ ASSERT_OK(db_options.env->NewWritableFile(fname, &file, env_options));
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ NewLegacyWritableFileWrapper(std::move(file)), fname, env_options));
+ current_log_writer.reset(
+ new log::Writer(std::move(file_writer), current_log_number,
+ db_options.recycle_log_file_num > 0));
+
+ WriteBatch batch;
+ for (int i = 0; i < kKeysPerWALFile; i++) {
+ std::string key = "key" + ToString((*count)++);
+ std::string value = test->DummyString(kValueSize);
+ assert(current_log_writer.get() != nullptr);
+ uint64_t seq = versions->LastSequence() + 1;
+ batch.Clear();
+ batch.Put(key, value);
+ WriteBatchInternal::SetSequence(&batch, seq);
+ current_log_writer->AddRecord(WriteBatchInternal::Contents(&batch));
+ versions->SetLastAllocatedSequence(seq);
+ versions->SetLastPublishedSequence(seq);
+ versions->SetLastSequence(seq);
+ }
+ }
+ }
+
+ // Recreate and fill the store with some data
+ static size_t FillData(DBWALTest* test, Options* options) {
+ options->create_if_missing = true;
+ test->DestroyAndReopen(*options);
+ test->Close();
+
+ size_t count = 0;
+ FillData(test, *options, kWALFilesCount, &count);
+ return count;
+ }
+
+ // Read back all the keys we wrote and return the number of keys found
+ static size_t GetData(DBWALTest* test) {
+ size_t count = 0;
+ for (size_t i = 0; i < kWALFilesCount * kKeysPerWALFile; i++) {
+ if (test->Get("key" + ToString(i)) != "NOT_FOUND") {
+ ++count;
+ }
+ }
+ return count;
+ }
+
+  // Manually corrupt the specified WAL
+ static void CorruptWAL(DBWALTest* test, const Options& options,
+ const double off, const double len,
+ const int wal_file_id, const bool trunc = false) {
+ Env* env = options.env;
+ std::string fname = LogFileName(test->dbname_, wal_file_id);
+ uint64_t size;
+ ASSERT_OK(env->GetFileSize(fname, &size));
+ ASSERT_GT(size, 0);
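+    // off and len are fractions of the file size: with trunc the file is
+    // shortened to off * size, otherwise len * size bytes starting at
+    // off * size + 8 are overwritten.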
+#ifdef OS_WIN
+    // Windows disk cache behaves differently. When we truncate,
+    // the original content is still in the cache because the original
+    // handle is still open. Generally, on Windows, one prohibits
+    // shared access to files; it is not needed for the WAL, but we allow
+    // it so that various tests can induce corruption.
+ test->Close();
+#endif
+ if (trunc) {
+ ASSERT_EQ(0, truncate(fname.c_str(), static_cast<int64_t>(size * off)));
+ } else {
+ InduceCorruption(fname, static_cast<size_t>(size * off + 8),
+ static_cast<size_t>(size * len));
+ }
+ }
+
+  // Overwrite data with 'b' from offset for length len
+ static void InduceCorruption(const std::string& filename, size_t offset,
+ size_t len) {
+ ASSERT_GT(len, 0U);
+
+ int fd = open(filename.c_str(), O_RDWR);
+
+    // On Windows, long is 32-bit
+ ASSERT_LE(offset, std::numeric_limits<long>::max());
+
+ ASSERT_GT(fd, 0);
+ ASSERT_EQ(offset, lseek(fd, static_cast<long>(offset), SEEK_SET));
+
+ void* buf = alloca(len);
+ memset(buf, 'b', len);
+ ASSERT_EQ(len, write(fd, buf, static_cast<unsigned int>(len)));
+
+ close(fd);
+ }
+};
+
+// Test scope:
+// - We expect to open the data store when there are incomplete trailing writes
+//   at the end of any of the logs
+// - We do not expect to open the data store in the presence of corruption
+TEST_F(DBWALTest, kTolerateCorruptedTailRecords) {
+ const int jstart = RecoveryTestHelper::kWALFileOffset;
+ const int jend = jstart + RecoveryTestHelper::kWALFilesCount;
+
+ for (auto trunc : {true, false}) { /* Corruption style */
+ for (int i = 0; i < 3; i++) { /* Corruption offset position */
+ for (int j = jstart; j < jend; j++) { /* WAL file */
+ // Fill data for testing
+ Options options = CurrentOptions();
+ const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+        // test checksum failures or parsing errors
+ RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3,
+ /*len%=*/.1, /*wal=*/j, trunc);
+
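+        // A truncated tail looks like an incomplete trailing write, which
+        // this mode tolerates; overwritten bytes mid-file look like real
+        // corruption and should fail the open.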
+ if (trunc) {
+ options.wal_recovery_mode =
+ WALRecoveryMode::kTolerateCorruptedTailRecords;
+ options.create_if_missing = false;
+ ASSERT_OK(TryReopen(options));
+ const size_t recovered_row_count = RecoveryTestHelper::GetData(this);
+ ASSERT_TRUE(i == 0 || recovered_row_count > 0);
+ ASSERT_LT(recovered_row_count, row_count);
+ } else {
+ options.wal_recovery_mode =
+ WALRecoveryMode::kTolerateCorruptedTailRecords;
+ ASSERT_NOK(TryReopen(options));
+ }
+ }
+ }
+ }
+}
+
+// Test scope:
+// We don't expect the data store to be opened if there is any corruption
+// (leading, middle or trailing -- incomplete writes or corruption)
+TEST_F(DBWALTest, kAbsoluteConsistency) {
+ const int jstart = RecoveryTestHelper::kWALFileOffset;
+ const int jend = jstart + RecoveryTestHelper::kWALFilesCount;
+
+ // Verify clean slate behavior
+ Options options = CurrentOptions();
+ const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+ options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency;
+ options.create_if_missing = false;
+ ASSERT_OK(TryReopen(options));
+ ASSERT_EQ(RecoveryTestHelper::GetData(this), row_count);
+
+ for (auto trunc : {true, false}) { /* Corruption style */
+ for (int i = 0; i < 4; i++) { /* Corruption offset position */
+ if (trunc && i == 0) {
+ continue;
+ }
+
+ for (int j = jstart; j < jend; j++) { /* wal files */
+        // fill with new data
+ RecoveryTestHelper::FillData(this, &options);
+ // corrupt the wal
+ RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3,
+ /*len%=*/.1, j, trunc);
+ // verify
+ options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency;
+ options.create_if_missing = false;
+ ASSERT_NOK(TryReopen(options));
+ }
+ }
+ }
+}
+
+// Test scope:
+// We don't expect the data store to be opened if there is any inconsistency
+// between WAL and SST files
+TEST_F(DBWALTest, kPointInTimeRecoveryCFConsistency) {
+ Options options = CurrentOptions();
+ options.avoid_flush_during_recovery = true;
+
+ // Create DB with multiple column families.
+ CreateAndReopenWithCF({"one", "two"}, options);
+ ASSERT_OK(Put(1, "key1", "val1"));
+ ASSERT_OK(Put(2, "key2", "val2"));
+
+ // Record the offset at this point
+ Env* env = options.env;
+ uint64_t wal_file_id = dbfull()->TEST_LogfileNumber();
+ std::string fname = LogFileName(dbname_, wal_file_id);
+ uint64_t offset_to_corrupt;
+ ASSERT_OK(env->GetFileSize(fname, &offset_to_corrupt));
+ ASSERT_GT(offset_to_corrupt, 0);
+
+ ASSERT_OK(Put(1, "key3", "val3"));
+ // Corrupt WAL at location of key3
+ RecoveryTestHelper::InduceCorruption(
+ fname, static_cast<size_t>(offset_to_corrupt), static_cast<size_t>(4));
+ ASSERT_OK(Put(2, "key4", "val4"));
+ ASSERT_OK(Put(1, "key5", "val5"));
+ Flush(2);
+
+ // PIT recovery & verify
+ options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+ ASSERT_NOK(TryReopenWithColumnFamilies({"default", "one", "two"}, options));
+}
+
+// Test scope:
+// - We expect to open data store under all circumstances
+// - We expect only data up to the point where the first error was encountered
+TEST_F(DBWALTest, kPointInTimeRecovery) {
+ const int jstart = RecoveryTestHelper::kWALFileOffset;
+ const int jend = jstart + RecoveryTestHelper::kWALFilesCount;
+ const int maxkeys =
+ RecoveryTestHelper::kWALFilesCount * RecoveryTestHelper::kKeysPerWALFile;
+
+ for (auto trunc : {true, false}) { /* Corruption style */
+ for (int i = 0; i < 4; i++) { /* Offset of corruption */
+ for (int j = jstart; j < jend; j++) { /* WAL file */
+ // Fill data for testing
+ Options options = CurrentOptions();
+ const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+
+ // Corrupt the wal
+ RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3,
+ /*len%=*/.1, j, trunc);
+
+ // Verify
+ options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+ options.create_if_missing = false;
+ ASSERT_OK(TryReopen(options));
+
+ // Probe data for invariants
+ size_t recovered_row_count = RecoveryTestHelper::GetData(this);
+ ASSERT_LT(recovered_row_count, row_count);
+
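+        // Recovery should yield a contiguous prefix of the keys: once one key
+        // is missing, no later key may be present.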
+ bool expect_data = true;
+ for (size_t k = 0; k < maxkeys; ++k) {
+          bool found = Get("key" + ToString(k)) != "NOT_FOUND";
+ if (expect_data && !found) {
+ expect_data = false;
+ }
+ ASSERT_EQ(found, expect_data);
+ }
+
+ const size_t min = RecoveryTestHelper::kKeysPerWALFile *
+ (j - RecoveryTestHelper::kWALFileOffset);
+ ASSERT_GE(recovered_row_count, min);
+ if (!trunc && i != 0) {
+ const size_t max = RecoveryTestHelper::kKeysPerWALFile *
+ (j - RecoveryTestHelper::kWALFileOffset + 1);
+ ASSERT_LE(recovered_row_count, max);
+ }
+ }
+ }
+ }
+}
+
+// Test scope:
+// - We expect to open the data store under all scenarios
+// - We expect to have recovered records past the corruption zone
+TEST_F(DBWALTest, kSkipAnyCorruptedRecords) {
+ const int jstart = RecoveryTestHelper::kWALFileOffset;
+ const int jend = jstart + RecoveryTestHelper::kWALFilesCount;
+
+ for (auto trunc : {true, false}) { /* Corruption style */
+ for (int i = 0; i < 4; i++) { /* Corruption offset */
+ for (int j = jstart; j < jend; j++) { /* wal files */
+ // Fill data for testing
+ Options options = CurrentOptions();
+ const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+
+ // Corrupt the WAL
+ RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3,
+ /*len%=*/.1, j, trunc);
+
+ // Verify behavior
+ options.wal_recovery_mode = WALRecoveryMode::kSkipAnyCorruptedRecords;
+ options.create_if_missing = false;
+ ASSERT_OK(TryReopen(options));
+
+ // Probe data for invariants
+ size_t recovered_row_count = RecoveryTestHelper::GetData(this);
+ ASSERT_LT(recovered_row_count, row_count);
+
+ if (!trunc) {
+ ASSERT_TRUE(i != 0 || recovered_row_count > 0);
+ }
+ }
+ }
+ }
+}
+
+TEST_F(DBWALTest, AvoidFlushDuringRecovery) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.avoid_flush_during_recovery = false;
+
+ // Test with flush after recovery.
+ Reopen(options);
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Put("bar", "v2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", "v3"));
+ ASSERT_OK(Put("bar", "v4"));
+ ASSERT_EQ(1, TotalTableFiles());
+  // Reopen DB. Check that WAL logs were flushed.
+ Reopen(options);
+ ASSERT_EQ("v3", Get("foo"));
+ ASSERT_EQ("v4", Get("bar"));
+ ASSERT_EQ(2, TotalTableFiles());
+
+ // Test without flush after recovery.
+ options.avoid_flush_during_recovery = true;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "v5"));
+ ASSERT_OK(Put("bar", "v6"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", "v7"));
+ ASSERT_OK(Put("bar", "v8"));
+ ASSERT_EQ(1, TotalTableFiles());
+ // Reopen DB. WAL logs should not be flushed this time.
+ Reopen(options);
+ ASSERT_EQ("v7", Get("foo"));
+ ASSERT_EQ("v8", Get("bar"));
+ ASSERT_EQ(1, TotalTableFiles());
+
+ // Force flush with allow_2pc.
+ options.avoid_flush_during_recovery = true;
+ options.allow_2pc = true;
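+  // With 2PC enabled, recovery is expected to flush even though
+  // avoid_flush_during_recovery is set, hence the extra table file below.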
+ ASSERT_OK(Put("foo", "v9"));
+ ASSERT_OK(Put("bar", "v10"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", "v11"));
+ ASSERT_OK(Put("bar", "v12"));
+ Reopen(options);
+ ASSERT_EQ("v11", Get("foo"));
+ ASSERT_EQ("v12", Get("bar"));
+ ASSERT_EQ(3, TotalTableFiles());
+}
+
+TEST_F(DBWALTest, WalCleanupAfterAvoidFlushDuringRecovery) {
+ // Verifies WAL files that were present during recovery, but not flushed due
+ // to avoid_flush_during_recovery, will be considered for deletion at a later
+ // stage. We check at least one such file is deleted during Flush().
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.avoid_flush_during_recovery = true;
+ Reopen(options);
+
+ ASSERT_OK(Put("foo", "v1"));
+ Reopen(options);
+ for (int i = 0; i < 2; ++i) {
+ if (i > 0) {
+ // Flush() triggers deletion of obsolete tracked files
+ Flush();
+ }
+ VectorLogPtr log_files;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+ if (i == 0) {
+ ASSERT_GT(log_files.size(), 0);
+ } else {
+ ASSERT_EQ(0, log_files.size());
+ }
+ }
+}
+
+TEST_F(DBWALTest, RecoverWithoutFlush) {
+ Options options = CurrentOptions();
+ options.avoid_flush_during_recovery = true;
+ options.create_if_missing = false;
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 64 * 1024 * 1024;
+
+ size_t count = RecoveryTestHelper::FillData(this, &options);
+ auto validateData = [this, count]() {
+ for (size_t i = 0; i < count; i++) {
+ ASSERT_NE(Get("key" + ToString(i)), "NOT_FOUND");
+ }
+ };
+ Reopen(options);
+ validateData();
+ // Insert some data without flush
+ ASSERT_OK(Put("foo", "foo_v1"));
+ ASSERT_OK(Put("bar", "bar_v1"));
+ Reopen(options);
+ validateData();
+ ASSERT_EQ(Get("foo"), "foo_v1");
+ ASSERT_EQ(Get("bar"), "bar_v1");
+ // Insert again and reopen
+ ASSERT_OK(Put("foo", "foo_v2"));
+ ASSERT_OK(Put("bar", "bar_v2"));
+ Reopen(options);
+ validateData();
+ ASSERT_EQ(Get("foo"), "foo_v2");
+ ASSERT_EQ(Get("bar"), "bar_v2");
+ // manual flush and insert again
+ Flush();
+ ASSERT_EQ(Get("foo"), "foo_v2");
+ ASSERT_EQ(Get("bar"), "bar_v2");
+ ASSERT_OK(Put("foo", "foo_v3"));
+ ASSERT_OK(Put("bar", "bar_v3"));
+ Reopen(options);
+ validateData();
+ ASSERT_EQ(Get("foo"), "foo_v3");
+ ASSERT_EQ(Get("bar"), "bar_v3");
+}
+
+TEST_F(DBWALTest, RecoverWithoutFlushMultipleCF) {
+ const std::string kSmallValue = "v";
+ const std::string kLargeValue = DummyString(1024);
+ Options options = CurrentOptions();
+ options.avoid_flush_during_recovery = true;
+ options.create_if_missing = false;
+ options.disable_auto_compactions = true;
+
+ auto countWalFiles = [this]() {
+ VectorLogPtr log_files;
+ dbfull()->GetSortedWalFiles(log_files);
+ return log_files.size();
+ };
+
+ // Create DB with multiple column families and multiple log files.
+ CreateAndReopenWithCF({"one", "two"}, options);
+ ASSERT_OK(Put(0, "key1", kSmallValue));
+ ASSERT_OK(Put(1, "key2", kLargeValue));
+ Flush(1);
+ ASSERT_EQ(1, countWalFiles());
+ ASSERT_OK(Put(0, "key3", kSmallValue));
+ ASSERT_OK(Put(2, "key4", kLargeValue));
+ Flush(2);
+ ASSERT_EQ(2, countWalFiles());
+
+ // Reopen, insert and flush.
+ options.db_write_buffer_size = 64 * 1024 * 1024;
+ ReopenWithColumnFamilies({"default", "one", "two"}, options);
+ ASSERT_EQ(Get(0, "key1"), kSmallValue);
+ ASSERT_EQ(Get(1, "key2"), kLargeValue);
+ ASSERT_EQ(Get(0, "key3"), kSmallValue);
+ ASSERT_EQ(Get(2, "key4"), kLargeValue);
+ // Insert more data.
+ ASSERT_OK(Put(0, "key5", kLargeValue));
+ ASSERT_OK(Put(1, "key6", kLargeValue));
+ ASSERT_EQ(3, countWalFiles());
+ Flush(1);
+ ASSERT_OK(Put(2, "key7", kLargeValue));
+ dbfull()->FlushWAL(false);
+ ASSERT_EQ(4, countWalFiles());
+
+ // Reopen twice and validate.
+ for (int i = 0; i < 2; i++) {
+ ReopenWithColumnFamilies({"default", "one", "two"}, options);
+ ASSERT_EQ(Get(0, "key1"), kSmallValue);
+ ASSERT_EQ(Get(1, "key2"), kLargeValue);
+ ASSERT_EQ(Get(0, "key3"), kSmallValue);
+ ASSERT_EQ(Get(2, "key4"), kLargeValue);
+ ASSERT_EQ(Get(0, "key5"), kLargeValue);
+ ASSERT_EQ(Get(1, "key6"), kLargeValue);
+ ASSERT_EQ(Get(2, "key7"), kLargeValue);
+ ASSERT_EQ(4, countWalFiles());
+ }
+}
+
+// In this test we are trying to do the following:
+// 1. Create a DB with corrupted WAL log;
+// 2. Open with avoid_flush_during_recovery = true;
+// 3. Append more data without flushing, which creates new WAL log.
+// 4. Open again. See if it can correctly handle previous corruption.
+TEST_F(DBWALTest, RecoverFromCorruptedWALWithoutFlush) {
+ const int jstart = RecoveryTestHelper::kWALFileOffset;
+ const int jend = jstart + RecoveryTestHelper::kWALFilesCount;
+ const int kAppendKeys = 100;
+ Options options = CurrentOptions();
+ options.avoid_flush_during_recovery = true;
+ options.create_if_missing = false;
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 64 * 1024 * 1024;
+
+ auto getAll = [this]() {
+ std::vector<std::pair<std::string, std::string>> data;
+ ReadOptions ropt;
+ Iterator* iter = dbfull()->NewIterator(ropt);
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ data.push_back(
+ std::make_pair(iter->key().ToString(), iter->value().ToString()));
+ }
+ delete iter;
+ return data;
+ };
+ for (auto& mode : wal_recovery_mode_string_map) {
+ options.wal_recovery_mode = mode.second;
+ for (auto trunc : {true, false}) {
+ for (int i = 0; i < 4; i++) {
+ for (int j = jstart; j < jend; j++) {
+ // Create corrupted WAL
+ RecoveryTestHelper::FillData(this, &options);
+ RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3,
+ /*len%=*/.1, /*wal=*/j, trunc);
+ // Skip the test if DB won't open.
+ if (!TryReopen(options).ok()) {
+ ASSERT_TRUE(options.wal_recovery_mode ==
+ WALRecoveryMode::kAbsoluteConsistency ||
+ (!trunc &&
+ options.wal_recovery_mode ==
+ WALRecoveryMode::kTolerateCorruptedTailRecords));
+ continue;
+ }
+ ASSERT_OK(TryReopen(options));
+ // Append some more data.
+ for (int k = 0; k < kAppendKeys; k++) {
+ std::string key = "extra_key" + ToString(k);
+ std::string value = DummyString(RecoveryTestHelper::kValueSize);
+ ASSERT_OK(Put(key, value));
+ }
+ // Save data for comparison.
+ auto data = getAll();
+ // Reopen. Verify data.
+ ASSERT_OK(TryReopen(options));
+ auto actual_data = getAll();
+ ASSERT_EQ(data, actual_data);
+ }
+ }
+ }
+ }
+}
+
+// Tests that total log size is recovered if we set
+// avoid_flush_during_recovery=true.
+// Flush should trigger if max_total_wal_size is reached.
+TEST_F(DBWALTest, RestoreTotalLogSizeAfterRecoverWithoutFlush) {
+ class TestFlushListener : public EventListener {
+ public:
+ std::atomic<int> count{0};
+
+ TestFlushListener() = default;
+
+ void OnFlushBegin(DB* /*db*/, const FlushJobInfo& flush_job_info) override {
+ count++;
+ assert(FlushReason::kWriteBufferManager == flush_job_info.flush_reason);
+ }
+ };
+ std::shared_ptr<TestFlushListener> test_listener =
+ std::make_shared<TestFlushListener>();
+
+ constexpr size_t kKB = 1024;
+ constexpr size_t kMB = 1024 * 1024;
+ Options options = CurrentOptions();
+ options.avoid_flush_during_recovery = true;
+ options.max_total_wal_size = 1 * kMB;
+ options.listeners.push_back(test_listener);
+ // Have to open DB in multi-CF mode to trigger flush when
+ // max_total_wal_size is reached.
+ CreateAndReopenWithCF({"one"}, options);
+ // Write some keys and we will end up with one log file which is slightly
+ // smaller than 1MB.
+ std::string value_100k(100 * kKB, 'v');
+ std::string value_300k(300 * kKB, 'v');
+ ASSERT_OK(Put(0, "foo", "v1"));
+ for (int i = 0; i < 9; i++) {
+ ASSERT_OK(Put(1, "key" + ToString(i), value_100k));
+ }
+ // Get log files before reopen.
+ VectorLogPtr log_files_before;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before));
+ ASSERT_EQ(1, log_files_before.size());
+ uint64_t log_size_before = log_files_before[0]->SizeFileBytes();
+ ASSERT_GT(log_size_before, 900 * kKB);
+ ASSERT_LT(log_size_before, 1 * kMB);
+ ReopenWithColumnFamilies({"default", "one"}, options);
+ // Write one more value to make log larger than 1MB.
+ ASSERT_OK(Put(1, "bar", value_300k));
+ // Get log files again. A new log file will be opened.
+ VectorLogPtr log_files_after_reopen;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_after_reopen));
+ ASSERT_EQ(2, log_files_after_reopen.size());
+ ASSERT_EQ(log_files_before[0]->LogNumber(),
+ log_files_after_reopen[0]->LogNumber());
+ ASSERT_GT(log_files_after_reopen[0]->SizeFileBytes() +
+ log_files_after_reopen[1]->SizeFileBytes(),
+ 1 * kMB);
+ // Write one more key to trigger flush.
+ ASSERT_OK(Put(0, "foo", "v2"));
+ dbfull()->TEST_WaitForFlushMemTable();
+ // Flushed two column families.
+ ASSERT_EQ(2, test_listener->count.load());
+}
+
+#if defined(ROCKSDB_PLATFORM_POSIX)
+#if defined(ROCKSDB_FALLOCATE_PRESENT)
+// Tests that we truncate the preallocated space of the last log file
+// from the previous DB session.
+TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithoutFlush) {
+ constexpr size_t kKB = 1024;
+ Options options = CurrentOptions();
+ options.avoid_flush_during_recovery = true;
+ DestroyAndReopen(options);
+ size_t preallocated_size =
+ dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size);
+ ASSERT_OK(Put("foo", "v1"));
+ VectorLogPtr log_files_before;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before));
+ ASSERT_EQ(1, log_files_before.size());
+ auto& file_before = log_files_before[0];
+ ASSERT_LT(file_before->SizeFileBytes(), 1 * kKB);
+ // The log file has preallocated space.
+ ASSERT_GE(GetAllocatedFileSize(dbname_ + file_before->PathName()),
+ preallocated_size);
+ Reopen(options);
+ VectorLogPtr log_files_after;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_after));
+ ASSERT_EQ(1, log_files_after.size());
+ ASSERT_LT(log_files_after[0]->SizeFileBytes(), 1 * kKB);
+ // The preallocated space should be truncated.
+ ASSERT_LT(GetAllocatedFileSize(dbname_ + file_before->PathName()),
+ preallocated_size);
+}
+#endif // ROCKSDB_FALLOCATE_PRESENT
+#endif // ROCKSDB_PLATFORM_POSIX
+
+#endif // ROCKSDB_LITE
+
+TEST_F(DBWALTest, WalTermTest) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+
+ WriteOptions wo;
+ wo.sync = true;
+ wo.disableWAL = false;
+
+ WriteBatch batch;
+ batch.Put("foo", "bar");
+ batch.MarkWalTerminationPoint();
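+ // Entries added after MarkWalTerminationPoint() are not persisted to the
+ // WAL, so "foo2" is expected to be absent after the reopen below while
+ // "foo" survives.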
+ batch.Put("foo2", "bar2");
+
+ ASSERT_OK(dbfull()->Write(wo, &batch));
+
+ // make sure we can re-open it.
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+ ASSERT_EQ("bar", Get(1, "foo"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "foo2"));
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_write_test.cc b/src/rocksdb/db/db_write_test.cc
new file mode 100644
index 000000000..cc1aaac08
--- /dev/null
+++ b/src/rocksdb/db/db_write_test.cc
@@ -0,0 +1,329 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <atomic>
+#include <memory>
+#include <thread>
+#include <vector>
+#include <fstream>
+#include "db/db_test_util.h"
+#include "db/write_batch_internal.h"
+#include "db/write_thread.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "test_util/fault_injection_test_env.h"
+#include "test_util/sync_point.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Test variations of WriteImpl.
+class DBWriteTest : public DBTestBase, public testing::WithParamInterface<int> {
+ public:
+ DBWriteTest() : DBTestBase("/db_write_test") {}
+
+ Options GetOptions() { return DBTestBase::GetOptions(GetParam()); }
+
+ void Open() { DBTestBase::Reopen(GetOptions()); }
+};
+
+// It is invalid to do a sync write while the WAL is disabled.
+TEST_P(DBWriteTest, SyncAndDisableWAL) {
+ WriteOptions write_options;
+ write_options.sync = true;
+ write_options.disableWAL = true;
+ ASSERT_TRUE(dbfull()->Put(write_options, "foo", "bar").IsInvalidArgument());
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("foo", "bar"));
+ ASSERT_TRUE(dbfull()->Write(write_options, &batch).IsInvalidArgument());
+}
+
+TEST_P(DBWriteTest, WriteThreadHangOnWriteStall) {
+ Options options = GetOptions();
+ options.level0_stop_writes_trigger = options.level0_slowdown_writes_trigger = 4;
+ std::vector<port::Thread> threads;
+ std::atomic<int> thread_num(0);
+ port::Mutex mutex;
+ port::CondVar cv(&mutex);
+
+ Reopen(options);
+
+ std::function<void()> write_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = false;
+ dbfull()->Put(wo, key, "bar");
+ };
+ std::function<void()> write_no_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = true;
+ dbfull()->Put(wo, key, "bar");
+ };
+ std::function<void(void *)> unblock_main_thread_func = [&](void *) {
+ mutex.Lock();
+ cv.SignalAll();
+ mutex.Unlock();
+ };
+
+ // Create 3 L0 files and schedule 4th without waiting
+ Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar");
+ Flush();
+ Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar");
+ Flush();
+ Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar");
+ Flush();
+ Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Start", unblock_main_thread_func);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBWriteTest::WriteThreadHangOnWriteStall:1",
+ "DBImpl::BackgroundCallFlush:start"},
+ {"DBWriteTest::WriteThreadHangOnWriteStall:2",
+ "DBImpl::WriteImpl:BeforeLeaderEnters"},
+ // Make compaction start wait for the write stall to be detected and
+ // put into effect by a write group leader
+ {"DBWriteTest::WriteThreadHangOnWriteStall:3",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Schedule creation of 4th L0 file without waiting. This will seal the
+ // memtable and then wait for a sync point before writing the file. We need
+ // to do it this way because SwitchMemtable() needs to enter the
+ // write_thread
+ FlushOptions fopt;
+ fopt.wait = false;
+ dbfull()->Flush(fopt);
+
+ // Create a mix of slowdown/no_slowdown write threads
+ mutex.Lock();
+ // First leader
+ threads.emplace_back(write_slowdown_func);
+ cv.Wait();
+ // Second leader. Will stall writes
+ threads.emplace_back(write_slowdown_func);
+ cv.Wait();
+ threads.emplace_back(write_no_slowdown_func);
+ cv.Wait();
+ threads.emplace_back(write_slowdown_func);
+ cv.Wait();
+ threads.emplace_back(write_no_slowdown_func);
+ cv.Wait();
+ threads.emplace_back(write_slowdown_func);
+ cv.Wait();
+ mutex.Unlock();
+
+ TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:1");
+ dbfull()->TEST_WaitForFlushMemTable(nullptr);
+ // This would have triggered a write stall. Unblock the write group leader
+ TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:2");
+ // The leader is going to create missing newer links. When the leader
+ // finishes, the next leader is going to delay writes and fail writers that
+ // set no_slowdown.
+
+ TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:3");
+ for (auto& t : threads) {
+ t.join();
+ }
+}
+
+TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) {
+ constexpr int kNumThreads = 5;
+ std::unique_ptr<FaultInjectionTestEnv> mock_env(
+ new FaultInjectionTestEnv(Env::Default()));
+ Options options = GetOptions();
+ options.env = mock_env.get();
+ Reopen(options);
+ std::atomic<int> ready_count{0};
+ std::atomic<int> leader_count{0};
+ std::vector<port::Thread> threads;
+ mock_env->SetFilesystemActive(false);
+
+ // Wait until all threads are linked into the write queue, to make sure
+ // all threads join the same batch group.
+ SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Wait", [&](void* arg) {
+ ready_count++;
+ auto* w = reinterpret_cast<WriteThread::Writer*>(arg);
+ if (w->state == WriteThread::STATE_GROUP_LEADER) {
+ leader_count++;
+ while (ready_count < kNumThreads) {
+ // busy waiting
+ }
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ for (int i = 0; i < kNumThreads; i++) {
+ threads.push_back(port::Thread(
+ [&](int index) {
+ // All threads should fail.
+ auto res = Put("key" + ToString(index), "value");
+ if (options.manual_wal_flush) {
+ ASSERT_TRUE(res.ok());
+ // we should see fs error when we do the flush
+
+ // TSAN reports a false alarm for lock-order-inversion but Open and
+ // FlushWAL are not run concurrently. Disabling this until TSAN is
+ // fixed.
+ // res = dbfull()->FlushWAL(false);
+ // ASSERT_FALSE(res.ok());
+ } else {
+ ASSERT_FALSE(res.ok());
+ }
+ },
+ i));
+ }
+ for (int i = 0; i < kNumThreads; i++) {
+ threads[i].join();
+ }
+ ASSERT_EQ(1, leader_count);
+ // Close before mock_env destruct.
+ Close();
+}
+
+TEST_P(DBWriteTest, ManualWalFlushInEffect) {
+ Options options = GetOptions();
+ Reopen(options);
+ // try the 1st WAL created during open
+ ASSERT_TRUE(Put("key" + ToString(0), "value").ok());
+ ASSERT_TRUE(options.manual_wal_flush != dbfull()->TEST_WALBufferIsEmpty());
+ ASSERT_TRUE(dbfull()->FlushWAL(false).ok());
+ ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty());
+ // try the 2nd WAL created during SwitchWAL
+ dbfull()->TEST_SwitchWAL();
+ ASSERT_TRUE(Put("key" + ToString(0), "value").ok());
+ ASSERT_TRUE(options.manual_wal_flush != dbfull()->TEST_WALBufferIsEmpty());
+ ASSERT_TRUE(dbfull()->FlushWAL(false).ok());
+ ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty());
+}
+
+TEST_P(DBWriteTest, IOErrorOnWALWriteTriggersReadOnlyMode) {
+ std::unique_ptr<FaultInjectionTestEnv> mock_env(
+ new FaultInjectionTestEnv(Env::Default()));
+ Options options = GetOptions();
+ options.env = mock_env.get();
+ Reopen(options);
+ for (int i = 0; i < 2; i++) {
+ // Forcibly fail WAL write for the first Put only. Subsequent Puts should
+ // fail due to read-only mode
+ mock_env->SetFilesystemActive(i != 0);
+ auto res = Put("key" + ToString(i), "value");
+ // TSAN reports a false alarm for lock-order-inversion but Open and
+ // FlushWAL are not run concurrently. Disabling this until TSAN is
+ // fixed.
+ /*
+ if (options.manual_wal_flush && i == 0) {
+ // even with manual_wal_flush the 2nd Put should return error because of
+ // the read-only mode
+ ASSERT_TRUE(res.ok());
+ // we should see fs error when we do the flush
+ res = dbfull()->FlushWAL(false);
+ }
+ */
+ if (!options.manual_wal_flush) {
+ ASSERT_FALSE(res.ok());
+ }
+ }
+ // Close before mock_env destruct.
+ Close();
+}
+
+TEST_P(DBWriteTest, IOErrorOnSwitchMemtable) {
+ Random rnd(301);
+ std::unique_ptr<FaultInjectionTestEnv> mock_env(
+ new FaultInjectionTestEnv(Env::Default()));
+ Options options = GetOptions();
+ options.env = mock_env.get();
+ options.writable_file_max_buffer_size = 4 * 1024 * 1024;
+ options.write_buffer_size = 3 * 512 * 1024;
+ options.wal_bytes_per_sync = 256 * 1024;
+ options.manual_wal_flush = true;
+ Reopen(options);
+ mock_env->SetFilesystemActive(false, Status::IOError("Not active"));
+ Status s;
+ for (int i = 0; i < 4 * 512; ++i) {
+ s = Put(Key(i), RandomString(&rnd, 1024));
+ if (!s.ok()) {
+ break;
+ }
+ }
+ ASSERT_EQ(s.severity(), Status::Severity::kFatalError);
+
+ mock_env->SetFilesystemActive(true);
+ // Close before mock_env destruct.
+ Close();
+}
+
+// Test that db->LockWAL() flushes the WAL after locking.
+TEST_P(DBWriteTest, LockWalInEffect) {
+ Options options = GetOptions();
+ Reopen(options);
+ // try the 1st WAL created during open
+ ASSERT_OK(Put("key" + ToString(0), "value"));
+ ASSERT_TRUE(options.manual_wal_flush != dbfull()->TEST_WALBufferIsEmpty());
+ ASSERT_OK(dbfull()->LockWAL());
+ ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty(false));
+ ASSERT_OK(dbfull()->UnlockWAL());
+ // try the 2nd WAL created during SwitchWAL
+ dbfull()->TEST_SwitchWAL();
+ ASSERT_OK(Put("key" + ToString(0), "value"));
+ ASSERT_TRUE(options.manual_wal_flush != dbfull()->TEST_WALBufferIsEmpty());
+ ASSERT_OK(dbfull()->LockWAL());
+ ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty(false));
+ ASSERT_OK(dbfull()->UnlockWAL());
+}
+
+TEST_P(DBWriteTest, ConcurrentlyDisabledWAL) {
+ Options options = GetOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.statistics->set_stats_level(StatsLevel::kAll);
+ Reopen(options);
+ std::string wal_key_prefix = "WAL_KEY_";
+ std::string no_wal_key_prefix = "K_";
+ // 100 KB value each for NO-WAL operation
+ std::string no_wal_value(1024 * 100, 'X');
+ // 1B value each for WAL operation
+ std::string wal_value = "0";
+ std::thread threads[10];
+ for (int t = 0; t < 10; t++) {
+ threads[t] = std::thread([t, wal_key_prefix, wal_value, no_wal_key_prefix, no_wal_value, this] {
+ for(int i = 0; i < 10; i++) {
+ ROCKSDB_NAMESPACE::WriteOptions write_option_disable;
+ write_option_disable.disableWAL = true;
+ ROCKSDB_NAMESPACE::WriteOptions write_option_default;
+ std::string no_wal_key = no_wal_key_prefix + std::to_string(t) +
+ "_" + std::to_string(i);
+ this->Put(no_wal_key, no_wal_value, write_option_disable);
+ std::string wal_key =
+ wal_key_prefix + std::to_string(i) + "_" + std::to_string(i);
+ this->Put(wal_key, wal_value, write_option_default);
+ dbfull()->SyncWAL();
+ }
+ return 0;
+ });
+ }
+ for (auto& t: threads) {
+ t.join();
+ }
+ uint64_t bytes_num = options.statistics->getTickerCount(
+ ROCKSDB_NAMESPACE::Tickers::WAL_FILE_BYTES);
+ // The written WAL size should be less than 100KB (even including header and
+ // footer overhead).
+ ASSERT_LE(bytes_num, 1024 * 100);
+}
+
+INSTANTIATE_TEST_CASE_P(DBWriteTestInstance, DBWriteTest,
+ testing::Values(DBTestBase::kDefault,
+ DBTestBase::kConcurrentWALWrites,
+ DBTestBase::kPipelinedWrite));
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/dbformat.cc b/src/rocksdb/db/dbformat.cc
new file mode 100644
index 000000000..e10af2b85
--- /dev/null
+++ b/src/rocksdb/db/dbformat.cc
@@ -0,0 +1,197 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/dbformat.h"
+
+#include <stdio.h>
+#include <cinttypes>
+#include "monitoring/perf_context_imp.h"
+#include "port/port.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// kValueTypeForSeek defines the ValueType that should be passed when
+// constructing a ParsedInternalKey object for seeking to a particular
+// sequence number (since we sort sequence numbers in decreasing order
+// and the value type is embedded as the low 8 bits in the sequence
+// number in internal keys, we need to use the highest-numbered
+// ValueType, not the lowest).
+const ValueType kValueTypeForSeek = kTypeBlobIndex;
+const ValueType kValueTypeForSeekForPrev = kTypeDeletion;
+
+uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
+ assert(seq <= kMaxSequenceNumber);
+ assert(IsExtendedValueType(t));
+ return (seq << 8) | t;
+}
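+// For illustration: the packed value keeps the sequence number in the upper
+// 56 bits and the ValueType in the low 8 bits, e.g.
+//   PackSequenceAndType(5, kTypeValue /* 0x1 */) == (5 << 8) | 0x1 == 0x501.
+// UnPackSequenceAndType() below reverses this with packed >> 8 and
+// packed & 0xff.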
+
+EntryType GetEntryType(ValueType value_type) {
+ switch (value_type) {
+ case kTypeValue:
+ return kEntryPut;
+ case kTypeDeletion:
+ return kEntryDelete;
+ case kTypeSingleDeletion:
+ return kEntrySingleDelete;
+ case kTypeMerge:
+ return kEntryMerge;
+ case kTypeRangeDeletion:
+ return kEntryRangeDeletion;
+ case kTypeBlobIndex:
+ return kEntryBlobIndex;
+ default:
+ return kEntryOther;
+ }
+}
+
+bool ParseFullKey(const Slice& internal_key, FullKey* fkey) {
+ ParsedInternalKey ikey;
+ if (!ParseInternalKey(internal_key, &ikey)) {
+ return false;
+ }
+ fkey->user_key = ikey.user_key;
+ fkey->sequence = ikey.sequence;
+ fkey->type = GetEntryType(ikey.type);
+ return true;
+}
+
+void UnPackSequenceAndType(uint64_t packed, uint64_t* seq, ValueType* t) {
+ *seq = packed >> 8;
+ *t = static_cast<ValueType>(packed & 0xff);
+
+ assert(*seq <= kMaxSequenceNumber);
+ assert(IsExtendedValueType(*t));
+}
+
+void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
+ result->append(key.user_key.data(), key.user_key.size());
+ PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
+}
+
+void AppendInternalKeyFooter(std::string* result, SequenceNumber s,
+ ValueType t) {
+ PutFixed64(result, PackSequenceAndType(s, t));
+}
+
+std::string ParsedInternalKey::DebugString(bool hex) const {
+ char buf[50];
+ snprintf(buf, sizeof(buf), "' seq:%" PRIu64 ", type:%d", sequence,
+ static_cast<int>(type));
+ std::string result = "'";
+ result += user_key.ToString(hex);
+ result += buf;
+ return result;
+}
+
+std::string InternalKey::DebugString(bool hex) const {
+ std::string result;
+ ParsedInternalKey parsed;
+ if (ParseInternalKey(rep_, &parsed)) {
+ result = parsed.DebugString(hex);
+ } else {
+ result = "(bad)";
+ result.append(EscapeString(rep_));
+ }
+ return result;
+}
+
+const char* InternalKeyComparator::Name() const { return name_.c_str(); }
+
+int InternalKeyComparator::Compare(const ParsedInternalKey& a,
+ const ParsedInternalKey& b) const {
+ // Order by:
+ // increasing user key (according to user-supplied comparator)
+ // decreasing sequence number
+ // decreasing type (though sequence# should be enough to disambiguate)
+ int r = user_comparator_.Compare(a.user_key, b.user_key);
+ if (r == 0) {
+ if (a.sequence > b.sequence) {
+ r = -1;
+ } else if (a.sequence < b.sequence) {
+ r = +1;
+ } else if (a.type > b.type) {
+ r = -1;
+ } else if (a.type < b.type) {
+ r = +1;
+ }
+ }
+ return r;
+}
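+// For illustration: with a bytewise user comparator, entries that share a
+// user key sort by decreasing sequence number, so ("foo", 100, kTypeValue)
+// compares less than ("foo", 99, kTypeValue); a seek therefore lands on the
+// newest entry first.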
+
+void InternalKeyComparator::FindShortestSeparator(std::string* start,
+ const Slice& limit) const {
+ // Attempt to shorten the user portion of the key
+ Slice user_start = ExtractUserKey(*start);
+ Slice user_limit = ExtractUserKey(limit);
+ std::string tmp(user_start.data(), user_start.size());
+ user_comparator_.FindShortestSeparator(&tmp, user_limit);
+ if (tmp.size() <= user_start.size() &&
+ user_comparator_.Compare(user_start, tmp) < 0) {
+ // User key has become shorter physically, but larger logically.
+ // Tack on the earliest possible number to the shortened user key.
+ PutFixed64(&tmp,
+ PackSequenceAndType(kMaxSequenceNumber, kValueTypeForSeek));
+ assert(this->Compare(*start, tmp) < 0);
+ assert(this->Compare(tmp, limit) < 0);
+ start->swap(tmp);
+ }
+}
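+// For illustration (exercised in dbformat_test.cc): separating
+// ("foo", 100, kTypeValue) from ("hello", 200, kTypeValue) yields
+// ("g", kMaxSequenceNumber, kValueTypeForSeek), while inputs that share the
+// same user key are left unchanged.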
+
+void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
+ Slice user_key = ExtractUserKey(*key);
+ std::string tmp(user_key.data(), user_key.size());
+ user_comparator_.FindShortSuccessor(&tmp);
+ if (tmp.size() <= user_key.size() &&
+ user_comparator_.Compare(user_key, tmp) < 0) {
+ // User key has become shorter physically, but larger logically.
+ // Tack on the earliest possible number to the shortened user key.
+ PutFixed64(&tmp,
+ PackSequenceAndType(kMaxSequenceNumber, kValueTypeForSeek));
+ assert(this->Compare(*key, tmp) < 0);
+ key->swap(tmp);
+ }
+}
+
+LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s,
+ const Slice* ts) {
+ size_t usize = _user_key.size();
+ size_t ts_sz = (nullptr == ts) ? 0 : ts->size();
+ size_t needed = usize + ts_sz + 13; // A conservative estimate
+ char* dst;
+ if (needed <= sizeof(space_)) {
+ dst = space_;
+ } else {
+ dst = new char[needed];
+ }
+ start_ = dst;
+ // NOTE: We don't support user keys of more than 2GB :)
+ dst = EncodeVarint32(dst, static_cast<uint32_t>(usize + ts_sz + 8));
+ kstart_ = dst;
+ memcpy(dst, _user_key.data(), usize);
+ dst += usize;
+ if (nullptr != ts) {
+ memcpy(dst, ts->data(), ts_sz);
+ dst += ts_sz;
+ }
+ EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek));
+ dst += 8;
+ end_ = dst;
+}
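+// For illustration, the buffer built above is laid out as
+//   [varint32 of (usize + ts_sz + 8)][user key][timestamp]
+//   [fixed64 of PackSequenceAndType(s, kValueTypeForSeek)]
+// with start_ at the varint, kstart_ at the user key, and end_ one past the
+// trailing fixed64, so the memtable key, internal key, and user key can be
+// recovered as sub-slices of this single allocation.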
+
+void IterKey::EnlargeBuffer(size_t key_size) {
+ // This is only reached when key_size exceeds the current buffer size;
+ // smaller keys keep using the current (or statically allocated) buffer.
+ assert(key_size > buf_size_);
+ // Need to enlarge the buffer.
+ ResetBuffer();
+ buf_ = new char[key_size];
+ buf_size_ = key_size;
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/dbformat.h b/src/rocksdb/db/dbformat.h
new file mode 100644
index 000000000..de98be8df
--- /dev/null
+++ b/src/rocksdb/db/dbformat.h
@@ -0,0 +1,671 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdio.h>
+#include <memory>
+#include <string>
+#include <utility>
+#include "db/lookup_key.h"
+#include "db/merge_context.h"
+#include "logging/logging.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/types.h"
+#include "util/coding.h"
+#include "util/user_comparator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// The file declares data structures and functions that deal with internal
+// keys.
+// Each internal key contains a user key, a sequence number (SequenceNumber)
+// and a type (ValueType), and they are usually encoded together.
+// There are some related helper classes here.
+
+class InternalKey;
+
+// Value types encoded as the last component of internal keys.
+// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
+// data structures.
+// The highest bit of the value type needs to be reserved for SST tables
+// for them to do more flexible encoding.
+enum ValueType : unsigned char {
+ kTypeDeletion = 0x0,
+ kTypeValue = 0x1,
+ kTypeMerge = 0x2,
+ kTypeLogData = 0x3, // WAL only.
+ kTypeColumnFamilyDeletion = 0x4, // WAL only.
+ kTypeColumnFamilyValue = 0x5, // WAL only.
+ kTypeColumnFamilyMerge = 0x6, // WAL only.
+ kTypeSingleDeletion = 0x7,
+ kTypeColumnFamilySingleDeletion = 0x8, // WAL only.
+ kTypeBeginPrepareXID = 0x9, // WAL only.
+ kTypeEndPrepareXID = 0xA, // WAL only.
+ kTypeCommitXID = 0xB, // WAL only.
+ kTypeRollbackXID = 0xC, // WAL only.
+ kTypeNoop = 0xD, // WAL only.
+ kTypeColumnFamilyRangeDeletion = 0xE, // WAL only.
+ kTypeRangeDeletion = 0xF, // meta block
+ kTypeColumnFamilyBlobIndex = 0x10, // Blob DB only
+ kTypeBlobIndex = 0x11, // Blob DB only
+ // When the prepared record is also persisted in db, we use a different
+ // record. This is to ensure that the WAL that is generated by a WritePolicy
+ // is not mistakenly read by another, which would result in data
+ // inconsistency.
+ kTypeBeginPersistedPrepareXID = 0x12, // WAL only.
+ // Similar to kTypeBeginPersistedPrepareXID, this is to ensure that WAL
+ // generated by WriteUnprepared write policy is not mistakenly read by
+ // another.
+ kTypeBeginUnprepareXID = 0x13, // WAL only.
+ kMaxValue = 0x7F // Not used for storing records.
+};
+
+// Defined in dbformat.cc
+extern const ValueType kValueTypeForSeek;
+extern const ValueType kValueTypeForSeekForPrev;
+
+// Checks whether a type is an inline value type
+// (i.e. a type used in memtable skiplist and sst file datablock).
+inline bool IsValueType(ValueType t) {
+ return t <= kTypeMerge || t == kTypeSingleDeletion || t == kTypeBlobIndex;
+}
+
+// Checks whether a type is from a user operation.
+// kTypeRangeDeletion is in the meta block, so this API is separated from the
+// one above.
+inline bool IsExtendedValueType(ValueType t) {
+ return IsValueType(t) || t == kTypeRangeDeletion;
+}
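+// For illustration: IsValueType(kTypeValue) and IsValueType(kTypeSingleDeletion)
+// hold, IsValueType(kTypeRangeDeletion) does not but
+// IsExtendedValueType(kTypeRangeDeletion) does, and WAL-only types such as
+// kTypeLogData satisfy neither predicate.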
+
+// We leave eight bits empty at the bottom so a type and sequence#
+// can be packed together into 64-bits.
+static const SequenceNumber kMaxSequenceNumber = ((0x1ull << 56) - 1);
+
+static const SequenceNumber kDisableGlobalSequenceNumber = port::kMaxUint64;
+
+// The data structure that represents an internal key in the way that user_key,
+// sequence number and type are stored in separated forms.
+struct ParsedInternalKey {
+ Slice user_key;
+ SequenceNumber sequence;
+ ValueType type;
+
+ ParsedInternalKey()
+ : sequence(kMaxSequenceNumber) // Make code analyzer happy
+ {} // Intentionally left uninitialized (for speed)
+ ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
+ : user_key(u), sequence(seq), type(t) {}
+ std::string DebugString(bool hex = false) const;
+
+ void clear() {
+ user_key.clear();
+ sequence = 0;
+ type = kTypeDeletion;
+ }
+};
+
+// Return the length of the encoding of "key".
+inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
+ return key.user_key.size() + 8;
+}
+
+// Pack a sequence number and a ValueType into a uint64_t
+extern uint64_t PackSequenceAndType(uint64_t seq, ValueType t);
+
+// Given the result of PackSequenceAndType, store the sequence number in *seq
+// and the ValueType in *t.
+extern void UnPackSequenceAndType(uint64_t packed, uint64_t* seq, ValueType* t);
+
+EntryType GetEntryType(ValueType value_type);
+
+// Append the serialization of "key" to *result.
+extern void AppendInternalKey(std::string* result,
+ const ParsedInternalKey& key);
+// Serialized internal key consists of user key followed by footer.
+// This function appends the footer to *result, assuming that *result already
+// contains the user key at the end.
+extern void AppendInternalKeyFooter(std::string* result, SequenceNumber s,
+ ValueType t);
+
+// Attempt to parse an internal key from "internal_key". On success,
+// stores the parsed data in "*result", and returns true.
+//
+// On error, returns false, leaves "*result" in an undefined state.
+extern bool ParseInternalKey(const Slice& internal_key,
+ ParsedInternalKey* result);
+
+// Returns the user key portion of an internal key.
+inline Slice ExtractUserKey(const Slice& internal_key) {
+ assert(internal_key.size() >= 8);
+ return Slice(internal_key.data(), internal_key.size() - 8);
+}
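+// For illustration: an internal key is the user key followed by an 8-byte
+// footer, so for a 13-byte internal key ExtractUserKey() returns the first
+// 5 bytes and ExtractInternalKeyFooter() decodes the trailing 8 bytes.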
+
+inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key,
+ size_t ts_sz) {
+ assert(internal_key.size() >= 8 + ts_sz);
+ return Slice(internal_key.data(), internal_key.size() - 8 - ts_sz);
+}
+
+inline Slice StripTimestampFromUserKey(const Slice& user_key, size_t ts_sz) {
+ assert(user_key.size() >= ts_sz);
+ return Slice(user_key.data(), user_key.size() - ts_sz);
+}
+
+inline uint64_t ExtractInternalKeyFooter(const Slice& internal_key) {
+ assert(internal_key.size() >= 8);
+ const size_t n = internal_key.size();
+ return DecodeFixed64(internal_key.data() + n - 8);
+}
+
+inline ValueType ExtractValueType(const Slice& internal_key) {
+ uint64_t num = ExtractInternalKeyFooter(internal_key);
+ unsigned char c = num & 0xff;
+ return static_cast<ValueType>(c);
+}
+
+// A comparator for internal keys that uses a specified comparator for
+// the user key portion and breaks ties by decreasing sequence number.
+class InternalKeyComparator
+#ifdef NDEBUG
+ final
+#endif
+ : public Comparator {
+ private:
+ UserComparatorWrapper user_comparator_;
+ std::string name_;
+
+ public:
+ explicit InternalKeyComparator(const Comparator* c)
+ : user_comparator_(c),
+ name_("rocksdb.InternalKeyComparator:" +
+ std::string(user_comparator_.Name())) {}
+ virtual ~InternalKeyComparator() {}
+
+ virtual const char* Name() const override;
+ virtual int Compare(const Slice& a, const Slice& b) const override;
+ // Same as Compare except that it excludes the value type from comparison
+ virtual int CompareKeySeq(const Slice& a, const Slice& b) const;
+ virtual void FindShortestSeparator(std::string* start,
+ const Slice& limit) const override;
+ virtual void FindShortSuccessor(std::string* key) const override;
+
+ const Comparator* user_comparator() const {
+ return user_comparator_.user_comparator();
+ }
+
+ int Compare(const InternalKey& a, const InternalKey& b) const;
+ int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const;
+ virtual const Comparator* GetRootComparator() const override {
+ return user_comparator_.GetRootComparator();
+ }
+};
+
+// This class represents the internal key in encoded form.
+class InternalKey {
+ private:
+ std::string rep_;
+
+ public:
+ InternalKey() {} // Leave rep_ as empty to indicate it is invalid
+ InternalKey(const Slice& _user_key, SequenceNumber s, ValueType t) {
+ AppendInternalKey(&rep_, ParsedInternalKey(_user_key, s, t));
+ }
+
+ // Sets the internal key to be greater than or equal to all internal keys
+ // with this user key
+ void SetMaxPossibleForUserKey(const Slice& _user_key) {
+ AppendInternalKey(
+ &rep_, ParsedInternalKey(_user_key, 0, static_cast<ValueType>(0)));
+ }
+
+ // Sets the internal key to be less than or equal to all internal keys with
+ // this user key
+ void SetMinPossibleForUserKey(const Slice& _user_key) {
+ AppendInternalKey(&rep_, ParsedInternalKey(_user_key, kMaxSequenceNumber,
+ kValueTypeForSeek));
+ }
+
+ bool Valid() const {
+ ParsedInternalKey parsed;
+ return ParseInternalKey(Slice(rep_), &parsed);
+ }
+
+ void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
+ Slice Encode() const {
+ assert(!rep_.empty());
+ return rep_;
+ }
+
+ Slice user_key() const { return ExtractUserKey(rep_); }
+ size_t size() { return rep_.size(); }
+
+ void Set(const Slice& _user_key, SequenceNumber s, ValueType t) {
+ SetFrom(ParsedInternalKey(_user_key, s, t));
+ }
+
+ void SetFrom(const ParsedInternalKey& p) {
+ rep_.clear();
+ AppendInternalKey(&rep_, p);
+ }
+
+ void Clear() { rep_.clear(); }
+
+ // The underlying representation.
+ // Intended only to be used together with ConvertFromUserKey().
+ std::string* rep() { return &rep_; }
+
+ // Assuming that *rep() contains a user key, this method makes internal key
+ // out of it in-place. This saves a memcpy compared to Set()/SetFrom().
+ void ConvertFromUserKey(SequenceNumber s, ValueType t) {
+ AppendInternalKeyFooter(&rep_, s, t);
+ }
+
+ std::string DebugString(bool hex = false) const;
+};
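+// For illustration: InternalKey("foo", 100, kTypeValue).Encode() is the user
+// key "foo" followed by PackSequenceAndType(100, kTypeValue) as a fixed64,
+// and DebugString() renders it as 'foo' seq:100, type:1.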
+
+inline int InternalKeyComparator::Compare(const InternalKey& a,
+ const InternalKey& b) const {
+ return Compare(a.Encode(), b.Encode());
+}
+
+inline bool ParseInternalKey(const Slice& internal_key,
+ ParsedInternalKey* result) {
+ const size_t n = internal_key.size();
+ if (n < 8) return false;
+ uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
+ unsigned char c = num & 0xff;
+ result->sequence = num >> 8;
+ result->type = static_cast<ValueType>(c);
+ assert(result->type <= ValueType::kMaxValue);
+ result->user_key = Slice(internal_key.data(), n - 8);
+ return IsExtendedValueType(result->type);
+}
+
+// Update the sequence number in the internal key.
+// Guarantees not to invalidate ikey.data().
+inline void UpdateInternalKey(std::string* ikey, uint64_t seq, ValueType t) {
+ size_t ikey_sz = ikey->size();
+ assert(ikey_sz >= 8);
+ uint64_t newval = (seq << 8) | t;
+
+ // Note: Since C++11, strings are guaranteed to be stored contiguously and
+ // string::operator[]() is guaranteed not to change ikey.data().
+ EncodeFixed64(&(*ikey)[ikey_sz - 8], newval);
+}
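+// For illustration (see FormatTest.UpdateInternalKey): a key built with
+// (seq = 100, kTypeValue) can be rewritten in place via
+// UpdateInternalKey(&ikey, 0x123456, kTypeDeletion); only the trailing 8
+// bytes change, the size stays the same, and re-parsing yields the new
+// sequence number and type.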
+
+// Get the sequence number from the internal key
+inline uint64_t GetInternalKeySeqno(const Slice& internal_key) {
+ const size_t n = internal_key.size();
+ assert(n >= 8);
+ uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
+ return num >> 8;
+}
+
+// The class to store keys in an efficient way. It allows:
+// 1. Users can either copy the key into it, or have it point to an unowned
+// address.
+// 2. For copied keys, a short inline buffer is kept to reduce memory
+// allocation for smaller keys.
+// 3. It tracks whether it holds a user key or an internal key, and allows
+// conversion between them.
+class IterKey {
+ public:
+ IterKey()
+ : buf_(space_),
+ key_(buf_),
+ key_size_(0),
+ buf_size_(sizeof(space_)),
+ is_user_key_(true) {}
+ // No copying allowed
+ IterKey(const IterKey&) = delete;
+ void operator=(const IterKey&) = delete;
+
+ ~IterKey() { ResetBuffer(); }
+
+ // The bool will be picked up by the next calls to SetKey
+ void SetIsUserKey(bool is_user_key) { is_user_key_ = is_user_key; }
+
+ // Returns the key in whichever format it was provided to IterKey
+ Slice GetKey() const { return Slice(key_, key_size_); }
+
+ Slice GetInternalKey() const {
+ assert(!IsUserKey());
+ return Slice(key_, key_size_);
+ }
+
+ Slice GetUserKey() const {
+ if (IsUserKey()) {
+ return Slice(key_, key_size_);
+ } else {
+ assert(key_size_ >= 8);
+ return Slice(key_, key_size_ - 8);
+ }
+ }
+
+ size_t Size() const { return key_size_; }
+
+ void Clear() { key_size_ = 0; }
+
+ // Append "non_shared_data" to its back, from "shared_len"
+ // This function is used in Block::Iter::ParseNextKey
+ // shared_len: bytes in [0, shard_len-1] would be remained
+ // non_shared_data: data to be append, its length must be >= non_shared_len
+ void TrimAppend(const size_t shared_len, const char* non_shared_data,
+ const size_t non_shared_len) {
+ assert(shared_len <= key_size_);
+ size_t total_size = shared_len + non_shared_len;
+
+ if (IsKeyPinned() /* key is not in buf_ */) {
+ // Copy the key from external memory to buf_ (copy shared_len bytes)
+ EnlargeBufferIfNeeded(total_size);
+ memcpy(buf_, key_, shared_len);
+ } else if (total_size > buf_size_) {
+ // Need to allocate space, delete previous space
+ char* p = new char[total_size];
+ memcpy(p, key_, shared_len);
+
+ if (buf_ != space_) {
+ delete[] buf_;
+ }
+
+ buf_ = p;
+ buf_size_ = total_size;
+ }
+
+ memcpy(buf_ + shared_len, non_shared_data, non_shared_len);
+ key_ = buf_;
+ key_size_ = total_size;
+ }
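+ // For illustration (see FormatTest.IterKeyOperation): starting from an empty
+ // key, TrimAppend(0, "abcdef...", 3) sets the key to "abc", and a subsequent
+ // TrimAppend(1, "abcdef...", 3) keeps the first byte and appends three more,
+ // producing "aabc".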
+
+ Slice SetKey(const Slice& key, bool copy = true) {
+ // is_user_key_ expected to be set already via SetIsUserKey
+ return SetKeyImpl(key, copy);
+ }
+
+ Slice SetUserKey(const Slice& key, bool copy = true) {
+ is_user_key_ = true;
+ return SetKeyImpl(key, copy);
+ }
+
+ Slice SetInternalKey(const Slice& key, bool copy = true) {
+ is_user_key_ = false;
+ return SetKeyImpl(key, copy);
+ }
+
+ // Copies the content of key, updates the reference to the user key in ikey
+ // and returns a Slice referencing the new copy.
+ Slice SetInternalKey(const Slice& key, ParsedInternalKey* ikey) {
+ size_t key_n = key.size();
+ assert(key_n >= 8);
+ SetInternalKey(key);
+ ikey->user_key = Slice(key_, key_n - 8);
+ return Slice(key_, key_n);
+ }
+
+ // Copy the key into IterKey own buf_
+ void OwnKey() {
+ assert(IsKeyPinned() == true);
+
+ Reserve(key_size_);
+ memcpy(buf_, key_, key_size_);
+ key_ = buf_;
+ }
+
+ // Update the sequence number in the internal key. Guarantees not to
+ // invalidate slices to the key (and the user key).
+ void UpdateInternalKey(uint64_t seq, ValueType t) {
+ assert(!IsKeyPinned());
+ assert(key_size_ >= 8);
+ uint64_t newval = (seq << 8) | t;
+ EncodeFixed64(&buf_[key_size_ - 8], newval);
+ }
+
+ bool IsKeyPinned() const { return (key_ != buf_); }
+
+ void SetInternalKey(const Slice& key_prefix, const Slice& user_key,
+ SequenceNumber s,
+ ValueType value_type = kValueTypeForSeek) {
+ size_t psize = key_prefix.size();
+ size_t usize = user_key.size();
+ EnlargeBufferIfNeeded(psize + usize + sizeof(uint64_t));
+ if (psize > 0) {
+ memcpy(buf_, key_prefix.data(), psize);
+ }
+ memcpy(buf_ + psize, user_key.data(), usize);
+ EncodeFixed64(buf_ + usize + psize, PackSequenceAndType(s, value_type));
+
+ key_ = buf_;
+ key_size_ = psize + usize + sizeof(uint64_t);
+ is_user_key_ = false;
+ }
+
+ void SetInternalKey(const Slice& user_key, SequenceNumber s,
+ ValueType value_type = kValueTypeForSeek) {
+ SetInternalKey(Slice(), user_key, s, value_type);
+ }
+
+ void Reserve(size_t size) {
+ EnlargeBufferIfNeeded(size);
+ key_size_ = size;
+ }
+
+ void SetInternalKey(const ParsedInternalKey& parsed_key) {
+ SetInternalKey(Slice(), parsed_key);
+ }
+
+ void SetInternalKey(const Slice& key_prefix,
+ const ParsedInternalKey& parsed_key_suffix) {
+ SetInternalKey(key_prefix, parsed_key_suffix.user_key,
+ parsed_key_suffix.sequence, parsed_key_suffix.type);
+ }
+
+ void EncodeLengthPrefixedKey(const Slice& key) {
+ auto size = key.size();
+ EnlargeBufferIfNeeded(size + static_cast<size_t>(VarintLength(size)));
+ char* ptr = EncodeVarint32(buf_, static_cast<uint32_t>(size));
+ memcpy(ptr, key.data(), size);
+ key_ = buf_;
+ is_user_key_ = true;
+ }
+
+ bool IsUserKey() const { return is_user_key_; }
+
+ private:
+ char* buf_;
+ const char* key_;
+ size_t key_size_;
+ size_t buf_size_;
+ char space_[32]; // Avoid allocation for short keys
+ bool is_user_key_;
+
+ Slice SetKeyImpl(const Slice& key, bool copy) {
+ size_t size = key.size();
+ if (copy) {
+ // Copy key to buf_
+ EnlargeBufferIfNeeded(size);
+ memcpy(buf_, key.data(), size);
+ key_ = buf_;
+ } else {
+ // Update key_ to point to external memory
+ key_ = key.data();
+ }
+ key_size_ = size;
+ return Slice(key_, key_size_);
+ }
+
+ void ResetBuffer() {
+ if (buf_ != space_) {
+ delete[] buf_;
+ buf_ = space_;
+ }
+ buf_size_ = sizeof(space_);
+ key_size_ = 0;
+ }
+
+ // Enlarge the buffer size if needed based on key_size.
+ // By default, the statically allocated buffer is used. Once a key larger
+ // than that buffer is seen, another buffer is dynamically allocated, until
+ // an even larger key buffer is requested. In that case, we reallocate the
+ // buffer and delete the old one.
+ void EnlargeBufferIfNeeded(size_t key_size) {
+ // If size is smaller than the buffer size, continue using the current
+ // buffer, or the statically allocated one, as default
+ if (key_size > buf_size_) {
+ EnlargeBuffer(key_size);
+ }
+ }
+
+ void EnlargeBuffer(size_t key_size);
+};
+
+// Wraps a SliceTransform over user keys so that it can be applied to
+// internal keys.
+class InternalKeySliceTransform : public SliceTransform {
+ public:
+ explicit InternalKeySliceTransform(const SliceTransform* transform)
+ : transform_(transform) {}
+
+ virtual const char* Name() const override { return transform_->Name(); }
+
+ virtual Slice Transform(const Slice& src) const override {
+ auto user_key = ExtractUserKey(src);
+ return transform_->Transform(user_key);
+ }
+
+ virtual bool InDomain(const Slice& src) const override {
+ auto user_key = ExtractUserKey(src);
+ return transform_->InDomain(user_key);
+ }
+
+ virtual bool InRange(const Slice& dst) const override {
+ auto user_key = ExtractUserKey(dst);
+ return transform_->InRange(user_key);
+ }
+
+ const SliceTransform* user_prefix_extractor() const { return transform_; }
+
+ private:
+ // Like comparator, InternalKeySliceTransform will not take care of the
+ // deletion of transform_
+ const SliceTransform* const transform_;
+};
+
+// Read the key of a record from a write batch.
+// If this record represents the default column family, then cf_record
+// must be passed as false; otherwise it must be passed as true.
+extern bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key,
+ bool cf_record);
+
+// Read a record from a write batch piece pointed to by "input".
+// tag, column_family, key, value and blob are return values. Callers own the
+// Slice they point to.
+// Tag is defined as ValueType.
+// input will be advanced to after the record.
+extern Status ReadRecordFromWriteBatch(Slice* input, char* tag,
+ uint32_t* column_family, Slice* key,
+ Slice* value, Slice* blob, Slice* xid);
+
+// When a user calls DeleteRange() to delete a range of keys, we store a
+// serialized RangeTombstone in the MemTable and in SST files.
+// The struct here is an easy-to-understand form of that record:
+// start_key_/end_key_ are the start/end user keys of the range to be deleted.
+struct RangeTombstone {
+ Slice start_key_;
+ Slice end_key_;
+ SequenceNumber seq_;
+ RangeTombstone() = default;
+ RangeTombstone(Slice sk, Slice ek, SequenceNumber sn)
+ : start_key_(sk), end_key_(ek), seq_(sn) {}
+
+ RangeTombstone(ParsedInternalKey parsed_key, Slice value) {
+ start_key_ = parsed_key.user_key;
+ seq_ = parsed_key.sequence;
+ end_key_ = value;
+ }
+
+ // Be careful when using Serialize(): it allocates new memory
+ std::pair<InternalKey, Slice> Serialize() const {
+ auto key = InternalKey(start_key_, seq_, kTypeRangeDeletion);
+ Slice value = end_key_;
+ return std::make_pair(std::move(key), std::move(value));
+ }
+
+ // Be careful when using SerializeKey(): it allocates new memory
+ InternalKey SerializeKey() const {
+ return InternalKey(start_key_, seq_, kTypeRangeDeletion);
+ }
+
+ // The tombstone end-key is exclusive, so we generate an internal-key here
+ // which has a similar property. Using kMaxSequenceNumber guarantees that
+ // the returned internal-key will compare less than any other internal-key
+ // with the same user-key. This in turn guarantees that the serialized
+ // end-key for a tombstone such as [a-b] will compare less than the key "b".
+ //
+ // Be careful when using SerializeEndKey(): it allocates new memory
+ InternalKey SerializeEndKey() const {
+ return InternalKey(end_key_, kMaxSequenceNumber, kTypeRangeDeletion);
+ }
+};
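+// For illustration (see FormatTest.RangeTombstoneSerializeEndKey):
+// RangeTombstone("a", "b", 2).Serialize() yields the pair
+// (InternalKey("a", 2, kTypeRangeDeletion), Slice("b")), and SerializeEndKey()
+// returns InternalKey("b", kMaxSequenceNumber, kTypeRangeDeletion), which
+// compares less than InternalKey("b", 3, kTypeValue) under the internal key
+// comparator, matching the exclusive end-key semantics described above.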
+
+inline int InternalKeyComparator::Compare(const Slice& akey,
+ const Slice& bkey) const {
+ // Order by:
+ // increasing user key (according to user-supplied comparator)
+ // decreasing sequence number
+ // decreasing type (though sequence# should be enough to disambiguate)
+ int r = user_comparator_.Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
+ if (r == 0) {
+ const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
+ const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
+ if (anum > bnum) {
+ r = -1;
+ } else if (anum < bnum) {
+ r = +1;
+ }
+ }
+ return r;
+}
+
+inline int InternalKeyComparator::CompareKeySeq(const Slice& akey,
+ const Slice& bkey) const {
+ // Order by:
+ // increasing user key (according to user-supplied comparator)
+ // decreasing sequence number
+ int r = user_comparator_.Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
+ if (r == 0) {
+ // Shift the number to exclude the last byte which contains the value type
+ const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8) >> 8;
+ const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8) >> 8;
+ if (anum > bnum) {
+ r = -1;
+ } else if (anum < bnum) {
+ r = +1;
+ }
+ }
+ return r;
+}
+
+// Wrap InternalKeyComparator as a comparator class for ParsedInternalKey.
+struct ParsedInternalKeyComparator {
+ explicit ParsedInternalKeyComparator(const InternalKeyComparator* c)
+ : cmp(c) {}
+
+ bool operator()(const ParsedInternalKey& a,
+ const ParsedInternalKey& b) const {
+ return cmp->Compare(a, b) < 0;
+ }
+
+ const InternalKeyComparator* cmp;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/dbformat_test.cc b/src/rocksdb/db/dbformat_test.cc
new file mode 100644
index 000000000..a2c67795a
--- /dev/null
+++ b/src/rocksdb/db/dbformat_test.cc
@@ -0,0 +1,207 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/dbformat.h"
+#include "logging/logging.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static std::string IKey(const std::string& user_key,
+ uint64_t seq,
+ ValueType vt) {
+ std::string encoded;
+ AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt));
+ return encoded;
+}
+
+static std::string Shorten(const std::string& s, const std::string& l) {
+ std::string result = s;
+ InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l);
+ return result;
+}
+
+static std::string ShortSuccessor(const std::string& s) {
+ std::string result = s;
+ InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result);
+ return result;
+}
+
+static void TestKey(const std::string& key,
+ uint64_t seq,
+ ValueType vt) {
+ std::string encoded = IKey(key, seq, vt);
+
+ Slice in(encoded);
+ ParsedInternalKey decoded("", 0, kTypeValue);
+
+ ASSERT_TRUE(ParseInternalKey(in, &decoded));
+ ASSERT_EQ(key, decoded.user_key.ToString());
+ ASSERT_EQ(seq, decoded.sequence);
+ ASSERT_EQ(vt, decoded.type);
+
+ ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded));
+}
+
+class FormatTest : public testing::Test {};
+
+TEST_F(FormatTest, InternalKey_EncodeDecode) {
+ const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" };
+ const uint64_t seq[] = {
+ 1, 2, 3,
+ (1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1,
+ (1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1,
+ (1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1
+ };
+ for (unsigned int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
+ for (unsigned int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) {
+ TestKey(keys[k], seq[s], kTypeValue);
+ TestKey("hello", 1, kTypeDeletion);
+ }
+ }
+}
+
+TEST_F(FormatTest, InternalKeyShortSeparator) {
+ // When user keys are same
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue),
+ IKey("foo", 99, kTypeValue)));
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue),
+ IKey("foo", 101, kTypeValue)));
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue),
+ IKey("foo", 100, kTypeValue)));
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue),
+ IKey("foo", 100, kTypeDeletion)));
+
+ // When user keys are misordered
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue),
+ IKey("bar", 99, kTypeValue)));
+
+ // When user keys are different, but correctly ordered
+ ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
+ Shorten(IKey("foo", 100, kTypeValue),
+ IKey("hello", 200, kTypeValue)));
+
+ ASSERT_EQ(IKey("ABC2", kMaxSequenceNumber, kValueTypeForSeek),
+ Shorten(IKey("ABC1AAAAA", 100, kTypeValue),
+ IKey("ABC2ABB", 200, kTypeValue)));
+
+ ASSERT_EQ(IKey("AAA2", kMaxSequenceNumber, kValueTypeForSeek),
+ Shorten(IKey("AAA1AAA", 100, kTypeValue),
+ IKey("AAA2AA", 200, kTypeValue)));
+
+ ASSERT_EQ(
+ IKey("AAA2", kMaxSequenceNumber, kValueTypeForSeek),
+ Shorten(IKey("AAA1AAA", 100, kTypeValue), IKey("AAA4", 200, kTypeValue)));
+
+ ASSERT_EQ(
+ IKey("AAA1B", kMaxSequenceNumber, kValueTypeForSeek),
+ Shorten(IKey("AAA1AAA", 100, kTypeValue), IKey("AAA2", 200, kTypeValue)));
+
+ ASSERT_EQ(IKey("AAA2", kMaxSequenceNumber, kValueTypeForSeek),
+ Shorten(IKey("AAA1AAA", 100, kTypeValue),
+ IKey("AAA2A", 200, kTypeValue)));
+
+ ASSERT_EQ(
+ IKey("AAA1", 100, kTypeValue),
+ Shorten(IKey("AAA1", 100, kTypeValue), IKey("AAA2", 200, kTypeValue)));
+
+ // When start user key is prefix of limit user key
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue),
+ IKey("foobar", 200, kTypeValue)));
+
+ // When limit user key is prefix of start user key
+ ASSERT_EQ(IKey("foobar", 100, kTypeValue),
+ Shorten(IKey("foobar", 100, kTypeValue),
+ IKey("foo", 200, kTypeValue)));
+}
+
+TEST_F(FormatTest, InternalKeyShortestSuccessor) {
+ ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
+ ShortSuccessor(IKey("foo", 100, kTypeValue)));
+ ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue),
+ ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
+}
+
+TEST_F(FormatTest, IterKeyOperation) {
+ IterKey k;
+ const char p[] = "abcdefghijklmnopqrstuvwxyz";
+ const char q[] = "0123456789";
+
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string(""));
+
+ k.TrimAppend(0, p, 3);
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string("abc"));
+
+ k.TrimAppend(1, p, 3);
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string("aabc"));
+
+ k.TrimAppend(0, p, 26);
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string("abcdefghijklmnopqrstuvwxyz"));
+
+ k.TrimAppend(26, q, 10);
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string("abcdefghijklmnopqrstuvwxyz0123456789"));
+
+ k.TrimAppend(36, q, 1);
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string("abcdefghijklmnopqrstuvwxyz01234567890"));
+
+ k.TrimAppend(26, q, 1);
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string("abcdefghijklmnopqrstuvwxyz0"));
+
+ // Size going up, memory allocation is triggered
+ k.TrimAppend(27, p, 26);
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string("abcdefghijklmnopqrstuvwxyz0"
+ "abcdefghijklmnopqrstuvwxyz"));
+}
+
+TEST_F(FormatTest, UpdateInternalKey) {
+ std::string user_key("abcdefghijklmnopqrstuvwxyz");
+ uint64_t new_seq = 0x123456;
+ ValueType new_val_type = kTypeDeletion;
+
+ std::string ikey;
+ AppendInternalKey(&ikey, ParsedInternalKey(user_key, 100U, kTypeValue));
+ size_t ikey_size = ikey.size();
+ UpdateInternalKey(&ikey, new_seq, new_val_type);
+ ASSERT_EQ(ikey_size, ikey.size());
+
+ Slice in(ikey);
+ ParsedInternalKey decoded;
+ ASSERT_TRUE(ParseInternalKey(in, &decoded));
+ ASSERT_EQ(user_key, decoded.user_key.ToString());
+ ASSERT_EQ(new_seq, decoded.sequence);
+ ASSERT_EQ(new_val_type, decoded.type);
+}
+
+TEST_F(FormatTest, RangeTombstoneSerializeEndKey) {
+ RangeTombstone t("a", "b", 2);
+ InternalKey k("b", 3, kTypeValue);
+ const InternalKeyComparator cmp(BytewiseComparator());
+ ASSERT_LT(cmp.Compare(t.SerializeEndKey(), k), 0);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/deletefile_test.cc b/src/rocksdb/db/deletefile_test.cc
new file mode 100644
index 000000000..f202388c0
--- /dev/null
+++ b/src/rocksdb/db/deletefile_test.cc
@@ -0,0 +1,571 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include <stdlib.h>
+#include <map>
+#include <string>
+#include <vector>
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/transaction_log.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DeleteFileTest : public DBTestBase {
+ public:
+ const int numlevels_;
+ const std::string wal_dir_;
+
+ DeleteFileTest()
+ : DBTestBase("/deletefile_test"),
+ numlevels_(7),
+ wal_dir_(dbname_ + "/wal_files") {}
+
+ void SetOptions(Options* options) {
+ assert(options);
+ options->delete_obsolete_files_period_micros = 0; // always do full purge
+ options->enable_thread_tracking = true;
+ options->write_buffer_size = 1024 * 1024 * 1000;
+ options->target_file_size_base = 1024 * 1024 * 1000;
+ options->max_bytes_for_level_base = 1024 * 1024 * 1000;
+ options->WAL_ttl_seconds = 300; // Used to test log files
+ options->WAL_size_limit_MB = 1024; // Used to test log files
+ options->wal_dir = wal_dir_;
+ }
+
+ void AddKeys(int numkeys, int startkey = 0) {
+ WriteOptions options;
+ options.sync = false;
+ ReadOptions roptions;
+ for (int i = startkey; i < (numkeys + startkey) ; i++) {
+ std::string temp = ToString(i);
+ Slice key(temp);
+ Slice value(temp);
+ ASSERT_OK(db_->Put(options, key, value));
+ }
+ }
+
+ int numKeysInLevels(
+ std::vector<LiveFileMetaData> &metadata,
+ std::vector<int> *keysperlevel = nullptr) {
+
+ if (keysperlevel != nullptr) {
+ keysperlevel->resize(numlevels_);
+ }
+
+ int numKeys = 0;
+ for (size_t i = 0; i < metadata.size(); i++) {
+ int startkey = atoi(metadata[i].smallestkey.c_str());
+ int endkey = atoi(metadata[i].largestkey.c_str());
+ int numkeysinfile = (endkey - startkey + 1);
+ numKeys += numkeysinfile;
+ if (keysperlevel != nullptr) {
+ (*keysperlevel)[(int)metadata[i].level] += numkeysinfile;
+ }
+ fprintf(stderr, "level %d name %s smallest %s largest %s\n",
+ metadata[i].level, metadata[i].name.c_str(),
+ metadata[i].smallestkey.c_str(),
+ metadata[i].largestkey.c_str());
+ }
+ return numKeys;
+ }
+
+ void CreateTwoLevels() {
+ AddKeys(50000, 10000);
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_OK(dbfull()->TEST_CompactRange(i, nullptr, nullptr));
+ }
+
+ AddKeys(50000, 10000);
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+ }
+
+ void CheckFileTypeCounts(const std::string& dir, int required_log,
+ int required_sst, int required_manifest) {
+ std::vector<std::string> filenames;
+ env_->GetChildren(dir, &filenames);
+
+ int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0;
+ for (auto file : filenames) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(file, &number, &type)) {
+ log_cnt += (type == kLogFile);
+ sst_cnt += (type == kTableFile);
+ manifest_cnt += (type == kDescriptorFile);
+ }
+ }
+ ASSERT_EQ(required_log, log_cnt);
+ ASSERT_EQ(required_sst, sst_cnt);
+ ASSERT_EQ(required_manifest, manifest_cnt);
+ }
+
+ static void DoSleep(void* arg) {
+ auto test = reinterpret_cast<DeleteFileTest*>(arg);
+ test->env_->SleepForMicroseconds(2 * 1000 * 1000);
+ }
+
+  // An empty job used to guarantee that all previously scheduled jobs have
+  // been processed
+ static void GuardFinish(void* /*arg*/) {
+ TEST_SYNC_POINT("DeleteFileTest::GuardFinish");
+ }
+};
+
+TEST_F(DeleteFileTest, AddKeysAndQueryLevels) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ CreateTwoLevels();
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+
+ std::string level1file = "";
+ int level1keycount = 0;
+ std::string level2file = "";
+ int level2keycount = 0;
+ int level1index = 0;
+ int level2index = 1;
+
+ ASSERT_EQ((int)metadata.size(), 2);
+ if (metadata[0].level == 2) {
+ level1index = 1;
+ level2index = 0;
+ }
+
+ level1file = metadata[level1index].name;
+ int startkey = atoi(metadata[level1index].smallestkey.c_str());
+ int endkey = atoi(metadata[level1index].largestkey.c_str());
+ level1keycount = (endkey - startkey + 1);
+ level2file = metadata[level2index].name;
+ startkey = atoi(metadata[level2index].smallestkey.c_str());
+ endkey = atoi(metadata[level2index].largestkey.c_str());
+ level2keycount = (endkey - startkey + 1);
+
+  // Controlled setup: levels 1 and 2 should each hold 50K keys.
+  // This is a little fragile, as it depends on the current
+  // compaction heuristics.
+ ASSERT_EQ(level1keycount, 50000);
+ ASSERT_EQ(level2keycount, 50000);
+
+ Status status = db_->DeleteFile("0.sst");
+ ASSERT_TRUE(status.IsInvalidArgument());
+
+ // intermediate level files cannot be deleted.
+ status = db_->DeleteFile(level1file);
+ ASSERT_TRUE(status.IsInvalidArgument());
+
+ // Lowest level file deletion should succeed.
+ ASSERT_OK(db_->DeleteFile(level2file));
+}
+
+TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ CreateTwoLevels();
+ // there should be only one (empty) log file because CreateTwoLevels()
+ // flushes the memtables to disk
+ CheckFileTypeCounts(wal_dir_, 1, 0, 0);
+ // 2 ssts, 1 manifest
+ CheckFileTypeCounts(dbname_, 0, 2, 1);
+ std::string first("0"), last("999999");
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ Slice first_slice(first), last_slice(last);
+ db_->CompactRange(compact_options, &first_slice, &last_slice);
+ // 1 sst after compaction
+ CheckFileTypeCounts(dbname_, 0, 1, 1);
+
+ // this time, we keep an iterator alive
+ Reopen(options);
+ Iterator *itr = nullptr;
+ CreateTwoLevels();
+ itr = db_->NewIterator(ReadOptions());
+ db_->CompactRange(compact_options, &first_slice, &last_slice);
+ // 3 sst after compaction with live iterator
+ CheckFileTypeCounts(dbname_, 0, 3, 1);
+ delete itr;
+ // 1 sst after iterator deletion
+ CheckFileTypeCounts(dbname_, 0, 1, 1);
+}
+
+TEST_F(DeleteFileTest, BackgroundPurgeIteratorTest) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ std::string first("0"), last("999999");
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ Slice first_slice(first), last_slice(last);
+
+ // We keep an iterator alive
+ Iterator* itr = nullptr;
+ CreateTwoLevels();
+ ReadOptions read_options;
+ read_options.background_purge_on_iterator_cleanup = true;
+ itr = db_->NewIterator(read_options);
+ db_->CompactRange(compact_options, &first_slice, &last_slice);
+ // 3 sst after compaction with live iterator
+ CheckFileTypeCounts(dbname_, 0, 3, 1);
+ test::SleepingBackgroundTask sleeping_task_before;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_before, Env::Priority::HIGH);
+ delete itr;
+ test::SleepingBackgroundTask sleeping_task_after;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_after, Env::Priority::HIGH);
+
+  // Make sure no purges are executed in the foreground
+ CheckFileTypeCounts(dbname_, 0, 3, 1);
+ sleeping_task_before.WakeUp();
+ sleeping_task_before.WaitUntilDone();
+
+ // Make sure all background purges are executed
+ sleeping_task_after.WakeUp();
+ sleeping_task_after.WaitUntilDone();
+ // 1 sst after iterator deletion
+ CheckFileTypeCounts(dbname_, 0, 1, 1);
+}
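The test above exercises ReadOptions::background_purge_on_iterator_cleanup, which hands the obsolete-file purge triggered by an iterator's destruction to a HIGH-priority background job instead of running it on the thread that deletes the iterator. A minimal application-side sketch of the same option follows; the DB path and option values are placeholders, not taken from this test.

#include <cassert>
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

void IterateWithBackgroundPurge() {
  ROCKSDB_NAMESPACE::DB* db = nullptr;
  ROCKSDB_NAMESPACE::Options opts;
  opts.create_if_missing = true;
  // Placeholder path; any writable directory works.
  ROCKSDB_NAMESPACE::Status s =
      ROCKSDB_NAMESPACE::DB::Open(opts, "/tmp/bg_purge_example", &db);
  assert(s.ok());

  {
    ROCKSDB_NAMESPACE::ReadOptions ro;
    // Defer the post-iterator purge of obsolete files to a background job so
    // that destroying the iterator does not block on file deletions.
    ro.background_purge_on_iterator_cleanup = true;
    std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it(db->NewIterator(ro));
    for (it->SeekToFirst(); it->Valid(); it->Next()) {
      // ... consume it->key() / it->value() ...
    }
  }  // iterator destroyed here; obsolete files are purged in the background

  delete db;
}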
+
+TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ auto do_test = [&](bool bg_purge) {
+ ColumnFamilyOptions co;
+ co.max_write_buffer_size_to_maintain =
+ static_cast<int64_t>(co.write_buffer_size);
+ WriteOptions wo;
+ FlushOptions fo;
+ ColumnFamilyHandle* cfh = nullptr;
+
+ ASSERT_OK(db_->CreateColumnFamily(co, "dropme", &cfh));
+
+ ASSERT_OK(db_->Put(wo, cfh, "pika", "chu"));
+ ASSERT_OK(db_->Flush(fo, cfh));
+ // Expect 1 sst file.
+ CheckFileTypeCounts(dbname_, 0, 1, 1);
+
+ ASSERT_OK(db_->DropColumnFamily(cfh));
+ // Still 1 file, it won't be deleted while ColumnFamilyHandle is alive.
+ CheckFileTypeCounts(dbname_, 0, 1, 1);
+
+ delete cfh;
+ test::SleepingBackgroundTask sleeping_task_after;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_after, Env::Priority::HIGH);
+ // If background purge is enabled, the file should still be there.
+ CheckFileTypeCounts(dbname_, 0, bg_purge ? 1 : 0, 1);
+ TEST_SYNC_POINT("DeleteFileTest::BackgroundPurgeCFDropTest:1");
+
+ // Execute background purges.
+ sleeping_task_after.WakeUp();
+ sleeping_task_after.WaitUntilDone();
+ // The file should have been deleted.
+ CheckFileTypeCounts(dbname_, 0, 0, 1);
+ };
+
+ {
+ SCOPED_TRACE("avoid_unnecessary_blocking_io = false");
+ do_test(false);
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DeleteFileTest::BackgroundPurgeCFDropTest:1",
+ "DBImpl::BGWorkPurge:start"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ options.avoid_unnecessary_blocking_io = true;
+ options.create_if_missing = false;
+ Reopen(options);
+ {
+ SCOPED_TRACE("avoid_unnecessary_blocking_io = true");
+ do_test(true);
+ }
+}
+
+// This test reproduces a bug where an invalid ReadOptions object was read in
+// the iterator cleanup function
+TEST_F(DeleteFileTest, BackgroundPurgeCopyOptions) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ std::string first("0"), last("999999");
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ Slice first_slice(first), last_slice(last);
+
+ // We keep an iterator alive
+ Iterator* itr = nullptr;
+ CreateTwoLevels();
+ {
+ ReadOptions read_options;
+ read_options.background_purge_on_iterator_cleanup = true;
+ itr = db_->NewIterator(read_options);
+    // ReadOptions goes out of scope here, but the iterator cleanup function
+    // should not be affected
+ }
+
+ db_->CompactRange(compact_options, &first_slice, &last_slice);
+ // 3 sst after compaction with live iterator
+ CheckFileTypeCounts(dbname_, 0, 3, 1);
+ delete itr;
+
+ test::SleepingBackgroundTask sleeping_task_after;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_after, Env::Priority::HIGH);
+
+ // Make sure all background purges are executed
+ sleeping_task_after.WakeUp();
+ sleeping_task_after.WaitUntilDone();
+ // 1 sst after iterator deletion
+ CheckFileTypeCounts(dbname_, 0, 1, 1);
+}
+
+TEST_F(DeleteFileTest, BackgroundPurgeTestMultipleJobs) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ std::string first("0"), last("999999");
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ Slice first_slice(first), last_slice(last);
+
+ // We keep an iterator alive
+ CreateTwoLevels();
+ ReadOptions read_options;
+ read_options.background_purge_on_iterator_cleanup = true;
+ Iterator* itr1 = db_->NewIterator(read_options);
+ CreateTwoLevels();
+ Iterator* itr2 = db_->NewIterator(read_options);
+ db_->CompactRange(compact_options, &first_slice, &last_slice);
+ // 5 sst files after 2 compactions with 2 live iterators
+ CheckFileTypeCounts(dbname_, 0, 5, 1);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ // ~DBImpl should wait until all BGWorkPurge are finished
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::~DBImpl:WaitJob", "DBImpl::BGWorkPurge"},
+ {"DeleteFileTest::GuardFinish",
+ "DeleteFileTest::BackgroundPurgeTestMultipleJobs:DBClose"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ delete itr1;
+ env_->Schedule(&DeleteFileTest::DoSleep, this, Env::Priority::HIGH);
+ delete itr2;
+ env_->Schedule(&DeleteFileTest::GuardFinish, nullptr, Env::Priority::HIGH);
+ Close();
+
+ TEST_SYNC_POINT("DeleteFileTest::BackgroundPurgeTestMultipleJobs:DBClose");
+ // 1 sst after iterator deletion
+ CheckFileTypeCounts(dbname_, 0, 1, 1);
+}
+
+TEST_F(DeleteFileTest, DeleteFileWithIterator) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ CreateTwoLevels();
+ ReadOptions read_options;
+ Iterator* it = db_->NewIterator(read_options);
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+
+ std::string level2file;
+
+ ASSERT_EQ(metadata.size(), static_cast<size_t>(2));
+ if (metadata[0].level == 1) {
+ level2file = metadata[1].name;
+ } else {
+ level2file = metadata[0].name;
+ }
+
+ Status status = db_->DeleteFile(level2file);
+ fprintf(stdout, "Deletion status %s: %s\n",
+ level2file.c_str(), status.ToString().c_str());
+ ASSERT_TRUE(status.ok());
+ it->SeekToFirst();
+ int numKeysIterated = 0;
+  while (it->Valid()) {
+ numKeysIterated++;
+ it->Next();
+ }
+ ASSERT_EQ(numKeysIterated, 50000);
+ delete it;
+}
+
+TEST_F(DeleteFileTest, DeleteLogFiles) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ AddKeys(10, 0);
+ VectorLogPtr logfiles;
+ db_->GetSortedWalFiles(logfiles);
+ ASSERT_GT(logfiles.size(), 0UL);
+  // Take the last log file, which is expected to be alive, and try to delete
+  // it. This should not succeed because live logs are not allowed to be
+  // deleted.
+ std::unique_ptr<LogFile> alive_log = std::move(logfiles.back());
+ ASSERT_EQ(alive_log->Type(), kAliveLogFile);
+ ASSERT_OK(env_->FileExists(wal_dir_ + "/" + alive_log->PathName()));
+ fprintf(stdout, "Deleting alive log file %s\n",
+ alive_log->PathName().c_str());
+ ASSERT_TRUE(!db_->DeleteFile(alive_log->PathName()).ok());
+ ASSERT_OK(env_->FileExists(wal_dir_ + "/" + alive_log->PathName()));
+ logfiles.clear();
+
+  // Call Flush to bring about a new working log file and add more keys.
+  // Call Flush again to flush out the memtable, move the alive log to the
+  // archived log, and then try to delete the archived log file.
+ FlushOptions fopts;
+ db_->Flush(fopts);
+ AddKeys(10, 0);
+ db_->Flush(fopts);
+ db_->GetSortedWalFiles(logfiles);
+ ASSERT_GT(logfiles.size(), 0UL);
+ std::unique_ptr<LogFile> archived_log = std::move(logfiles.front());
+ ASSERT_EQ(archived_log->Type(), kArchivedLogFile);
+ ASSERT_OK(env_->FileExists(wal_dir_ + "/" + archived_log->PathName()));
+ fprintf(stdout, "Deleting archived log file %s\n",
+ archived_log->PathName().c_str());
+ ASSERT_OK(db_->DeleteFile(archived_log->PathName()));
+ ASSERT_EQ(Status::NotFound(),
+ env_->FileExists(wal_dir_ + "/" + archived_log->PathName()));
+}
+
+TEST_F(DeleteFileTest, DeleteNonDefaultColumnFamily) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+ CreateAndReopenWithCF({"new_cf"}, options);
+
+ Random rnd(5);
+ for (int i = 0; i < 1000; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), handles_[1], test::RandomKey(&rnd, 10),
+ test::RandomKey(&rnd, 10)));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions(), handles_[1]));
+ for (int i = 0; i < 1000; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), handles_[1], test::RandomKey(&rnd, 10),
+ test::RandomKey(&rnd, 10)));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions(), handles_[1]));
+
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(2U, metadata.size());
+ ASSERT_EQ("new_cf", metadata[0].column_family_name);
+ ASSERT_EQ("new_cf", metadata[1].column_family_name);
+ auto old_file = metadata[0].smallest_seqno < metadata[1].smallest_seqno
+ ? metadata[0].name
+ : metadata[1].name;
+ auto new_file = metadata[0].smallest_seqno > metadata[1].smallest_seqno
+ ? metadata[0].name
+ : metadata[1].name;
+ ASSERT_TRUE(db_->DeleteFile(new_file).IsInvalidArgument());
+ ASSERT_OK(db_->DeleteFile(old_file));
+
+ {
+ std::unique_ptr<Iterator> itr(db_->NewIterator(ReadOptions(), handles_[1]));
+ int count = 0;
+ for (itr->SeekToFirst(); itr->Valid(); itr->Next()) {
+ ASSERT_OK(itr->status());
+ ++count;
+ }
+ ASSERT_EQ(count, 1000);
+ }
+
+ Close();
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "new_cf"}, options);
+
+ {
+ std::unique_ptr<Iterator> itr(db_->NewIterator(ReadOptions(), handles_[1]));
+ int count = 0;
+ for (itr->SeekToFirst(); itr->Valid(); itr->Next()) {
+ ASSERT_OK(itr->status());
+ ++count;
+ }
+ ASSERT_EQ(count, 1000);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+extern "C" {
+void RegisterCustomObjects(int argc, char** argv);
+}
+#else
+void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {}
+#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as DBImpl::DeleteFile is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/error_handler.cc b/src/rocksdb/db/error_handler.cc
new file mode 100644
index 000000000..3ba4d9fd9
--- /dev/null
+++ b/src/rocksdb/db/error_handler.cc
@@ -0,0 +1,344 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/error_handler.h"
+#include "db/db_impl/db_impl.h"
+#include "db/event_helpers.h"
+#include "file/sst_file_manager_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Maps to help decide the severity of an error based on the
+// BackgroundErrorReason, Code, SubCode and whether db_options.paranoid_checks
+// is set or not. There are 3 maps, going from most specific to least specific
+// (i.e. from all 4 fields in a tuple to only the BackgroundErrorReason and
+// paranoid_checks). The less specific maps serve as a catch-all in case we
+// miss a specific error code or subcode. (A condensed sketch of the lookup
+// order follows the map definitions below.)
+std::map<std::tuple<BackgroundErrorReason, Status::Code, Status::SubCode, bool>,
+ Status::Severity>
+ ErrorSeverityMap = {
+ // Errors during BG compaction
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kIOError, Status::SubCode::kNoSpace,
+ true),
+ Status::Severity::kSoftError},
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kIOError, Status::SubCode::kNoSpace,
+ false),
+ Status::Severity::kNoError},
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kIOError, Status::SubCode::kSpaceLimit,
+ true),
+ Status::Severity::kHardError},
+ // Errors during BG flush
+ {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+ Status::SubCode::kNoSpace, true),
+ Status::Severity::kHardError},
+ {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+ Status::SubCode::kNoSpace, false),
+ Status::Severity::kNoError},
+ {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+ Status::SubCode::kSpaceLimit, true),
+ Status::Severity::kHardError},
+ // Errors during Write
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+ Status::Code::kIOError, Status::SubCode::kNoSpace,
+ true),
+ Status::Severity::kHardError},
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+ Status::Code::kIOError, Status::SubCode::kNoSpace,
+ false),
+ Status::Severity::kHardError},
+};
+
+std::map<std::tuple<BackgroundErrorReason, Status::Code, bool>, Status::Severity>
+ DefaultErrorSeverityMap = {
+ // Errors during BG compaction
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kCorruption, true),
+ Status::Severity::kUnrecoverableError},
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kCorruption, false),
+ Status::Severity::kNoError},
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kIOError, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kIOError, false),
+ Status::Severity::kNoError},
+ // Errors during BG flush
+ {std::make_tuple(BackgroundErrorReason::kFlush,
+ Status::Code::kCorruption, true),
+ Status::Severity::kUnrecoverableError},
+ {std::make_tuple(BackgroundErrorReason::kFlush,
+ Status::Code::kCorruption, false),
+ Status::Severity::kNoError},
+ {std::make_tuple(BackgroundErrorReason::kFlush,
+ Status::Code::kIOError, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kFlush,
+ Status::Code::kIOError, false),
+ Status::Severity::kNoError},
+ // Errors during Write
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+ Status::Code::kCorruption, true),
+ Status::Severity::kUnrecoverableError},
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+ Status::Code::kCorruption, false),
+ Status::Severity::kNoError},
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+ Status::Code::kIOError, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+ Status::Code::kIOError, false),
+ Status::Severity::kNoError},
+};
+
+std::map<std::tuple<BackgroundErrorReason, bool>, Status::Severity>
+ DefaultReasonMap = {
+ // Errors during BG compaction
+ {std::make_tuple(BackgroundErrorReason::kCompaction, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kCompaction, false),
+ Status::Severity::kNoError},
+ // Errors during BG flush
+ {std::make_tuple(BackgroundErrorReason::kFlush, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kFlush, false),
+ Status::Severity::kNoError},
+ // Errors during Write
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback, false),
+ Status::Severity::kFatalError},
+ // Errors during Memtable update
+ {std::make_tuple(BackgroundErrorReason::kMemTable, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kMemTable, false),
+ Status::Severity::kFatalError},
+};
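Taken together, the three maps are consulted from most specific to least specific. The sketch below restates the fall-through order that SetBGError() applies further down; the helper function is illustrative and not part of this file.

// Illustrative only: mirrors the lookup order used by ErrorHandler::SetBGError().
static Status::Severity LookupSeverityExample(BackgroundErrorReason reason,
                                              Status::Code code,
                                              Status::SubCode subcode,
                                              bool paranoid_checks) {
  // 1) Most specific: (reason, code, subcode, paranoid_checks).
  auto it = ErrorSeverityMap.find(
      std::make_tuple(reason, code, subcode, paranoid_checks));
  if (it != ErrorSeverityMap.end()) {
    return it->second;
  }
  // 2) Drop the subcode: (reason, code, paranoid_checks).
  auto it2 = DefaultErrorSeverityMap.find(
      std::make_tuple(reason, code, paranoid_checks));
  if (it2 != DefaultErrorSeverityMap.end()) {
    return it2->second;
  }
  // 3) Catch-all on (reason, paranoid_checks); default to fatal if unmapped.
  auto it3 = DefaultReasonMap.find(std::make_tuple(reason, paranoid_checks));
  return it3 != DefaultReasonMap.end() ? it3->second
                                       : Status::Severity::kFatalError;
}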
+
+void ErrorHandler::CancelErrorRecovery() {
+#ifndef ROCKSDB_LITE
+ db_mutex_->AssertHeld();
+
+ // We'll release the lock before calling sfm, so make sure no new
+ // recovery gets scheduled at that point
+ auto_recovery_ = false;
+ SstFileManagerImpl* sfm = reinterpret_cast<SstFileManagerImpl*>(
+ db_options_.sst_file_manager.get());
+ if (sfm) {
+ // This may or may not cancel a pending recovery
+ db_mutex_->Unlock();
+ bool cancelled = sfm->CancelErrorRecovery(this);
+ db_mutex_->Lock();
+ if (cancelled) {
+ recovery_in_prog_ = false;
+ }
+ }
+#endif
+}
+
+// This is the main function for looking at an error during a background
+// operation and deciding the severity, and error recovery strategy. The high
+// level algorithm is as follows -
+// 1. Classify the severity of the error based on the ErrorSeverityMap,
+// DefaultErrorSeverityMap and DefaultReasonMap defined earlier
+// 2. Call a Status code specific override function to adjust the severity
+// if needed. The reason for this is our ability to recover may depend on
+// the exact options enabled in DBOptions
+// 3. Determine if auto recovery is possible. A listener notification callback
+//    is called, which can disable the auto recovery even if we decide it's
+//    feasible.
+// 4. For Status::NoSpace() errors, rely on SstFileManagerImpl to control
+// the actual recovery. If no sst file manager is specified in DBOptions,
+// a default one is allocated during DB::Open(), so there will always be
+// one.
+// This can also get called as part of a recovery operation. In that case, we
+// also track the error separately in recovery_error_ so we can tell in the
+// end whether recovery succeeded or not
+Status ErrorHandler::SetBGError(const Status& bg_err, BackgroundErrorReason reason) {
+ db_mutex_->AssertHeld();
+
+ if (bg_err.ok()) {
+ return Status::OK();
+ }
+
+ bool paranoid = db_options_.paranoid_checks;
+ Status::Severity sev = Status::Severity::kFatalError;
+ Status new_bg_err;
+ bool found = false;
+
+ {
+ auto entry = ErrorSeverityMap.find(std::make_tuple(reason, bg_err.code(),
+ bg_err.subcode(), paranoid));
+ if (entry != ErrorSeverityMap.end()) {
+ sev = entry->second;
+ found = true;
+ }
+ }
+
+ if (!found) {
+ auto entry = DefaultErrorSeverityMap.find(std::make_tuple(reason,
+ bg_err.code(), paranoid));
+ if (entry != DefaultErrorSeverityMap.end()) {
+ sev = entry->second;
+ found = true;
+ }
+ }
+
+ if (!found) {
+ auto entry = DefaultReasonMap.find(std::make_tuple(reason, paranoid));
+ if (entry != DefaultReasonMap.end()) {
+ sev = entry->second;
+ }
+ }
+
+ new_bg_err = Status(bg_err, sev);
+
+ // Check if recovery is currently in progress. If it is, we will save this
+ // error so we can check it at the end to see if recovery succeeded or not
+ if (recovery_in_prog_ && recovery_error_.ok()) {
+ recovery_error_ = new_bg_err;
+ }
+
+ bool auto_recovery = auto_recovery_;
+ if (new_bg_err.severity() >= Status::Severity::kFatalError && auto_recovery) {
+ auto_recovery = false;
+ }
+
+ // Allow some error specific overrides
+ if (new_bg_err == Status::NoSpace()) {
+ new_bg_err = OverrideNoSpaceError(new_bg_err, &auto_recovery);
+ }
+
+ if (!new_bg_err.ok()) {
+ Status s = new_bg_err;
+ EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, &s,
+ db_mutex_, &auto_recovery);
+ if (!s.ok() && (s.severity() > bg_error_.severity())) {
+ bg_error_ = s;
+ } else {
+      // This error is less severe than the previously encountered error.
+      // Don't take any further action
+ return bg_error_;
+ }
+ }
+
+ if (auto_recovery) {
+ recovery_in_prog_ = true;
+
+ // Kick-off error specific recovery
+ if (bg_error_ == Status::NoSpace()) {
+ RecoverFromNoSpace();
+ }
+ }
+ return bg_error_;
+}
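As a rough picture of the caller side (the actual call sites live in db_impl*.cc and are only assumed here): a failing background job reports its status through SetBGError() with the DB mutex held, and foreground paths then consult the recorded severity.

// Sketch only; DBImpl's real plumbing may differ in detail.
void ReportFlushFailureExample(ErrorHandler* error_handler,
                               InstrumentedMutex* db_mutex, const Status& s) {
  if (!s.ok()) {
    InstrumentedMutexLock l(db_mutex);  // SetBGError() expects the mutex held
    error_handler->SetBGError(s, BackgroundErrorReason::kFlush);
  }
  // Afterwards, foreground writes can check error_handler->IsDBStopped(), and
  // a manual DB::Resume() ends up in RecoverFromBGError(/*is_manual=*/true).
}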
+
+Status ErrorHandler::OverrideNoSpaceError(Status bg_error,
+ bool* auto_recovery) {
+#ifndef ROCKSDB_LITE
+ if (bg_error.severity() >= Status::Severity::kFatalError) {
+ return bg_error;
+ }
+
+ if (db_options_.sst_file_manager.get() == nullptr) {
+    // We rely on the SFM to poll for enough disk space and recover; without
+    // one, automatic recovery is not possible
+ *auto_recovery = false;
+ return bg_error;
+ }
+
+ if (db_options_.allow_2pc &&
+ (bg_error.severity() <= Status::Severity::kSoftError)) {
+ // Don't know how to recover, as the contents of the current WAL file may
+ // be inconsistent, and it may be needed for 2PC. If 2PC is not enabled,
+ // we can just flush the memtable and discard the log
+ *auto_recovery = false;
+ return Status(bg_error, Status::Severity::kFatalError);
+ }
+
+ {
+ uint64_t free_space;
+ if (db_options_.env->GetFreeSpace(db_options_.db_paths[0].path,
+ &free_space) == Status::NotSupported()) {
+ *auto_recovery = false;
+ }
+ }
+
+ return bg_error;
+#else
+ (void)auto_recovery;
+ return Status(bg_error, Status::Severity::kFatalError);
+#endif
+}
+
+void ErrorHandler::RecoverFromNoSpace() {
+#ifndef ROCKSDB_LITE
+ SstFileManagerImpl* sfm =
+ reinterpret_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
+
+ // Inform SFM of the error, so it can kick-off the recovery
+ if (sfm) {
+ sfm->StartErrorRecovery(this, bg_error_);
+ }
+#endif
+}
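RecoverFromNoSpace() only has something to hand the error to when an SstFileManager is attached to the DB. As noted above, DB::Open() allocates a default one, but an explicit, shared manager can be configured with the public NewSstFileManager() factory; the snippet below is a sketch of that setup, not code from this file (it assumes rocksdb/options.h and rocksdb/sst_file_manager.h).

// Sketch: attach a shared SstFileManager so NoSpace errors raised here can be
// retried automatically once free space is reclaimed.
static void AttachSstFileManagerExample(Options* options) {
  std::shared_ptr<SstFileManager> sfm(NewSstFileManager(Env::Default()));
  // The same manager may be shared across several DB instances.
  options->sst_file_manager = sfm;
}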
+
+Status ErrorHandler::ClearBGError() {
+#ifndef ROCKSDB_LITE
+ db_mutex_->AssertHeld();
+
+ // Signal that recovery succeeded
+ if (recovery_error_.ok()) {
+ Status old_bg_error = bg_error_;
+ bg_error_ = Status::OK();
+ recovery_in_prog_ = false;
+ EventHelpers::NotifyOnErrorRecoveryCompleted(db_options_.listeners,
+ old_bg_error, db_mutex_);
+ }
+ return recovery_error_;
+#else
+ return bg_error_;
+#endif
+}
+
+Status ErrorHandler::RecoverFromBGError(bool is_manual) {
+#ifndef ROCKSDB_LITE
+ InstrumentedMutexLock l(db_mutex_);
+    // If it's a manual recovery and there's a background recovery in
+    // progress, return a busy status
+ // return busy status
+ if (recovery_in_prog_) {
+ return Status::Busy();
+ }
+ recovery_in_prog_ = true;
+ }
+
+ if (bg_error_.severity() == Status::Severity::kSoftError) {
+ // Simply clear the background error and return
+ recovery_error_ = Status::OK();
+ return ClearBGError();
+ }
+
+ // Reset recovery_error_. We will use this to record any errors that happen
+ // during the recovery process. While recovering, the only operations that
+ // can generate background errors should be the flush operations
+ recovery_error_ = Status::OK();
+ Status s = db_->ResumeImpl();
+  // For manual recovery, shutdown, and fatal error cases, set
+ // recovery_in_prog_ to false. For automatic background recovery, leave it
+ // as is regardless of success or failure as it will be retried
+ if (is_manual || s.IsShutdownInProgress() ||
+ bg_error_.severity() >= Status::Severity::kFatalError) {
+ recovery_in_prog_ = false;
+ }
+ return s;
+#else
+ (void)is_manual;
+ return bg_error_;
+#endif
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/error_handler.h b/src/rocksdb/db/error_handler.h
new file mode 100644
index 000000000..7276f6510
--- /dev/null
+++ b/src/rocksdb/db/error_handler.h
@@ -0,0 +1,75 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+
+class ErrorHandler {
+ public:
+ ErrorHandler(DBImpl* db, const ImmutableDBOptions& db_options,
+ InstrumentedMutex* db_mutex)
+ : db_(db),
+ db_options_(db_options),
+ bg_error_(Status::OK()),
+ recovery_error_(Status::OK()),
+ db_mutex_(db_mutex),
+ auto_recovery_(false),
+ recovery_in_prog_(false) {}
+ ~ErrorHandler() {}
+
+ void EnableAutoRecovery() { auto_recovery_ = true; }
+
+ Status::Severity GetErrorSeverity(BackgroundErrorReason reason,
+ Status::Code code,
+ Status::SubCode subcode);
+
+ Status SetBGError(const Status& bg_err, BackgroundErrorReason reason);
+
+ Status GetBGError() { return bg_error_; }
+
+ Status GetRecoveryError() { return recovery_error_; }
+
+ Status ClearBGError();
+
+ bool IsDBStopped() {
+ return !bg_error_.ok() &&
+ bg_error_.severity() >= Status::Severity::kHardError;
+ }
+
+ bool IsBGWorkStopped() {
+ return !bg_error_.ok() &&
+ (bg_error_.severity() >= Status::Severity::kHardError ||
+ !auto_recovery_);
+ }
+
+ bool IsRecoveryInProgress() { return recovery_in_prog_; }
+
+ Status RecoverFromBGError(bool is_manual = false);
+ void CancelErrorRecovery();
+
+ private:
+ DBImpl* db_;
+ const ImmutableDBOptions& db_options_;
+ Status bg_error_;
+  // A separate Status variable used to record any errors encountered during
+  // the process of recovering from hard errors
+ Status recovery_error_;
+ InstrumentedMutex* db_mutex_;
+ // A flag indicating whether automatic recovery from errors is enabled
+ bool auto_recovery_;
+ bool recovery_in_prog_;
+
+ Status OverrideNoSpaceError(Status bg_error, bool* auto_recovery);
+ void RecoverFromNoSpace();
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/error_handler_test.cc b/src/rocksdb/db/error_handler_test.cc
new file mode 100644
index 000000000..b9d78490c
--- /dev/null
+++ b/src/rocksdb/db/error_handler_test.cc
@@ -0,0 +1,871 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#ifndef ROCKSDB_LITE
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/sst_file_manager.h"
+#include "test_util/fault_injection_test_env.h"
+#if !defined(ROCKSDB_LITE)
+#include "test_util/sync_point.h"
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBErrorHandlingTest : public DBTestBase {
+ public:
+ DBErrorHandlingTest() : DBTestBase("/db_error_handling_test") {}
+
+ std::string GetManifestNameFromLiveFiles() {
+ std::vector<std::string> live_files;
+ uint64_t manifest_size;
+
+ dbfull()->GetLiveFiles(live_files, &manifest_size, false);
+ for (auto& file : live_files) {
+ uint64_t num = 0;
+ FileType type;
+ if (ParseFileName(file, &num, &type) && type == kDescriptorFile) {
+ return file;
+ }
+ }
+ return "";
+ }
+};
+
+class DBErrorHandlingEnv : public EnvWrapper {
+ public:
+ DBErrorHandlingEnv() : EnvWrapper(Env::Default()),
+ trig_no_space(false), trig_io_error(false) {}
+
+ void SetTrigNoSpace() {trig_no_space = true;}
+ void SetTrigIoError() {trig_io_error = true;}
+ private:
+ bool trig_no_space;
+ bool trig_io_error;
+};
+
+class ErrorHandlerListener : public EventListener {
+ public:
+ ErrorHandlerListener()
+ : mutex_(),
+ cv_(&mutex_),
+ no_auto_recovery_(false),
+ recovery_complete_(false),
+ file_creation_started_(false),
+ override_bg_error_(false),
+ file_count_(0),
+ fault_env_(nullptr) {}
+
+ void OnTableFileCreationStarted(
+ const TableFileCreationBriefInfo& /*ti*/) override {
+ InstrumentedMutexLock l(&mutex_);
+ file_creation_started_ = true;
+ if (file_count_ > 0) {
+ if (--file_count_ == 0) {
+ fault_env_->SetFilesystemActive(false, file_creation_error_);
+ file_creation_error_ = Status::OK();
+ }
+ }
+ cv_.SignalAll();
+ }
+
+ void OnErrorRecoveryBegin(BackgroundErrorReason /*reason*/,
+ Status /*bg_error*/,
+ bool* auto_recovery) override {
+ if (*auto_recovery && no_auto_recovery_) {
+ *auto_recovery = false;
+ }
+ }
+
+ void OnErrorRecoveryCompleted(Status /*old_bg_error*/) override {
+ InstrumentedMutexLock l(&mutex_);
+ recovery_complete_ = true;
+ cv_.SignalAll();
+ }
+
+ bool WaitForRecovery(uint64_t /*abs_time_us*/) {
+ InstrumentedMutexLock l(&mutex_);
+ while (!recovery_complete_) {
+ cv_.Wait(/*abs_time_us*/);
+ }
+ if (recovery_complete_) {
+ recovery_complete_ = false;
+ return true;
+ }
+ return false;
+ }
+
+ void WaitForTableFileCreationStarted(uint64_t /*abs_time_us*/) {
+ InstrumentedMutexLock l(&mutex_);
+ while (!file_creation_started_) {
+ cv_.Wait(/*abs_time_us*/);
+ }
+ file_creation_started_ = false;
+ }
+
+ void OnBackgroundError(BackgroundErrorReason /*reason*/,
+ Status* bg_error) override {
+ if (override_bg_error_) {
+ *bg_error = bg_error_;
+ override_bg_error_ = false;
+ }
+ }
+
+ void EnableAutoRecovery(bool enable = true) { no_auto_recovery_ = !enable; }
+
+ void OverrideBGError(Status bg_err) {
+ bg_error_ = bg_err;
+ override_bg_error_ = true;
+ }
+
+ void InjectFileCreationError(FaultInjectionTestEnv* env, int file_count,
+ Status s) {
+ fault_env_ = env;
+ file_count_ = file_count;
+ file_creation_error_ = s;
+ }
+
+ private:
+ InstrumentedMutex mutex_;
+ InstrumentedCondVar cv_;
+ bool no_auto_recovery_;
+ bool recovery_complete_;
+ bool file_creation_started_;
+ bool override_bg_error_;
+ int file_count_;
+ Status file_creation_error_;
+ Status bg_error_;
+ FaultInjectionTestEnv* fault_env_;
+};
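The listener above folds fault injection and test synchronization into the standard EventListener hooks. Stripped down to what an application would typically register, the same two hooks look like the sketch below; the class name and logging are illustrative only.

// Illustrative application-side listener; not used by the tests below.
class LoggingErrorListener : public EventListener {
 public:
  void OnBackgroundError(BackgroundErrorReason /*reason*/,
                         Status* bg_error) override {
    // Inspect (or, like the test listener, override) the error RocksDB is
    // about to record as its background error.
    fprintf(stderr, "background error: %s\n", bg_error->ToString().c_str());
  }
  void OnErrorRecoveryCompleted(Status old_bg_error) override {
    fprintf(stderr, "recovered from: %s\n", old_bg_error.ToString().c_str());
  }
};
// Registered the same way as the test listener:
//   options.listeners.emplace_back(std::make_shared<LoggingErrorListener>());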
+
+TEST_F(DBErrorHandlingTest, FLushWriteError) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(Env::Default()));
+ std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener());
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.env = fault_env.get();
+ options.listeners.emplace_back(listener);
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+
+ Put(Key(0), "val");
+ SyncPoint::GetInstance()->SetCallBack(
+ "FlushJob::Start", [&](void *) {
+ fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_env->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_EQ(s, Status::OK());
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingTest, ManifestWriteError) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(Env::Default()));
+ std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener());
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.env = fault_env.get();
+ options.listeners.emplace_back(listener);
+ Status s;
+ std::string old_manifest;
+ std::string new_manifest;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+ old_manifest = GetManifestNameFromLiveFiles();
+
+ Put(Key(0), "val");
+ Flush();
+ Put(Key(1), "val");
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest", [&](void *) {
+ fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_env->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_EQ(s, Status::OK());
+
+ new_manifest = GetManifestNameFromLiveFiles();
+ ASSERT_NE(new_manifest, old_manifest);
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ Close();
+}
+
+TEST_F(DBErrorHandlingTest, DoubleManifestWriteError) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(Env::Default()));
+ std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener());
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.env = fault_env.get();
+ options.listeners.emplace_back(listener);
+ Status s;
+ std::string old_manifest;
+ std::string new_manifest;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+ old_manifest = GetManifestNameFromLiveFiles();
+
+ Put(Key(0), "val");
+ Flush();
+ Put(Key(1), "val");
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest", [&](void *) {
+ fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ fault_env->SetFilesystemActive(true);
+
+ // This Resume() will attempt to create a new manifest file and fail again
+ s = dbfull()->Resume();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ fault_env->SetFilesystemActive(true);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ // A successful Resume() will create a new manifest file
+ s = dbfull()->Resume();
+ ASSERT_EQ(s, Status::OK());
+
+ new_manifest = GetManifestNameFromLiveFiles();
+ ASSERT_NE(new_manifest, old_manifest);
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ Close();
+}
+
+TEST_F(DBErrorHandlingTest, CompactionManifestWriteError) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(Env::Default()));
+ std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener());
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.emplace_back(listener);
+ options.env = fault_env.get();
+ Status s;
+ std::string old_manifest;
+ std::string new_manifest;
+ std::atomic<bool> fail_manifest(false);
+ DestroyAndReopen(options);
+ old_manifest = GetManifestNameFromLiveFiles();
+
+ Put(Key(0), "val");
+ Put(Key(2), "val");
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ // Wait for flush of 2nd L0 file before starting compaction
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"},
+ // Wait for compaction to detect manifest write error
+ {"BackgroundCallCompaction:1", "CompactionManifestWriteError:0"},
+ // Make compaction thread wait for error to be cleared
+ {"CompactionManifestWriteError:1",
+ "DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"},
+ // Wait for DB instance to clear bg_error before calling
+ // TEST_WaitForCompact
+ {"SstFileManagerImpl::ErrorCleared", "CompactionManifestWriteError:2"}});
+ // trigger manifest write failure in compaction thread
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void*) { fail_manifest.store(true); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+ if (fail_manifest.load()) {
+ fault_env->SetFilesystemActive(false,
+ Status::NoSpace("Out of space"));
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Put(Key(1), "val");
+ // This Flush will trigger a compaction, which will fail when appending to
+ // the manifest
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+
+ TEST_SYNC_POINT("CompactionManifestWriteError:0");
+ // Clear all errors so when the compaction is retried, it will succeed
+ fault_env->SetFilesystemActive(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ TEST_SYNC_POINT("CompactionManifestWriteError:1");
+ TEST_SYNC_POINT("CompactionManifestWriteError:2");
+
+ s = dbfull()->TEST_WaitForCompact();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(s, Status::OK());
+
+ new_manifest = GetManifestNameFromLiveFiles();
+ ASSERT_NE(new_manifest, old_manifest);
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ ASSERT_EQ("val", Get(Key(2)));
+ Close();
+}
+
+TEST_F(DBErrorHandlingTest, CompactionWriteError) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(Env::Default()));
+ std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener());
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.emplace_back(listener);
+ options.env = fault_env.get();
+ Status s;
+ DestroyAndReopen(options);
+
+ Put(Key(0), "va;");
+ Put(Key(2), "va;");
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+
+ listener->OverrideBGError(
+ Status(Status::NoSpace(), Status::Severity::kHardError)
+ );
+ listener->EnableAutoRecovery(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void*) {
+ fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Put(Key(1), "val");
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+
+ fault_env->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_EQ(s, Status::OK());
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingTest, CorruptionError) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(Env::Default()));
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.env = fault_env.get();
+ Status s;
+ DestroyAndReopen(options);
+
+ Put(Key(0), "va;");
+ Put(Key(2), "va;");
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void*) {
+ fault_env->SetFilesystemActive(false, Status::Corruption("Corruption"));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Put(Key(1), "val");
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s.severity(),
+ ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError);
+
+ fault_env->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_NE(s, Status::OK());
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingTest, AutoRecoverFlushError) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(Env::Default()));
+ std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener());
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.env = fault_env.get();
+ options.listeners.emplace_back(listener);
+ Status s;
+
+ listener->EnableAutoRecovery();
+ DestroyAndReopen(options);
+
+ Put(Key(0), "val");
+ SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+ fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_env->SetFilesystemActive(true);
+ ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+
+ s = Put(Key(1), "val");
+ ASSERT_EQ(s, Status::OK());
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingTest, FailRecoverFlushError) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(Env::Default()));
+ std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener());
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.env = fault_env.get();
+ options.listeners.emplace_back(listener);
+ Status s;
+
+ listener->EnableAutoRecovery();
+ DestroyAndReopen(options);
+
+ Put(Key(0), "val");
+ SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+ fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+  // We should be able to shut down the database while auto recovery is going
+  // on in the background
+ Close();
+ DestroyDB(dbname_, options);
+}
+
+TEST_F(DBErrorHandlingTest, WALWriteError) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(Env::Default()));
+ std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener());
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.writable_file_max_buffer_size = 32768;
+ options.env = fault_env.get();
+ options.listeners.emplace_back(listener);
+ Status s;
+ Random rnd(301);
+
+ listener->EnableAutoRecovery();
+ DestroyAndReopen(options);
+
+ {
+ WriteBatch batch;
+
+ for (auto i = 0; i<100; ++i) {
+ batch.Put(Key(i), RandomString(&rnd, 1024));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK());
+ };
+
+ {
+ WriteBatch batch;
+ int write_error = 0;
+
+ for (auto i = 100; i<199; ++i) {
+ batch.Put(Key(i), RandomString(&rnd, 1024));
+ }
+
+    SyncPoint::GetInstance()->SetCallBack(
+        "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) {
+ write_error++;
+ if (write_error > 2) {
+ fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ WriteOptions wopts;
+ wopts.sync = true;
+ s = dbfull()->Write(wopts, &batch);
+ ASSERT_EQ(s, s.NoSpace());
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_env->SetFilesystemActive(true);
+ ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+ for (auto i=0; i<199; ++i) {
+ if (i < 100) {
+ ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+ } else {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ }
+ Reopen(options);
+ for (auto i=0; i<199; ++i) {
+ if (i < 100) {
+ ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+ } else {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ }
+ Close();
+}
+
+TEST_F(DBErrorHandlingTest, MultiCFWALWriteError) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(Env::Default()));
+ std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener());
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.writable_file_max_buffer_size = 32768;
+ options.env = fault_env.get();
+ options.listeners.emplace_back(listener);
+ Status s;
+ Random rnd(301);
+
+ listener->EnableAutoRecovery();
+ CreateAndReopenWithCF({"one", "two", "three"}, options);
+
+ {
+ WriteBatch batch;
+
+ for (auto i = 1; i < 4; ++i) {
+ for (auto j = 0; j < 100; ++j) {
+ batch.Put(handles_[i], Key(j), RandomString(&rnd, 1024));
+ }
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK());
+ };
+
+ {
+ WriteBatch batch;
+ int write_error = 0;
+
+ // Write to one CF
+ for (auto i = 100; i < 199; ++i) {
+ batch.Put(handles_[2], Key(i), RandomString(&rnd, 1024));
+ }
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) {
+ write_error++;
+ if (write_error > 2) {
+ fault_env->SetFilesystemActive(false,
+ Status::NoSpace("Out of space"));
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ WriteOptions wopts;
+ wopts.sync = true;
+ s = dbfull()->Write(wopts, &batch);
+ ASSERT_EQ(s, s.NoSpace());
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_env->SetFilesystemActive(true);
+ ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+
+ for (auto i = 1; i < 4; ++i) {
+ // Every CF should have been flushed
+ ASSERT_EQ(NumTableFilesAtLevel(0, i), 1);
+ }
+
+ for (auto i = 1; i < 4; ++i) {
+ for (auto j = 0; j < 199; ++j) {
+ if (j < 100) {
+ ASSERT_NE(Get(i, Key(j)), "NOT_FOUND");
+ } else {
+ ASSERT_EQ(Get(i, Key(j)), "NOT_FOUND");
+ }
+ }
+ }
+ ReopenWithColumnFamilies({"default", "one", "two", "three"}, options);
+ for (auto i = 1; i < 4; ++i) {
+ for (auto j = 0; j < 199; ++j) {
+ if (j < 100) {
+ ASSERT_NE(Get(i, Key(j)), "NOT_FOUND");
+ } else {
+ ASSERT_EQ(Get(i, Key(j)), "NOT_FOUND");
+ }
+ }
+ }
+ Close();
+}
+
+TEST_F(DBErrorHandlingTest, MultiDBCompactionError) {
+ FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(Env::Default());
+ std::vector<std::unique_ptr<FaultInjectionTestEnv>> fault_env;
+ std::vector<Options> options;
+ std::vector<std::shared_ptr<ErrorHandlerListener>> listener;
+ std::vector<DB*> db;
+ std::shared_ptr<SstFileManager> sfm(NewSstFileManager(def_env));
+ int kNumDbInstances = 3;
+ Random rnd(301);
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ listener.emplace_back(new ErrorHandlerListener());
+ options.emplace_back(GetDefaultOptions());
+ fault_env.emplace_back(new FaultInjectionTestEnv(Env::Default()));
+ options[i].create_if_missing = true;
+ options[i].level0_file_num_compaction_trigger = 2;
+ options[i].writable_file_max_buffer_size = 32768;
+ options[i].env = fault_env[i].get();
+ options[i].listeners.emplace_back(listener[i]);
+ options[i].sst_file_manager = sfm;
+ DB* dbptr;
+ char buf[16];
+
+ listener[i]->EnableAutoRecovery();
+ // Setup for returning error for the 3rd SST, which would be level 1
+ listener[i]->InjectFileCreationError(fault_env[i].get(), 3,
+ Status::NoSpace("Out of space"));
+ snprintf(buf, sizeof(buf), "_%d", i);
+ DestroyDB(dbname_ + std::string(buf), options[i]);
+ ASSERT_EQ(DB::Open(options[i], dbname_ + std::string(buf), &dbptr),
+ Status::OK());
+ db.emplace_back(dbptr);
+ }
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ WriteBatch batch;
+
+ for (auto j = 0; j <= 100; ++j) {
+ batch.Put(Key(j), RandomString(&rnd, 1024));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK());
+ ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK());
+ }
+
+ def_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ WriteBatch batch;
+
+    // Write a second batch of keys
+ for (auto j = 100; j < 199; ++j) {
+ batch.Put(Key(j), RandomString(&rnd, 1024));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK());
+ ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK());
+ }
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ Status s = static_cast<DBImpl*>(db[i])->TEST_WaitForCompact(true);
+ ASSERT_EQ(s.severity(), Status::Severity::kSoftError);
+ fault_env[i]->SetFilesystemActive(true);
+ }
+
+ def_env->SetFilesystemActive(true);
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ std::string prop;
+ ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true);
+ ASSERT_EQ(static_cast<DBImpl*>(db[i])->TEST_WaitForCompact(true),
+ Status::OK());
+ EXPECT_TRUE(db[i]->GetProperty(
+ "rocksdb.num-files-at-level" + NumberToString(0), &prop));
+ EXPECT_EQ(atoi(prop.c_str()), 0);
+ EXPECT_TRUE(db[i]->GetProperty(
+ "rocksdb.num-files-at-level" + NumberToString(1), &prop));
+ EXPECT_EQ(atoi(prop.c_str()), 1);
+ }
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ char buf[16];
+ snprintf(buf, sizeof(buf), "_%d", i);
+ delete db[i];
+ fault_env[i]->SetFilesystemActive(true);
+ if (getenv("KEEP_DB")) {
+ printf("DB is still at %s%s\n", dbname_.c_str(), buf);
+ } else {
+ Status s = DestroyDB(dbname_ + std::string(buf), options[i]);
+ }
+ }
+ options.clear();
+ sfm.reset();
+ delete def_env;
+}
+
+TEST_F(DBErrorHandlingTest, MultiDBVariousErrors) {
+ FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(Env::Default());
+ std::vector<std::unique_ptr<FaultInjectionTestEnv>> fault_env;
+ std::vector<Options> options;
+ std::vector<std::shared_ptr<ErrorHandlerListener>> listener;
+ std::vector<DB*> db;
+ std::shared_ptr<SstFileManager> sfm(NewSstFileManager(def_env));
+ int kNumDbInstances = 3;
+ Random rnd(301);
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ listener.emplace_back(new ErrorHandlerListener());
+ options.emplace_back(GetDefaultOptions());
+ fault_env.emplace_back(new FaultInjectionTestEnv(Env::Default()));
+ options[i].create_if_missing = true;
+ options[i].level0_file_num_compaction_trigger = 2;
+ options[i].writable_file_max_buffer_size = 32768;
+ options[i].env = fault_env[i].get();
+ options[i].listeners.emplace_back(listener[i]);
+ options[i].sst_file_manager = sfm;
+ DB* dbptr;
+ char buf[16];
+
+ listener[i]->EnableAutoRecovery();
+ switch (i) {
+ case 0:
+ // Setup for returning error for the 3rd SST, which would be level 1
+ listener[i]->InjectFileCreationError(fault_env[i].get(), 3,
+ Status::NoSpace("Out of space"));
+ break;
+ case 1:
+ // Setup for returning error after the 1st SST, which would result
+ // in a hard error
+ listener[i]->InjectFileCreationError(fault_env[i].get(), 2,
+ Status::NoSpace("Out of space"));
+ break;
+ default:
+ break;
+ }
+ snprintf(buf, sizeof(buf), "_%d", i);
+ DestroyDB(dbname_ + std::string(buf), options[i]);
+ ASSERT_EQ(DB::Open(options[i], dbname_ + std::string(buf), &dbptr),
+ Status::OK());
+ db.emplace_back(dbptr);
+ }
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ WriteBatch batch;
+
+ for (auto j = 0; j <= 100; ++j) {
+ batch.Put(Key(j), RandomString(&rnd, 1024));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK());
+ ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK());
+ }
+
+ def_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ WriteBatch batch;
+
+ // Write to one CF
+ for (auto j = 100; j < 199; ++j) {
+ batch.Put(Key(j), RandomString(&rnd, 1024));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK());
+ if (i != 1) {
+ ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK());
+ } else {
+ ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::NoSpace());
+ }
+ }
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ Status s = static_cast<DBImpl*>(db[i])->TEST_WaitForCompact(true);
+ switch (i) {
+ case 0:
+ ASSERT_EQ(s.severity(), Status::Severity::kSoftError);
+ break;
+ case 1:
+ ASSERT_EQ(s.severity(), Status::Severity::kHardError);
+ break;
+ case 2:
+ ASSERT_EQ(s, Status::OK());
+ break;
+ }
+ fault_env[i]->SetFilesystemActive(true);
+ }
+
+ def_env->SetFilesystemActive(true);
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ std::string prop;
+ if (i < 2) {
+ ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true);
+ }
+ if (i == 1) {
+ ASSERT_EQ(static_cast<DBImpl*>(db[i])->TEST_WaitForCompact(true),
+ Status::OK());
+ }
+ EXPECT_TRUE(db[i]->GetProperty(
+ "rocksdb.num-files-at-level" + NumberToString(0), &prop));
+ EXPECT_EQ(atoi(prop.c_str()), 0);
+ EXPECT_TRUE(db[i]->GetProperty(
+ "rocksdb.num-files-at-level" + NumberToString(1), &prop));
+ EXPECT_EQ(atoi(prop.c_str()), 1);
+ }
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ char buf[16];
+ snprintf(buf, sizeof(buf), "_%d", i);
+ fault_env[i]->SetFilesystemActive(true);
+ delete db[i];
+ if (getenv("KEEP_DB")) {
+ printf("DB is still at %s%s\n", dbname_.c_str(), buf);
+ } else {
+ DestroyDB(dbname_ + std::string(buf), options[i]);
+ }
+ }
+ options.clear();
+ delete def_env;
+}
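+
+// For illustration, the expected severities above follow from where the
+// injected NoSpace error lands: instance 0 fails on its 3rd SST (a level-1
+// compaction output) and reports a soft background error, instance 1 fails
+// on its 2nd SST (a flush output) and reports a hard error, and instance 2
+// has no injected error and stays at Status::OK().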
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as DBErrorHandlingTest is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/event_helpers.cc b/src/rocksdb/db/event_helpers.cc
new file mode 100644
index 000000000..57aa711fc
--- /dev/null
+++ b/src/rocksdb/db/event_helpers.cc
@@ -0,0 +1,223 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/event_helpers.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+template <class T>
+inline T SafeDivide(T a, T b) {
+ return b == 0 ? 0 : a / b;
+}
+} // namespace
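+
+// For illustration (hypothetical operands), SafeDivide protects the average
+// size computations below when a table has no entries:
+//   SafeDivide(uint64_t{1000}, uint64_t{0});    // -> 0
+//   SafeDivide(uint64_t{1000}, uint64_t{250});  // -> 4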
+
+void EventHelpers::AppendCurrentTime(JSONWriter* jwriter) {
+ *jwriter << "time_micros"
+ << std::chrono::duration_cast<std::chrono::microseconds>(
+ std::chrono::system_clock::now().time_since_epoch())
+ .count();
+}
+
+#ifndef ROCKSDB_LITE
+void EventHelpers::NotifyTableFileCreationStarted(
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const std::string& db_name, const std::string& cf_name,
+ const std::string& file_path, int job_id, TableFileCreationReason reason) {
+ TableFileCreationBriefInfo info;
+ info.db_name = db_name;
+ info.cf_name = cf_name;
+ info.file_path = file_path;
+ info.job_id = job_id;
+ info.reason = reason;
+ for (auto& listener : listeners) {
+ listener->OnTableFileCreationStarted(info);
+ }
+}
+#endif // !ROCKSDB_LITE
+
+void EventHelpers::NotifyOnBackgroundError(
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ BackgroundErrorReason reason, Status* bg_error, InstrumentedMutex* db_mutex,
+ bool* auto_recovery) {
+#ifndef ROCKSDB_LITE
+ if (listeners.size() == 0U) {
+ return;
+ }
+ db_mutex->AssertHeld();
+ // release lock while notifying events
+ db_mutex->Unlock();
+ for (auto& listener : listeners) {
+ listener->OnBackgroundError(reason, bg_error);
+ if (*auto_recovery) {
+ listener->OnErrorRecoveryBegin(reason, *bg_error, auto_recovery);
+ }
+ }
+ db_mutex->Lock();
+#else
+ (void)listeners;
+ (void)reason;
+ (void)bg_error;
+ (void)db_mutex;
+ (void)auto_recovery;
+#endif // ROCKSDB_LITE
+}
+
+void EventHelpers::LogAndNotifyTableFileCreationFinished(
+ EventLogger* event_logger,
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const std::string& db_name, const std::string& cf_name,
+ const std::string& file_path, int job_id, const FileDescriptor& fd,
+ uint64_t oldest_blob_file_number, const TableProperties& table_properties,
+ TableFileCreationReason reason, const Status& s) {
+ if (s.ok() && event_logger) {
+ JSONWriter jwriter;
+ AppendCurrentTime(&jwriter);
+ jwriter << "cf_name" << cf_name << "job" << job_id << "event"
+ << "table_file_creation"
+ << "file_number" << fd.GetNumber() << "file_size"
+ << fd.GetFileSize();
+
+ // table_properties
+ {
+ jwriter << "table_properties";
+ jwriter.StartObject();
+
+ // basic properties:
+ jwriter << "data_size" << table_properties.data_size << "index_size"
+ << table_properties.index_size << "index_partitions"
+ << table_properties.index_partitions << "top_level_index_size"
+ << table_properties.top_level_index_size
+ << "index_key_is_user_key"
+ << table_properties.index_key_is_user_key
+ << "index_value_is_delta_encoded"
+ << table_properties.index_value_is_delta_encoded << "filter_size"
+ << table_properties.filter_size << "raw_key_size"
+ << table_properties.raw_key_size << "raw_average_key_size"
+ << SafeDivide(table_properties.raw_key_size,
+ table_properties.num_entries)
+ << "raw_value_size" << table_properties.raw_value_size
+ << "raw_average_value_size"
+ << SafeDivide(table_properties.raw_value_size,
+ table_properties.num_entries)
+ << "num_data_blocks" << table_properties.num_data_blocks
+ << "num_entries" << table_properties.num_entries
+ << "num_deletions" << table_properties.num_deletions
+ << "num_merge_operands" << table_properties.num_merge_operands
+ << "num_range_deletions" << table_properties.num_range_deletions
+ << "format_version" << table_properties.format_version
+ << "fixed_key_len" << table_properties.fixed_key_len
+ << "filter_policy" << table_properties.filter_policy_name
+ << "column_family_name" << table_properties.column_family_name
+ << "column_family_id" << table_properties.column_family_id
+ << "comparator" << table_properties.comparator_name
+ << "merge_operator" << table_properties.merge_operator_name
+ << "prefix_extractor_name"
+ << table_properties.prefix_extractor_name << "property_collectors"
+ << table_properties.property_collectors_names << "compression"
+ << table_properties.compression_name << "compression_options"
+ << table_properties.compression_options << "creation_time"
+ << table_properties.creation_time << "oldest_key_time"
+ << table_properties.oldest_key_time << "file_creation_time"
+ << table_properties.file_creation_time;
+
+ // user collected properties
+ for (const auto& prop : table_properties.readable_properties) {
+ jwriter << prop.first << prop.second;
+ }
+ jwriter.EndObject();
+ }
+
+ if (oldest_blob_file_number != kInvalidBlobFileNumber) {
+ jwriter << "oldest_blob_file_number" << oldest_blob_file_number;
+ }
+
+ jwriter.EndObject();
+
+ event_logger->Log(jwriter);
+ }
+
+#ifndef ROCKSDB_LITE
+ if (listeners.size() == 0) {
+ return;
+ }
+ TableFileCreationInfo info;
+ info.db_name = db_name;
+ info.cf_name = cf_name;
+ info.file_path = file_path;
+ info.file_size = fd.file_size;
+ info.job_id = job_id;
+ info.table_properties = table_properties;
+ info.reason = reason;
+ info.status = s;
+ for (auto& listener : listeners) {
+ listener->OnTableFileCreated(info);
+ }
+#else
+ (void)listeners;
+ (void)db_name;
+ (void)cf_name;
+ (void)file_path;
+ (void)reason;
+#endif // !ROCKSDB_LITE
+}
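+
+// For illustration only (field values are made up), the JSON event logged
+// above has roughly this shape, with the nested "table_properties" object
+// elided:
+//   {"time_micros": 1596000000000000, "cf_name": "default", "job": 7,
+//    "event": "table_file_creation", "file_number": 42, "file_size": 4096,
+//    "table_properties": {...}}
+// Listeners receive the same data in structured form via OnTableFileCreated.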
+
+void EventHelpers::LogAndNotifyTableFileDeletion(
+ EventLogger* event_logger, int job_id, uint64_t file_number,
+ const std::string& file_path, const Status& status,
+ const std::string& dbname,
+ const std::vector<std::shared_ptr<EventListener>>& listeners) {
+ JSONWriter jwriter;
+ AppendCurrentTime(&jwriter);
+
+ jwriter << "job" << job_id << "event"
+ << "table_file_deletion"
+ << "file_number" << file_number;
+ if (!status.ok()) {
+ jwriter << "status" << status.ToString();
+ }
+
+ jwriter.EndObject();
+
+ event_logger->Log(jwriter);
+
+#ifndef ROCKSDB_LITE
+ TableFileDeletionInfo info;
+ info.db_name = dbname;
+ info.job_id = job_id;
+ info.file_path = file_path;
+ info.status = status;
+ for (auto& listener : listeners) {
+ listener->OnTableFileDeleted(info);
+ }
+#else
+ (void)file_path;
+ (void)dbname;
+ (void)listeners;
+#endif // !ROCKSDB_LITE
+}
+
+void EventHelpers::NotifyOnErrorRecoveryCompleted(
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ Status old_bg_error, InstrumentedMutex* db_mutex) {
+#ifndef ROCKSDB_LITE
+ if (listeners.size() == 0U) {
+ return;
+ }
+ db_mutex->AssertHeld();
+ // release lock while notifying events
+ db_mutex->Unlock();
+ for (auto& listener : listeners) {
+ listener->OnErrorRecoveryCompleted(old_bg_error);
+ }
+ db_mutex->Lock();
+#else
+ (void)listeners;
+ (void)old_bg_error;
+ (void)db_mutex;
+#endif // ROCKSDB_LITE
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/event_helpers.h b/src/rocksdb/db/event_helpers.h
new file mode 100644
index 000000000..87cc1cb8c
--- /dev/null
+++ b/src/rocksdb/db/event_helpers.h
@@ -0,0 +1,55 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/version_edit.h"
+#include "logging/event_logger.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/table_properties.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class EventHelpers {
+ public:
+ static void AppendCurrentTime(JSONWriter* json_writer);
+#ifndef ROCKSDB_LITE
+ static void NotifyTableFileCreationStarted(
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const std::string& db_name, const std::string& cf_name,
+ const std::string& file_path, int job_id, TableFileCreationReason reason);
+#endif // !ROCKSDB_LITE
+ static void NotifyOnBackgroundError(
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ BackgroundErrorReason reason, Status* bg_error,
+ InstrumentedMutex* db_mutex, bool* auto_recovery);
+ static void LogAndNotifyTableFileCreationFinished(
+ EventLogger* event_logger,
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const std::string& db_name, const std::string& cf_name,
+ const std::string& file_path, int job_id, const FileDescriptor& fd,
+ uint64_t oldest_blob_file_number, const TableProperties& table_properties,
+ TableFileCreationReason reason, const Status& s);
+ static void LogAndNotifyTableFileDeletion(
+ EventLogger* event_logger, int job_id,
+ uint64_t file_number, const std::string& file_path,
+ const Status& status, const std::string& db_name,
+ const std::vector<std::shared_ptr<EventListener>>& listeners);
+ static void NotifyOnErrorRecoveryCompleted(
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ Status bg_error, InstrumentedMutex* db_mutex);
+
+ private:
+ static void LogAndNotifyTableFileCreation(
+ EventLogger* event_logger,
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const FileDescriptor& fd, const TableFileCreationInfo& info);
+};
+
+} // namespace ROCKSDB_NAMESPACE
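+
+// Illustrative usage sketch (caller names are hypothetical): callers hold the
+// DB mutex, and the helper drops and re-acquires it around the listener
+// callbacks:
+//   // db_mutex held
+//   EventHelpers::NotifyOnErrorRecoveryCompleted(options.listeners,
+//                                                old_bg_error, &db_mutex);
+//   // db_mutex held again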
diff --git a/src/rocksdb/db/experimental.cc b/src/rocksdb/db/experimental.cc
new file mode 100644
index 000000000..d12882c8f
--- /dev/null
+++ b/src/rocksdb/db/experimental.cc
@@ -0,0 +1,50 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/experimental.h"
+
+#include "db/db_impl/db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace experimental {
+
+#ifndef ROCKSDB_LITE
+
+Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end) {
+ if (db == nullptr) {
+ return Status::InvalidArgument("DB is empty");
+ }
+
+ return db->SuggestCompactRange(column_family, begin, end);
+}
+
+Status PromoteL0(DB* db, ColumnFamilyHandle* column_family, int target_level) {
+ if (db == nullptr) {
+ return Status::InvalidArgument("Didn't recognize DB object");
+ }
+ return db->PromoteL0(column_family, target_level);
+}
+
+#else // ROCKSDB_LITE
+
+Status SuggestCompactRange(DB* /*db*/, ColumnFamilyHandle* /*column_family*/,
+ const Slice* /*begin*/, const Slice* /*end*/) {
+ return Status::NotSupported("Not supported in RocksDB LITE");
+}
+
+Status PromoteL0(DB* /*db*/, ColumnFamilyHandle* /*column_family*/,
+ int /*target_level*/) {
+ return Status::NotSupported("Not supported in RocksDB LITE");
+}
+
+#endif // ROCKSDB_LITE
+
+Status SuggestCompactRange(DB* db, const Slice* begin, const Slice* end) {
+ return SuggestCompactRange(db, db->DefaultColumnFamily(), begin, end);
+}
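+
+// Illustrative usage sketch (the DB handle and keys are placeholders):
+//   Slice begin("a"), end("z");
+//   Status s = experimental::SuggestCompactRange(db, &begin, &end);
+//   if (s.ok()) {
+//     s = experimental::PromoteL0(db, db->DefaultColumnFamily(), 1);
+//   }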
+
+} // namespace experimental
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/external_sst_file_basic_test.cc b/src/rocksdb/db/external_sst_file_basic_test.cc
new file mode 100644
index 000000000..b184df20e
--- /dev/null
+++ b/src/rocksdb/db/external_sst_file_basic_test.cc
@@ -0,0 +1,1128 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <functional>
+
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/sst_file_writer.h"
+#include "test_util/fault_injection_test_env.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+class ExternalSSTFileBasicTest
+ : public DBTestBase,
+ public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+ ExternalSSTFileBasicTest() : DBTestBase("/external_sst_file_basic_test") {
+ sst_files_dir_ = dbname_ + "/sst_files/";
+ fault_injection_test_env_.reset(new FaultInjectionTestEnv(Env::Default()));
+ DestroyAndRecreateExternalSSTFilesDir();
+ }
+
+ void DestroyAndRecreateExternalSSTFilesDir() {
+ test::DestroyDir(env_, sst_files_dir_);
+ env_->CreateDir(sst_files_dir_);
+ }
+
+ Status DeprecatedAddFile(const std::vector<std::string>& files,
+ bool move_files = false,
+ bool skip_snapshot_check = false) {
+ IngestExternalFileOptions opts;
+ opts.move_files = move_files;
+ opts.snapshot_consistency = !skip_snapshot_check;
+ opts.allow_global_seqno = false;
+ opts.allow_blocking_flush = false;
+ return db_->IngestExternalFile(files, opts);
+ }
+
+ Status GenerateAndAddExternalFile(
+ const Options options, std::vector<int> keys,
+ const std::vector<ValueType>& value_types,
+ std::vector<std::pair<int, int>> range_deletions, int file_id,
+ bool write_global_seqno, bool verify_checksums_before_ingest,
+ std::map<std::string, std::string>* true_data) {
+ assert(value_types.size() == 1 || keys.size() == value_types.size());
+ std::string file_path = sst_files_dir_ + ToString(file_id);
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ Status s = sst_file_writer.Open(file_path);
+ if (!s.ok()) {
+ return s;
+ }
+ for (size_t i = 0; i < range_deletions.size(); i++) {
+      // Account for the effect of range deletions on true_data before
+      // all point operations, even though sst_file_writer.DeleteRange
+      // must be called before the other sst_file_writer methods. This is
+      // because point writes take precedence over range deletions
+      // in the same ingested SST.
+ std::string start_key = Key(range_deletions[i].first);
+ std::string end_key = Key(range_deletions[i].second);
+ s = sst_file_writer.DeleteRange(start_key, end_key);
+ if (!s.ok()) {
+ sst_file_writer.Finish();
+ return s;
+ }
+ auto start_key_it = true_data->find(start_key);
+ if (start_key_it == true_data->end()) {
+ start_key_it = true_data->upper_bound(start_key);
+ }
+ auto end_key_it = true_data->find(end_key);
+ if (end_key_it == true_data->end()) {
+ end_key_it = true_data->upper_bound(end_key);
+ }
+ true_data->erase(start_key_it, end_key_it);
+ }
+ for (size_t i = 0; i < keys.size(); i++) {
+ std::string key = Key(keys[i]);
+ std::string value = Key(keys[i]) + ToString(file_id);
+ ValueType value_type =
+ (value_types.size() == 1 ? value_types[0] : value_types[i]);
+ switch (value_type) {
+ case ValueType::kTypeValue:
+ s = sst_file_writer.Put(key, value);
+ (*true_data)[key] = value;
+ break;
+ case ValueType::kTypeMerge:
+ s = sst_file_writer.Merge(key, value);
+ // we only use TestPutOperator in this test
+ (*true_data)[key] = value;
+ break;
+ case ValueType::kTypeDeletion:
+ s = sst_file_writer.Delete(key);
+ true_data->erase(key);
+ break;
+ default:
+ return Status::InvalidArgument("Value type is not supported");
+ }
+ if (!s.ok()) {
+ sst_file_writer.Finish();
+ return s;
+ }
+ }
+ s = sst_file_writer.Finish();
+
+ if (s.ok()) {
+ IngestExternalFileOptions ifo;
+ ifo.allow_global_seqno = true;
+ ifo.write_global_seqno = write_global_seqno;
+ ifo.verify_checksums_before_ingest = verify_checksums_before_ingest;
+ s = db_->IngestExternalFile({file_path}, ifo);
+ }
+ return s;
+ }
+
+ Status GenerateAndAddExternalFile(
+ const Options options, std::vector<int> keys,
+ const std::vector<ValueType>& value_types, int file_id,
+ bool write_global_seqno, bool verify_checksums_before_ingest,
+ std::map<std::string, std::string>* true_data) {
+ return GenerateAndAddExternalFile(
+ options, keys, value_types, {}, file_id, write_global_seqno,
+ verify_checksums_before_ingest, true_data);
+ }
+
+ Status GenerateAndAddExternalFile(
+ const Options options, std::vector<int> keys, const ValueType value_type,
+ int file_id, bool write_global_seqno, bool verify_checksums_before_ingest,
+ std::map<std::string, std::string>* true_data) {
+ return GenerateAndAddExternalFile(
+ options, keys, std::vector<ValueType>(1, value_type), file_id,
+ write_global_seqno, verify_checksums_before_ingest, true_data);
+ }
+
+ ~ExternalSSTFileBasicTest() override {
+ test::DestroyDir(env_, sst_files_dir_);
+ }
+
+ protected:
+ std::string sst_files_dir_;
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_test_env_;
+};
+
+TEST_F(ExternalSSTFileBasicTest, Basic) {
+ Options options = CurrentOptions();
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+  // Current file size should be 0 after sst_file_writer init and before
+  // opening a file.
+ ASSERT_EQ(sst_file_writer.FileSize(), 0);
+
+ // file1.sst (0 => 99)
+ std::string file1 = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ for (int k = 0; k < 100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file1_info;
+ Status s = sst_file_writer.Finish(&file1_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+
+  // Current file size should be non-zero after a successful write.
+ ASSERT_GT(sst_file_writer.FileSize(), 0);
+
+ ASSERT_EQ(file1_info.file_path, file1);
+ ASSERT_EQ(file1_info.num_entries, 100);
+ ASSERT_EQ(file1_info.smallest_key, Key(0));
+ ASSERT_EQ(file1_info.largest_key, Key(99));
+ ASSERT_EQ(file1_info.num_range_del_entries, 0);
+ ASSERT_EQ(file1_info.smallest_range_del_key, "");
+ ASSERT_EQ(file1_info.largest_range_del_key, "");
+  // sst_file_writer has already finished, so this value cannot be added
+ s = sst_file_writer.Put(Key(100), "bad_val");
+ ASSERT_FALSE(s.ok()) << s.ToString();
+ s = sst_file_writer.DeleteRange(Key(100), Key(200));
+ ASSERT_FALSE(s.ok()) << s.ToString();
+
+ DestroyAndReopen(options);
+ // Add file using file path
+ s = DeprecatedAddFile({file1});
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+ for (int k = 0; k < 100; k++) {
+ ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+ }
+
+ DestroyAndRecreateExternalSSTFilesDir();
+}
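+
+// Minimal usage sketch of the flow exercised above (paths and keys are
+// placeholders); SstFileWriter requires keys to be added in ascending order:
+//   SstFileWriter writer(EnvOptions(), options);
+//   writer.Open("/tmp/example.sst");
+//   writer.Put("key1", "value1");
+//   writer.Put("key2", "value2");
+//   writer.Finish();
+//   db->IngestExternalFile({"/tmp/example.sst"}, IngestExternalFileOptions());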
+
+TEST_F(ExternalSSTFileBasicTest, NoCopy) {
+ Options options = CurrentOptions();
+ const ImmutableCFOptions ioptions(options);
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // file1.sst (0 => 99)
+ std::string file1 = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ for (int k = 0; k < 100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file1_info;
+ Status s = sst_file_writer.Finish(&file1_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file1_info.file_path, file1);
+ ASSERT_EQ(file1_info.num_entries, 100);
+ ASSERT_EQ(file1_info.smallest_key, Key(0));
+ ASSERT_EQ(file1_info.largest_key, Key(99));
+
+ // file2.sst (100 => 299)
+ std::string file2 = sst_files_dir_ + "file2.sst";
+ ASSERT_OK(sst_file_writer.Open(file2));
+ for (int k = 100; k < 300; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file2_info;
+ s = sst_file_writer.Finish(&file2_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file2_info.file_path, file2);
+ ASSERT_EQ(file2_info.num_entries, 200);
+ ASSERT_EQ(file2_info.smallest_key, Key(100));
+ ASSERT_EQ(file2_info.largest_key, Key(299));
+
+ // file3.sst (110 => 124) .. overlap with file2.sst
+ std::string file3 = sst_files_dir_ + "file3.sst";
+ ASSERT_OK(sst_file_writer.Open(file3));
+ for (int k = 110; k < 125; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+ }
+ ExternalSstFileInfo file3_info;
+ s = sst_file_writer.Finish(&file3_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file3_info.file_path, file3);
+ ASSERT_EQ(file3_info.num_entries, 15);
+ ASSERT_EQ(file3_info.smallest_key, Key(110));
+ ASSERT_EQ(file3_info.largest_key, Key(124));
+
+ s = DeprecatedAddFile({file1}, true /* move file */);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(Status::NotFound(), env_->FileExists(file1));
+
+ s = DeprecatedAddFile({file2}, false /* copy file */);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_OK(env_->FileExists(file2));
+
+ // This file has overlapping values with the existing data
+ s = DeprecatedAddFile({file3}, true /* move file */);
+ ASSERT_FALSE(s.ok()) << s.ToString();
+ ASSERT_OK(env_->FileExists(file3));
+
+ for (int k = 0; k < 300; k++) {
+ ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+ }
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithGlobalSeqnoPickedSeqno) {
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ do {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ std::map<std::string, std::string> true_data;
+
+ int file_id = 1;
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 2, 3, 4, 5, 6}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {10, 11, 12, 13}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 4, 6}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 1);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {11, 15, 19}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {120, 130}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 130}, ValueType::kTypeValue, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);
+
+ // Write some keys through normal write path
+ for (int i = 0; i < 50; i++) {
+ ASSERT_OK(Put(Key(i), "memtable"));
+ true_data[Key(i)] = "memtable";
+ }
+ SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber();
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {60, 61, 62}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {40, 41, 42}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 1);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {20, 30, 40}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 2);
+
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+    // We will need a seqno for the file regardless of whether the file
+    // overwrites keys in the DB or not, because we have a snapshot
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1000, 1002}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 3);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {2000, 3002}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 4);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 20, 40, 100, 150}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+ db_->ReleaseSnapshot(snapshot);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {5000, 5001}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // No snapshot anymore, no need to assign a seqno
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+ } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithMultipleValueType) {
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ do {
+ Options options = CurrentOptions();
+ options.merge_operator.reset(new TestPutOperator());
+ DestroyAndReopen(options);
+ std::map<std::string, std::string> true_data;
+
+ int file_id = 1;
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 2, 3, 4, 5, 6}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {10, 11, 12, 13}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 4, 6}, ValueType::kTypeMerge, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 1);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {11, 15, 19}, ValueType::kTypeDeletion, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {120, 130}, ValueType::kTypeMerge, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 130}, ValueType::kTypeDeletion, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {120}, {ValueType::kTypeValue}, {{120, 135}}, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {}, {}, {{110, 120}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ // The range deletion ends on a key, but it doesn't actually delete
+ // this key because the largest key in the range is exclusive. Still,
+ // it counts as an overlap so a new seqno will be assigned.
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 5);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {}, {}, {{100, 109}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 5);
+
+ // Write some keys through normal write path
+ for (int i = 0; i < 50; i++) {
+ ASSERT_OK(Put(Key(i), "memtable"));
+ true_data[Key(i)] = "memtable";
+ }
+ SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber();
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {60, 61, 62}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {40, 41, 42}, ValueType::kTypeMerge, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 1);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {20, 30, 40}, ValueType::kTypeDeletion, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 2);
+
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+    // We will need a seqno for the file regardless of whether the file
+    // overwrites keys in the DB or not, because we have a snapshot
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1000, 1002}, ValueType::kTypeMerge, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 3);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {2000, 3002}, ValueType::kTypeMerge, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 4);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 20, 40, 100, 150}, ValueType::kTypeMerge, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+ db_->ReleaseSnapshot(snapshot);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {5000, 5001}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // No snapshot anymore, no need to assign a seqno
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+ } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithMixedValueType) {
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ do {
+ Options options = CurrentOptions();
+ options.merge_operator.reset(new TestPutOperator());
+ DestroyAndReopen(options);
+ std::map<std::string, std::string> true_data;
+
+ int file_id = 1;
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 2, 3, 4, 5, 6},
+ {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue,
+ ValueType::kTypeMerge, ValueType::kTypeValue, ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {10, 11, 12, 13},
+ {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue,
+ ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 4, 6},
+ {ValueType::kTypeDeletion, ValueType::kTypeValue,
+ ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 1);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {11, 15, 19},
+ {ValueType::kTypeDeletion, ValueType::kTypeMerge,
+ ValueType::kTypeValue},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {120, 130}, {ValueType::kTypeValue, ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 130}, {ValueType::kTypeMerge, ValueType::kTypeDeletion},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {150, 151, 152},
+ {ValueType::kTypeValue, ValueType::kTypeMerge,
+ ValueType::kTypeDeletion},
+ {{150, 160}, {180, 190}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {150, 151, 152},
+ {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue},
+ {{200, 250}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {300, 301, 302},
+ {ValueType::kTypeValue, ValueType::kTypeMerge,
+ ValueType::kTypeDeletion},
+ {{1, 2}, {152, 154}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 5);
+
+ // Write some keys through normal write path
+ for (int i = 0; i < 50; i++) {
+ ASSERT_OK(Put(Key(i), "memtable"));
+ true_data[Key(i)] = "memtable";
+ }
+ SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber();
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {60, 61, 62},
+ {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {40, 41, 42},
+ {ValueType::kTypeValue, ValueType::kTypeDeletion,
+ ValueType::kTypeDeletion},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 1);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {20, 30, 40},
+ {ValueType::kTypeDeletion, ValueType::kTypeDeletion,
+ ValueType::kTypeDeletion},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 2);
+
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+    // We will need a seqno for the file regardless of whether the file
+    // overwrites keys in the DB or not, because we have a snapshot
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1000, 1002}, {ValueType::kTypeValue, ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 3);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {2000, 3002}, {ValueType::kTypeValue, ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 4);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 20, 40, 100, 150},
+ {ValueType::kTypeDeletion, ValueType::kTypeDeletion,
+ ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+ db_->ReleaseSnapshot(snapshot);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {5000, 5001}, {ValueType::kTypeValue, ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // No snapshot anymore, no need to assign a seqno
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+ } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_F(ExternalSSTFileBasicTest, FadviseTrigger) {
+ Options options = CurrentOptions();
+ const int kNumKeys = 10000;
+
+ size_t total_fadvised_bytes = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileWriter::Rep::InvalidatePageCache", [&](void* arg) {
+ size_t fadvise_size = *(reinterpret_cast<size_t*>(arg));
+ total_fadvised_bytes += fadvise_size;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::unique_ptr<SstFileWriter> sst_file_writer;
+
+ std::string sst_file_path = sst_files_dir_ + "file_fadvise_disable.sst";
+ sst_file_writer.reset(
+ new SstFileWriter(EnvOptions(), options, nullptr, false));
+ ASSERT_OK(sst_file_writer->Open(sst_file_path));
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(sst_file_writer->Put(Key(i), Key(i)));
+ }
+ ASSERT_OK(sst_file_writer->Finish());
+ // fadvise disabled
+ ASSERT_EQ(total_fadvised_bytes, 0);
+
+ sst_file_path = sst_files_dir_ + "file_fadvise_enable.sst";
+ sst_file_writer.reset(
+ new SstFileWriter(EnvOptions(), options, nullptr, true));
+ ASSERT_OK(sst_file_writer->Open(sst_file_path));
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(sst_file_writer->Put(Key(i), Key(i)));
+ }
+ ASSERT_OK(sst_file_writer->Finish());
+ // fadvise enabled
+ ASSERT_EQ(total_fadvised_bytes, sst_file_writer->FileSize());
+ ASSERT_GT(total_fadvised_bytes, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
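+
+// For illustration, the fourth SstFileWriter constructor argument used above
+// toggles page cache invalidation (fadvise) after writes:
+//   SstFileWriter no_fadvise(EnvOptions(), options, nullptr, false);
+//   SstFileWriter with_fadvise(EnvOptions(), options, nullptr, true);
+// so the first writer reports zero fadvised bytes and the second reports
+// roughly the final file size, as asserted in the test.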
+
+TEST_F(ExternalSSTFileBasicTest, SyncFailure) {
+ Options options;
+ options.create_if_missing = true;
+ options.env = fault_injection_test_env_.get();
+
+ std::vector<std::pair<std::string, std::string>> test_cases = {
+ {"ExternalSstFileIngestionJob::BeforeSyncIngestedFile",
+ "ExternalSstFileIngestionJob::AfterSyncIngestedFile"},
+ {"ExternalSstFileIngestionJob::BeforeSyncDir",
+ "ExternalSstFileIngestionJob::AfterSyncDir"},
+ {"ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno",
+ "ExternalSstFileIngestionJob::AfterSyncGlobalSeqno"}};
+
+ for (size_t i = 0; i < test_cases.size(); i++) {
+ SyncPoint::GetInstance()->SetCallBack(test_cases[i].first, [&](void*) {
+ fault_injection_test_env_->SetFilesystemActive(false);
+ });
+ SyncPoint::GetInstance()->SetCallBack(test_cases[i].second, [&](void*) {
+ fault_injection_test_env_->SetFilesystemActive(true);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ DestroyAndReopen(options);
+ if (i == 2) {
+ ASSERT_OK(Put("foo", "v1"));
+ }
+
+ Options sst_file_writer_options;
+ std::unique_ptr<SstFileWriter> sst_file_writer(
+ new SstFileWriter(EnvOptions(), sst_file_writer_options));
+ std::string file_name =
+ sst_files_dir_ + "sync_failure_test_" + ToString(i) + ".sst";
+ ASSERT_OK(sst_file_writer->Open(file_name));
+ ASSERT_OK(sst_file_writer->Put("bar", "v2"));
+ ASSERT_OK(sst_file_writer->Finish());
+
+ IngestExternalFileOptions ingest_opt;
+ if (i == 0) {
+ ingest_opt.move_files = true;
+ }
+ const Snapshot* snapshot = db_->GetSnapshot();
+ if (i == 2) {
+ ingest_opt.write_global_seqno = true;
+ }
+ ASSERT_FALSE(db_->IngestExternalFile({file_name}, ingest_opt).ok());
+ db_->ReleaseSnapshot(snapshot);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ Destroy(options);
+ }
+}
+
+TEST_F(ExternalSSTFileBasicTest, VerifyChecksumReadahead) {
+ Options options;
+ options.create_if_missing = true;
+ SpecialEnv senv(Env::Default());
+ options.env = &senv;
+ DestroyAndReopen(options);
+
+ Options sst_file_writer_options;
+ std::unique_ptr<SstFileWriter> sst_file_writer(
+ new SstFileWriter(EnvOptions(), sst_file_writer_options));
+ std::string file_name = sst_files_dir_ + "verify_checksum_readahead_test.sst";
+ ASSERT_OK(sst_file_writer->Open(file_name));
+ Random rnd(301);
+ std::string value = DBTestBase::RandomString(&rnd, 4000);
+ for (int i = 0; i < 5000; i++) {
+ ASSERT_OK(sst_file_writer->Put(DBTestBase::Key(i), value));
+ }
+ ASSERT_OK(sst_file_writer->Finish());
+
+ // Ingest it once without verifying checksums to see the baseline
+ // preads.
+ IngestExternalFileOptions ingest_opt;
+ ingest_opt.move_files = false;
+ senv.count_random_reads_ = true;
+ senv.random_read_bytes_counter_ = 0;
+ ASSERT_OK(db_->IngestExternalFile({file_name}, ingest_opt));
+
+ auto base_num_reads = senv.random_read_counter_.Read();
+ // Make sure the counter is enabled.
+ ASSERT_GT(base_num_reads, 0);
+
+  // Ingest again and observe the reads made for readahead.
+ ingest_opt.move_files = false;
+ ingest_opt.verify_checksums_before_ingest = true;
+ ingest_opt.verify_checksums_readahead_size = size_t{2 * 1024 * 1024};
+
+ senv.count_random_reads_ = true;
+ senv.random_read_bytes_counter_ = 0;
+ ASSERT_OK(db_->IngestExternalFile({file_name}, ingest_opt));
+
+ // Make sure the counter is enabled.
+ ASSERT_GT(senv.random_read_counter_.Read() - base_num_reads, 0);
+
+  // The SST file is about 20 MB and the readahead size is 2 MB.
+  // Allowing a conservative 15 reads for metadata blocks, the number
+  // of random reads should be around 20 MB / 2 MB + 15 = 25; the
+  // assertion below uses a looser bound of 40 for slack.
+ ASSERT_LE(senv.random_read_counter_.Read() - base_num_reads, 40);
+
+ Destroy(options);
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestionWithRangeDeletions) {
+ int kNumLevels = 7;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.num_levels = kNumLevels;
+ Reopen(options);
+
+ std::map<std::string, std::string> true_data;
+ int file_id = 1;
+ // prevent range deletions from being dropped due to becoming obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+ // range del [0, 50) in L6 file, [50, 100) in L0 file, [100, 150) in memtable
+ for (int i = 0; i < 3; i++) {
+ if (i != 0) {
+ db_->Flush(FlushOptions());
+ if (i == 1) {
+ MoveFilesToLevel(kNumLevels - 1);
+ }
+ }
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(50 * i), Key(50 * (i + 1))));
+ }
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(kNumLevels - 2));
+ ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 1));
+
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ // overlaps with L0 file but not memtable, so flush is skipped and file is
+ // ingested into L0
+ SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber();
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {60, 90}, {ValueType::kTypeValue, ValueType::kTypeValue},
+ {{65, 70}, {70, 85}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(kNumLevels - 2));
+ ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
+
+ // overlaps with L6 file but not memtable or L0 file, so flush is skipped and
+ // file is ingested into L5
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {10, 40}, {ValueType::kTypeValue, ValueType::kTypeValue},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
+ ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
+
+ // overlaps with L5 file but not memtable or L0 file, so flush is skipped and
+ // file is ingested into L4
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {}, {}, {{5, 15}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
+ ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 2));
+ ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
+
+ // ingested file overlaps with memtable, so flush is triggered before the file
+ // is ingested such that the ingested data is considered newest. So L0 file
+ // count increases by two.
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {100, 140}, {ValueType::kTypeValue, ValueType::kTypeValue},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
+ ASSERT_EQ(4, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
+ ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
+
+ // snapshot unneeded now that all range deletions are persisted
+ db_->ReleaseSnapshot(snapshot);
+
+  // overlaps with nothing, so it is placed at the bottom level and the
+  // seqnum is not incremented.
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {151, 175}, {ValueType::kTypeValue, ValueType::kTypeValue},
+ {{160, 200}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);
+ ASSERT_EQ(4, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
+ ASSERT_EQ(2, NumTableFilesAtLevel(options.num_levels - 1));
+}
+
+TEST_F(ExternalSSTFileBasicTest, AdjacentRangeDeletionTombstones) {
+ Options options = CurrentOptions();
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // file8.sst (delete 300 => 400)
+ std::string file8 = sst_files_dir_ + "file8.sst";
+ ASSERT_OK(sst_file_writer.Open(file8));
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(300), Key(400)));
+ ExternalSstFileInfo file8_info;
+ Status s = sst_file_writer.Finish(&file8_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file8_info.file_path, file8);
+ ASSERT_EQ(file8_info.num_entries, 0);
+ ASSERT_EQ(file8_info.smallest_key, "");
+ ASSERT_EQ(file8_info.largest_key, "");
+ ASSERT_EQ(file8_info.num_range_del_entries, 1);
+ ASSERT_EQ(file8_info.smallest_range_del_key, Key(300));
+ ASSERT_EQ(file8_info.largest_range_del_key, Key(400));
+
+ // file9.sst (delete 400 => 500)
+ std::string file9 = sst_files_dir_ + "file9.sst";
+ ASSERT_OK(sst_file_writer.Open(file9));
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(400), Key(500)));
+ ExternalSstFileInfo file9_info;
+ s = sst_file_writer.Finish(&file9_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file9_info.file_path, file9);
+ ASSERT_EQ(file9_info.num_entries, 0);
+ ASSERT_EQ(file9_info.smallest_key, "");
+ ASSERT_EQ(file9_info.largest_key, "");
+ ASSERT_EQ(file9_info.num_range_del_entries, 1);
+ ASSERT_EQ(file9_info.smallest_range_del_key, Key(400));
+ ASSERT_EQ(file9_info.largest_range_del_key, Key(500));
+
+ // Range deletion tombstones are exclusive on their end key, so these SSTs
+ // should not be considered as overlapping.
+ s = DeprecatedAddFile({file8, file9});
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+ DestroyAndRecreateExternalSSTFilesDir();
+}
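+
+// For illustration, the exclusivity of range tombstone end keys means:
+//   sst_file_writer.DeleteRange(Key(300), Key(400));  // covers 300..399
+//   sst_file_writer.DeleteRange(Key(400), Key(500));  // covers 400..499
+// so the two files above are adjacent rather than overlapping and can be
+// ingested together without assigning a global seqno.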
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithBadBlockChecksum) {
+ bool change_checksum_called = false;
+ const auto& change_checksum = [&](void* arg) {
+ if (!change_checksum_called) {
+ char* buf = reinterpret_cast<char*>(arg);
+ assert(nullptr != buf);
+ buf[0] ^= 0x1;
+ change_checksum_called = true;
+ }
+ };
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableBuilder::WriteRawBlock:TamperWithChecksum",
+ change_checksum);
+ SyncPoint::GetInstance()->EnableProcessing();
+ int file_id = 0;
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ do {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ std::map<std::string, std::string> true_data;
+ Status s = GenerateAndAddExternalFile(
+ options, {1, 2, 3, 4, 5, 6}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data);
+ if (verify_checksums_before_ingest) {
+ ASSERT_NOK(s);
+ } else {
+ ASSERT_OK(s);
+ }
+ change_checksum_called = false;
+ } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithFirstByteTampered) {
+ SyncPoint::GetInstance()->DisableProcessing();
+ int file_id = 0;
+ EnvOptions env_options;
+ do {
+ Options options = CurrentOptions();
+ std::string file_path = sst_files_dir_ + ToString(file_id++);
+ SstFileWriter sst_file_writer(env_options, options);
+ Status s = sst_file_writer.Open(file_path);
+ ASSERT_OK(s);
+ for (int i = 0; i != 100; ++i) {
+ std::string key = Key(i);
+ std::string value = Key(i) + ToString(0);
+ ASSERT_OK(sst_file_writer.Put(key, value));
+ }
+ ASSERT_OK(sst_file_writer.Finish());
+ {
+ // Get file size
+ uint64_t file_size = 0;
+ ASSERT_OK(env_->GetFileSize(file_path, &file_size));
+ ASSERT_GT(file_size, 8);
+ std::unique_ptr<RandomRWFile> rwfile;
+ ASSERT_OK(env_->NewRandomRWFile(file_path, &rwfile, EnvOptions()));
+ // Manually corrupt the file
+ // We deterministically corrupt the first byte because we currently
+ // cannot choose a random offset. The reason for this limitation is that
+      // we do not checksum the property block at present.
+ const uint64_t offset = 0;
+ char scratch[8] = {0};
+ Slice buf;
+ ASSERT_OK(rwfile->Read(offset, sizeof(scratch), &buf, scratch));
+ scratch[0] ^= 0xff; // flip one bit
+ ASSERT_OK(rwfile->Write(offset, buf));
+ }
+ // Ingest file.
+ IngestExternalFileOptions ifo;
+ ifo.write_global_seqno = std::get<0>(GetParam());
+ ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+ s = db_->IngestExternalFile({file_path}, ifo);
+ if (ifo.verify_checksums_before_ingest) {
+ ASSERT_NOK(s);
+ } else {
+ ASSERT_OK(s);
+ }
+ } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestExternalFileWithCorruptedPropsBlock) {
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ if (!verify_checksums_before_ingest) {
+ return;
+ }
+ uint64_t props_block_offset = 0;
+ size_t props_block_size = 0;
+ const auto& get_props_block_offset = [&](void* arg) {
+ props_block_offset = *reinterpret_cast<uint64_t*>(arg);
+ };
+ const auto& get_props_block_size = [&](void* arg) {
+ props_block_size = *reinterpret_cast<uint64_t*>(arg);
+ };
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockOffset",
+ get_props_block_offset);
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockSize",
+ get_props_block_size);
+ SyncPoint::GetInstance()->EnableProcessing();
+ int file_id = 0;
+ Random64 rand(time(nullptr));
+ do {
+ std::string file_path = sst_files_dir_ + ToString(file_id++);
+ Options options = CurrentOptions();
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+ Status s = sst_file_writer.Open(file_path);
+ ASSERT_OK(s);
+ for (int i = 0; i != 100; ++i) {
+ std::string key = Key(i);
+ std::string value = Key(i) + ToString(0);
+ ASSERT_OK(sst_file_writer.Put(key, value));
+ }
+ ASSERT_OK(sst_file_writer.Finish());
+
+ {
+ std::unique_ptr<RandomRWFile> rwfile;
+ ASSERT_OK(env_->NewRandomRWFile(file_path, &rwfile, EnvOptions()));
+ // Manually corrupt the file
+ ASSERT_GT(props_block_size, 8);
+ uint64_t offset =
+ props_block_offset + rand.Next() % (props_block_size - 8);
+ char scratch[8] = {0};
+ Slice buf;
+ ASSERT_OK(rwfile->Read(offset, sizeof(scratch), &buf, scratch));
+ scratch[0] ^= 0xff; // flip one bit
+ ASSERT_OK(rwfile->Write(offset, buf));
+ }
+
+ // Ingest file.
+ IngestExternalFileOptions ifo;
+ ifo.write_global_seqno = std::get<0>(GetParam());
+ ifo.verify_checksums_before_ingest = true;
+ s = db_->IngestExternalFile({file_path}, ifo);
+ ASSERT_NOK(s);
+ } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_F(ExternalSSTFileBasicTest, OverlappingFiles) {
+ Options options = CurrentOptions();
+
+ std::vector<std::string> files;
+ {
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+ std::string file1 = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ ASSERT_OK(sst_file_writer.Put("a", "z"));
+ ASSERT_OK(sst_file_writer.Put("i", "m"));
+ ExternalSstFileInfo file1_info;
+ ASSERT_OK(sst_file_writer.Finish(&file1_info));
+ files.push_back(std::move(file1));
+ }
+ {
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+ std::string file2 = sst_files_dir_ + "file2.sst";
+ ASSERT_OK(sst_file_writer.Open(file2));
+ ASSERT_OK(sst_file_writer.Put("i", "k"));
+ ExternalSstFileInfo file2_info;
+ ASSERT_OK(sst_file_writer.Finish(&file2_info));
+ files.push_back(std::move(file2));
+ }
+
+ IngestExternalFileOptions ifo;
+ ASSERT_OK(db_->IngestExternalFile(files, ifo));
+ ASSERT_EQ(Get("a"), "z");
+ ASSERT_EQ(Get("i"), "k");
+
+ int total_keys = 0;
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ total_keys++;
+ }
+ delete iter;
+ ASSERT_EQ(total_keys, 2);
+
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+}
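+
+// For illustration: because the two ingested files overlap on "i", they
+// cannot be placed in the same sorted run, so both land in L0 and the later
+// file in the ingestion list ("i" -> "k") shadows the earlier one
+// ("i" -> "m"), which is what the Get() assertions above check.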
+
+INSTANTIATE_TEST_CASE_P(ExternalSSTFileBasicTest, ExternalSSTFileBasicTest,
+ testing::Values(std::make_tuple(true, true),
+ std::make_tuple(true, false),
+ std::make_tuple(false, true),
+ std::make_tuple(false, false)));
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/external_sst_file_ingestion_job.cc b/src/rocksdb/db/external_sst_file_ingestion_job.cc
new file mode 100644
index 000000000..4cec5d376
--- /dev/null
+++ b/src/rocksdb/db/external_sst_file_ingestion_job.cc
@@ -0,0 +1,731 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/external_sst_file_ingestion_job.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/version_edit.h"
+#include "file/file_util.h"
+#include "file/random_access_file_reader.h"
+#include "table/merging_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/sst_file_writer_collectors.h"
+#include "table/table_builder.h"
+#include "test_util/sync_point.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status ExternalSstFileIngestionJob::Prepare(
+ const std::vector<std::string>& external_files_paths,
+ uint64_t next_file_number, SuperVersion* sv) {
+ Status status;
+
+ // Read the information of files we are ingesting
+ for (const std::string& file_path : external_files_paths) {
+ IngestedFileInfo file_to_ingest;
+ status = GetIngestedFileInfo(file_path, &file_to_ingest, sv);
+ if (!status.ok()) {
+ return status;
+ }
+ files_to_ingest_.push_back(file_to_ingest);
+ }
+
+ for (const IngestedFileInfo& f : files_to_ingest_) {
+ if (f.cf_id !=
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily &&
+ f.cf_id != cfd_->GetID()) {
+ return Status::InvalidArgument(
+ "External file column family id dont match");
+ }
+ }
+
+ const Comparator* ucmp = cfd_->internal_comparator().user_comparator();
+ auto num_files = files_to_ingest_.size();
+ if (num_files == 0) {
+ return Status::InvalidArgument("The list of files is empty");
+ } else if (num_files > 1) {
+ // Verify that the passed files don't have overlapping ranges
+ autovector<const IngestedFileInfo*> sorted_files;
+ for (size_t i = 0; i < num_files; i++) {
+ sorted_files.push_back(&files_to_ingest_[i]);
+ }
+
+ std::sort(
+ sorted_files.begin(), sorted_files.end(),
+ [&ucmp](const IngestedFileInfo* info1, const IngestedFileInfo* info2) {
+ return sstableKeyCompare(ucmp, info1->smallest_internal_key,
+ info2->smallest_internal_key) < 0;
+ });
+
+ for (size_t i = 0; i < num_files - 1; i++) {
+ if (sstableKeyCompare(ucmp, sorted_files[i]->largest_internal_key,
+ sorted_files[i + 1]->smallest_internal_key) >= 0) {
+ files_overlap_ = true;
+ break;
+ }
+ }
+ }
+
+ if (ingestion_options_.ingest_behind && files_overlap_) {
+ return Status::NotSupported("Files have overlapping ranges");
+ }
+
+ for (IngestedFileInfo& f : files_to_ingest_) {
+ if (f.num_entries == 0 && f.num_range_deletions == 0) {
+ return Status::InvalidArgument("File contain no entries");
+ }
+
+ if (!f.smallest_internal_key.Valid() || !f.largest_internal_key.Valid()) {
+ return Status::Corruption("Generated table have corrupted keys");
+ }
+ }
+
+ // Copy/Move external files into DB
+ std::unordered_set<size_t> ingestion_path_ids;
+ for (IngestedFileInfo& f : files_to_ingest_) {
+ f.fd = FileDescriptor(next_file_number++, 0, f.file_size);
+ f.copy_file = false;
+ const std::string path_outside_db = f.external_file_path;
+ const std::string path_inside_db =
+ TableFileName(cfd_->ioptions()->cf_paths, f.fd.GetNumber(),
+ f.fd.GetPathId());
+ if (ingestion_options_.move_files) {
+ status =
+ fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr);
+ if (status.ok()) {
+ // It is unsafe to assume the application has synced the file and its
+ // directory before ingesting it. For the integrity of RocksDB we need
+ // to sync the file.
+ std::unique_ptr<FSWritableFile> file_to_sync;
+ status = fs_->ReopenWritableFile(path_inside_db, env_options_,
+ &file_to_sync, nullptr);
+ if (status.ok()) {
+ TEST_SYNC_POINT(
+ "ExternalSstFileIngestionJob::BeforeSyncIngestedFile");
+ status = SyncIngestedFile(file_to_sync.get());
+ TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncIngestedFile");
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Failed to sync ingested file %s: %s",
+ path_inside_db.c_str(), status.ToString().c_str());
+ }
+ }
+ } else if (status.IsNotSupported() &&
+ ingestion_options_.failed_move_fall_back_to_copy) {
+ // Original file is on a different FS, use copy instead of hard linking.
+ f.copy_file = true;
+ }
+ } else {
+ f.copy_file = true;
+ }
+
+ if (f.copy_file) {
+ TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:CopyFile",
+ nullptr);
+ // CopyFile also syncs the new file.
+ status = CopyFile(fs_, path_outside_db, path_inside_db, 0,
+ db_options_.use_fsync);
+ }
+ TEST_SYNC_POINT("ExternalSstFileIngestionJob::Prepare:FileAdded");
+ if (!status.ok()) {
+ break;
+ }
+ f.internal_file_path = path_inside_db;
+ ingestion_path_ids.insert(f.fd.GetPathId());
+ }
+
+ TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncDir");
+ if (status.ok()) {
+ for (auto path_id : ingestion_path_ids) {
+ status = directories_->GetDataDir(path_id)->Fsync();
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Failed to sync directory %" ROCKSDB_PRIszt
+ " while ingest file: %s",
+ path_id, status.ToString().c_str());
+ break;
+ }
+ }
+ }
+ TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncDir");
+
+ // TODO: The following is duplicated with Cleanup().
+ if (!status.ok()) {
+ // We failed; remove all files that we copied into the DB
+ for (IngestedFileInfo& f : files_to_ingest_) {
+ if (f.internal_file_path.empty()) {
+ continue;
+ }
+ Status s = env_->DeleteFile(f.internal_file_path);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "AddFile() clean up for file %s failed : %s",
+ f.internal_file_path.c_str(), s.ToString().c_str());
+ }
+ }
+ }
+
+ return status;
+}
+
+Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed,
+ SuperVersion* super_version) {
+ autovector<Range> ranges;
+ for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) {
+ ranges.emplace_back(file_to_ingest.smallest_internal_key.user_key(),
+ file_to_ingest.largest_internal_key.user_key());
+ }
+ Status status =
+ cfd_->RangesOverlapWithMemtables(ranges, super_version, flush_needed);
+ if (status.ok() && *flush_needed &&
+ !ingestion_options_.allow_blocking_flush) {
+ status = Status::InvalidArgument("External file requires flush");
+ }
+ return status;
+}
+
+// REQUIRES: we have become the only writer by entering both write_thread_ and
+// nonmem_write_thread_
+Status ExternalSstFileIngestionJob::Run() {
+ Status status;
+ SuperVersion* super_version = cfd_->GetSuperVersion();
+#ifndef NDEBUG
+ // We should never run the job with a memtable that is overlapping
+ // with the files we are ingesting
+ bool need_flush = false;
+ status = NeedsFlush(&need_flush, super_version);
+ assert(status.ok() && need_flush == false);
+#endif
+
+ bool force_global_seqno = false;
+
+ if (ingestion_options_.snapshot_consistency && !db_snapshots_->empty()) {
+ // We need to assign a global sequence number to all the files even
+ // if they don't overlap with any ranges, since we have snapshots
+ force_global_seqno = true;
+ }
+ // It is safe to use this instead of LastAllocatedSequence since we are
+ // the only active writer, and hence they are equal
+ SequenceNumber last_seqno = versions_->LastSequence();
+ edit_.SetColumnFamily(cfd_->GetID());
+ // The levels that the files will be ingested into
+
+ for (IngestedFileInfo& f : files_to_ingest_) {
+ SequenceNumber assigned_seqno = 0;
+ if (ingestion_options_.ingest_behind) {
+ status = CheckLevelForIngestedBehindFile(&f);
+ } else {
+ status = AssignLevelAndSeqnoForIngestedFile(
+ super_version, force_global_seqno, cfd_->ioptions()->compaction_style,
+ last_seqno, &f, &assigned_seqno);
+ }
+ if (!status.ok()) {
+ return status;
+ }
+ status = AssignGlobalSeqnoForIngestedFile(&f, assigned_seqno);
+ TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Run",
+ &assigned_seqno);
+ if (assigned_seqno > last_seqno) {
+ assert(assigned_seqno == last_seqno + 1);
+ last_seqno = assigned_seqno;
+ ++consumed_seqno_count_;
+ }
+ if (!status.ok()) {
+ return status;
+ }
+
+ // We use the import time as the ancestor time. This is the time the data
+ // is written to the database.
+ int64_t temp_current_time = 0;
+ uint64_t current_time = kUnknownFileCreationTime;
+ uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
+ if (env_->GetCurrentTime(&temp_current_time).ok()) {
+ current_time = oldest_ancester_time =
+ static_cast<uint64_t>(temp_current_time);
+ }
+
+ edit_.AddFile(
+ f.picked_level, f.fd.GetNumber(), f.fd.GetPathId(), f.fd.GetFileSize(),
+ f.smallest_internal_key, f.largest_internal_key, f.assigned_seqno,
+ f.assigned_seqno, false, kInvalidBlobFileNumber, oldest_ancester_time,
+ current_time, kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+ }
+ return status;
+}
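+ // Note on sequence numbers: Run() consumes at most one new sequence number
+ // per ingested file (and none when a file can keep an existing seqno);
+ // consumed_seqno_count_ records how many were actually allocated so the
+ // caller can advance the published last sequence accordingly.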
+
+void ExternalSstFileIngestionJob::UpdateStats() {
+ // Update internal stats for new ingested files
+ uint64_t total_keys = 0;
+ uint64_t total_l0_files = 0;
+ uint64_t total_time = env_->NowMicros() - job_start_time_;
+
+ EventLoggerStream stream = event_logger_->Log();
+ stream << "event"
+ << "ingest_finished";
+ stream << "files_ingested";
+ stream.StartArray();
+
+ for (IngestedFileInfo& f : files_to_ingest_) {
+ InternalStats::CompactionStats stats(CompactionReason::kExternalSstIngestion, 1);
+ stats.micros = total_time;
+ // If actual copy occurred for this file, then we need to count the file
+ // size as the actual bytes written. If the file was linked, then we ignore
+ // the bytes written for file metadata.
+ // TODO (yanqin) maybe account for file metadata bytes for exact accuracy?
+ if (f.copy_file) {
+ stats.bytes_written = f.fd.GetFileSize();
+ } else {
+ stats.bytes_moved = f.fd.GetFileSize();
+ }
+ stats.num_output_files = 1;
+ cfd_->internal_stats()->AddCompactionStats(f.picked_level,
+ Env::Priority::USER, stats);
+ cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_INGESTED_ADD_FILE,
+ f.fd.GetFileSize());
+ total_keys += f.num_entries;
+ if (f.picked_level == 0) {
+ total_l0_files += 1;
+ }
+ ROCKS_LOG_INFO(
+ db_options_.info_log,
+ "[AddFile] External SST file %s was ingested in L%d with path %s "
+ "(global_seqno=%" PRIu64 ")\n",
+ f.external_file_path.c_str(), f.picked_level,
+ f.internal_file_path.c_str(), f.assigned_seqno);
+ stream << "file" << f.internal_file_path << "level" << f.picked_level;
+ }
+ stream.EndArray();
+
+ stream << "lsm_state";
+ stream.StartArray();
+ auto vstorage = cfd_->current()->storage_info();
+ for (int level = 0; level < vstorage->num_levels(); ++level) {
+ stream << vstorage->NumLevelFiles(level);
+ }
+ stream.EndArray();
+
+ cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_KEYS_TOTAL,
+ total_keys);
+ cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_FILES_TOTAL,
+ files_to_ingest_.size());
+ cfd_->internal_stats()->AddCFStats(
+ InternalStats::INGESTED_LEVEL0_NUM_FILES_TOTAL, total_l0_files);
+}
+
+void ExternalSstFileIngestionJob::Cleanup(const Status& status) {
+ if (!status.ok()) {
+ // We failed to add the files to the database;
+ // remove all the files we copied
+ for (IngestedFileInfo& f : files_to_ingest_) {
+ if (f.internal_file_path.empty()) {
+ continue;
+ }
+ Status s = env_->DeleteFile(f.internal_file_path);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "AddFile() clean up for file %s failed : %s",
+ f.internal_file_path.c_str(), s.ToString().c_str());
+ }
+ }
+ consumed_seqno_count_ = 0;
+ files_overlap_ = false;
+ } else if (status.ok() && ingestion_options_.move_files) {
+ // The files were moved and added successfully; remove the original file links
+ for (IngestedFileInfo& f : files_to_ingest_) {
+ Status s = env_->DeleteFile(f.external_file_path);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "%s was added to DB successfully but failed to remove original "
+ "file link : %s",
+ f.external_file_path.c_str(), s.ToString().c_str());
+ }
+ }
+ }
+}
+
+Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
+ const std::string& external_file, IngestedFileInfo* file_to_ingest,
+ SuperVersion* sv) {
+ file_to_ingest->external_file_path = external_file;
+
+ // Get external file size
+ Status status = fs_->GetFileSize(external_file, IOOptions(),
+ &file_to_ingest->file_size, nullptr);
+ if (!status.ok()) {
+ return status;
+ }
+
+ // Create TableReader for external file
+ std::unique_ptr<TableReader> table_reader;
+ std::unique_ptr<FSRandomAccessFile> sst_file;
+ std::unique_ptr<RandomAccessFileReader> sst_file_reader;
+
+ status = fs_->NewRandomAccessFile(external_file, env_options_,
+ &sst_file, nullptr);
+ if (!status.ok()) {
+ return status;
+ }
+ sst_file_reader.reset(new RandomAccessFileReader(std::move(sst_file),
+ external_file));
+
+ status = cfd_->ioptions()->table_factory->NewTableReader(
+ TableReaderOptions(*cfd_->ioptions(),
+ sv->mutable_cf_options.prefix_extractor.get(),
+ env_options_, cfd_->internal_comparator()),
+ std::move(sst_file_reader), file_to_ingest->file_size, &table_reader);
+ if (!status.ok()) {
+ return status;
+ }
+
+ if (ingestion_options_.verify_checksums_before_ingest) {
+ // If customized readahead size is needed, we can pass a user option
+ // all the way to here. Right now we just rely on the default readahead
+ // to keep things simple.
+ ReadOptions ro;
+ ro.readahead_size = ingestion_options_.verify_checksums_readahead_size;
+ status = table_reader->VerifyChecksum(
+ ro, TableReaderCaller::kExternalSSTIngestion);
+ }
+ if (!status.ok()) {
+ return status;
+ }
+
+ // Get the external file properties
+ auto props = table_reader->GetTableProperties();
+ const auto& uprops = props->user_collected_properties;
+
+ // Get table version
+ auto version_iter = uprops.find(ExternalSstFilePropertyNames::kVersion);
+ if (version_iter == uprops.end()) {
+ return Status::Corruption("External file version not found");
+ }
+ file_to_ingest->version = DecodeFixed32(version_iter->second.c_str());
+
+ auto seqno_iter = uprops.find(ExternalSstFilePropertyNames::kGlobalSeqno);
+ if (file_to_ingest->version == 2) {
+ // version 2 implies that the file has a global sequence number
+ if (seqno_iter == uprops.end()) {
+ return Status::Corruption(
+ "External file global sequence number not found");
+ }
+
+ // Set the global sequence number
+ file_to_ingest->original_seqno = DecodeFixed64(seqno_iter->second.c_str());
+ auto offsets_iter = props->properties_offsets.find(
+ ExternalSstFilePropertyNames::kGlobalSeqno);
+ if (offsets_iter == props->properties_offsets.end() ||
+ offsets_iter->second == 0) {
+ file_to_ingest->global_seqno_offset = 0;
+ return Status::Corruption("Was not able to find file global seqno field");
+ }
+ file_to_ingest->global_seqno_offset = static_cast<size_t>(offsets_iter->second);
+ } else if (file_to_ingest->version == 1) {
+ // SST file V1 should not have global seqno field
+ assert(seqno_iter == uprops.end());
+ file_to_ingest->original_seqno = 0;
+ if (ingestion_options_.allow_blocking_flush ||
+ ingestion_options_.allow_global_seqno) {
+ return Status::InvalidArgument(
+ "External SST file V1 does not support global seqno");
+ }
+ } else {
+ return Status::InvalidArgument("External file version is not supported");
+ }
+ // Get number of entries in table
+ file_to_ingest->num_entries = props->num_entries;
+ file_to_ingest->num_range_deletions = props->num_range_deletions;
+
+ ParsedInternalKey key;
+ ReadOptions ro;
+ // While reading the external file we may fill the block cache with the
+ // blocks we read; if we later change the global seqno of this file, those
+ // cached blocks would contain keys with the wrong seqno.
+ // We need to disable fill_cache so that we read from the file without
+ // updating the block cache.
+ ro.fill_cache = false;
+ std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
+ ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr,
+ /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));
+ std::unique_ptr<InternalIterator> range_del_iter(
+ table_reader->NewRangeTombstoneIterator(ro));
+
+ // Get first (smallest) and last (largest) key from file.
+ file_to_ingest->smallest_internal_key =
+ InternalKey("", 0, ValueType::kTypeValue);
+ file_to_ingest->largest_internal_key =
+ InternalKey("", 0, ValueType::kTypeValue);
+ bool bounds_set = false;
+ iter->SeekToFirst();
+ if (iter->Valid()) {
+ if (!ParseInternalKey(iter->key(), &key)) {
+ return Status::Corruption("external file have corrupted keys");
+ }
+ if (key.sequence != 0) {
+ return Status::Corruption("external file have non zero sequence number");
+ }
+ file_to_ingest->smallest_internal_key.SetFrom(key);
+
+ iter->SeekToLast();
+ if (!ParseInternalKey(iter->key(), &key)) {
+ return Status::Corruption("external file have corrupted keys");
+ }
+ if (key.sequence != 0) {
+ return Status::Corruption("external file have non zero sequence number");
+ }
+ file_to_ingest->largest_internal_key.SetFrom(key);
+
+ bounds_set = true;
+ }
+
+ // We may need to adjust these key bounds, depending on whether any range
+ // deletion tombstones extend past them.
+ const Comparator* ucmp = cfd_->internal_comparator().user_comparator();
+ if (range_del_iter != nullptr) {
+ for (range_del_iter->SeekToFirst(); range_del_iter->Valid();
+ range_del_iter->Next()) {
+ if (!ParseInternalKey(range_del_iter->key(), &key)) {
+ return Status::Corruption("external file have corrupted keys");
+ }
+ RangeTombstone tombstone(key, range_del_iter->value());
+
+ InternalKey start_key = tombstone.SerializeKey();
+ if (!bounds_set ||
+ sstableKeyCompare(ucmp, start_key,
+ file_to_ingest->smallest_internal_key) < 0) {
+ file_to_ingest->smallest_internal_key = start_key;
+ }
+ InternalKey end_key = tombstone.SerializeEndKey();
+ if (!bounds_set ||
+ sstableKeyCompare(ucmp, end_key,
+ file_to_ingest->largest_internal_key) > 0) {
+ file_to_ingest->largest_internal_key = end_key;
+ }
+ bounds_set = true;
+ }
+ }
+
+ file_to_ingest->cf_id = static_cast<uint32_t>(props->column_family_id);
+
+ file_to_ingest->table_properties = *props;
+
+ return status;
+}
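+ // For reference: files produced by SstFileWriter carry an
+ // ExternalSstFilePropertyNames::kVersion user property, and version 2 files
+ // additionally reserve a fixed-width kGlobalSeqno property whose offset is
+ // captured above so AssignGlobalSeqnoForIngestedFile() can later patch the
+ // value in place.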
+
+Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
+ SuperVersion* sv, bool force_global_seqno, CompactionStyle compaction_style,
+ SequenceNumber last_seqno, IngestedFileInfo* file_to_ingest,
+ SequenceNumber* assigned_seqno) {
+ Status status;
+ *assigned_seqno = 0;
+ if (force_global_seqno) {
+ *assigned_seqno = last_seqno + 1;
+ if (compaction_style == kCompactionStyleUniversal || files_overlap_) {
+ file_to_ingest->picked_level = 0;
+ return status;
+ }
+ }
+
+ bool overlap_with_db = false;
+ Arena arena;
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ int target_level = 0;
+ auto* vstorage = cfd_->current()->storage_info();
+
+ for (int lvl = 0; lvl < cfd_->NumberLevels(); lvl++) {
+ if (lvl > 0 && lvl < vstorage->base_level()) {
+ continue;
+ }
+
+ if (vstorage->NumLevelFiles(lvl) > 0) {
+ bool overlap_with_level = false;
+ status = sv->current->OverlapWithLevelIterator(
+ ro, env_options_, file_to_ingest->smallest_internal_key.user_key(),
+ file_to_ingest->largest_internal_key.user_key(), lvl,
+ &overlap_with_level);
+ if (!status.ok()) {
+ return status;
+ }
+ if (overlap_with_level) {
+ // We must use L0 or a level above `lvl` to be able to overwrite the keys
+ // that we overlap with in this level. We also need to assign this file a
+ // seqno so it overwrites the existing keys in level `lvl`
+ overlap_with_db = true;
+ break;
+ }
+
+ if (compaction_style == kCompactionStyleUniversal && lvl != 0) {
+ const std::vector<FileMetaData*>& level_files =
+ vstorage->LevelFiles(lvl);
+ const SequenceNumber level_largest_seqno =
+ (*max_element(level_files.begin(), level_files.end(),
+ [](FileMetaData* f1, FileMetaData* f2) {
+ return f1->fd.largest_seqno < f2->fd.largest_seqno;
+ }))
+ ->fd.largest_seqno;
+ // Only assign the current level's largest seqno to this file when the
+ // file fits in the level
+ if (level_largest_seqno != 0 &&
+ IngestedFileFitInLevel(file_to_ingest, lvl)) {
+ *assigned_seqno = level_largest_seqno;
+ } else {
+ continue;
+ }
+ }
+ } else if (compaction_style == kCompactionStyleUniversal) {
+ continue;
+ }
+
+ // We don't overlap with any keys in this level, but we still need to
+ // check whether our file can fit in it
+ if (IngestedFileFitInLevel(file_to_ingest, lvl)) {
+ target_level = lvl;
+ }
+ }
+ // If files overlap, we have to ingest them at level 0 and assign the newest
+ // sequence number
+ if (files_overlap_) {
+ target_level = 0;
+ *assigned_seqno = last_seqno + 1;
+ }
+ TEST_SYNC_POINT_CALLBACK(
+ "ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile",
+ &overlap_with_db);
+ file_to_ingest->picked_level = target_level;
+ if (overlap_with_db && *assigned_seqno == 0) {
+ *assigned_seqno = last_seqno + 1;
+ }
+ return status;
+}
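+ // Rough example with leveled compaction: if the ingested file fits cleanly
+ // in L1 and L2 but overlaps existing keys in L3, the loop above breaks at
+ // L3 with overlap_with_db=true, target_level stays at 2, and the file is
+ // assigned last_seqno + 1 so its keys shadow the overlapped ones below.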
+
+Status ExternalSstFileIngestionJob::CheckLevelForIngestedBehindFile(
+ IngestedFileInfo* file_to_ingest) {
+ auto* vstorage = cfd_->current()->storage_info();
+ // First check whether the new file fits in the bottommost level
+ int bottom_lvl = cfd_->NumberLevels() - 1;
+ if (!IngestedFileFitInLevel(file_to_ingest, bottom_lvl)) {
+ return Status::InvalidArgument(
+ "Can't ingest_behind file as it doesn't fit "
+ "at the bottommost level!");
+ }
+
+ // Second, check whether, despite allow_ingest_behind=true, we still have
+ // files with seqno 0 at some upper level
+ for (int lvl = 0; lvl < cfd_->NumberLevels() - 1; lvl++) {
+ for (auto file : vstorage->LevelFiles(lvl)) {
+ if (file->fd.smallest_seqno == 0) {
+ return Status::InvalidArgument(
+ "Can't ingest_behind file as despite allow_ingest_behind=true "
+ "there are files with 0 seqno in database at upper levels!");
+ }
+ }
+ }
+
+ file_to_ingest->picked_level = bottom_lvl;
+ return Status::OK();
+}
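+ // This path assumes the DB runs with allow_ingest_behind=true, which is
+ // expected to keep the bottommost level reserved for ingested data and to
+ // avoid zeroing out sequence numbers in the upper levels; the loop above is
+ // a defensive check that no upper level already holds seqno-0 files, since
+ // the ingested data (seqno 0) must logically sit below everything else.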
+
+Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile(
+ IngestedFileInfo* file_to_ingest, SequenceNumber seqno) {
+ if (file_to_ingest->original_seqno == seqno) {
+ // This file already has the correct global seqno
+ return Status::OK();
+ } else if (!ingestion_options_.allow_global_seqno) {
+ return Status::InvalidArgument("Global seqno is required, but disabled");
+ } else if (file_to_ingest->global_seqno_offset == 0) {
+ return Status::InvalidArgument(
+ "Trying to set global seqno for a file that dont have a global seqno "
+ "field");
+ }
+
+ if (ingestion_options_.write_global_seqno) {
+ // Determine whether we can write global_seqno at the given offset of the
+ // file. If the file system does not support random writes, we should not;
+ // otherwise we should.
+ std::unique_ptr<FSRandomRWFile> rwfile;
+ Status status =
+ fs_->NewRandomRWFile(file_to_ingest->internal_file_path, env_options_,
+ &rwfile, nullptr);
+ if (status.ok()) {
+ std::string seqno_val;
+ PutFixed64(&seqno_val, seqno);
+ status = rwfile->Write(file_to_ingest->global_seqno_offset, seqno_val,
+ IOOptions(), nullptr);
+ if (status.ok()) {
+ TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno");
+ status = SyncIngestedFile(rwfile.get());
+ TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncGlobalSeqno");
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Failed to sync ingested file %s after writing global "
+ "sequence number: %s",
+ file_to_ingest->internal_file_path.c_str(),
+ status.ToString().c_str());
+ }
+ }
+ if (!status.ok()) {
+ return status;
+ }
+ } else if (!status.IsNotSupported()) {
+ return status;
+ }
+ }
+
+ file_to_ingest->assigned_seqno = seqno;
+ return Status::OK();
+}
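+ // When write_global_seqno is false the file on disk is left untouched; the
+ // assigned sequence number still takes effect because Run() records it in
+ // the VersionEdit, and it is used as the file's global seqno when the table
+ // is read.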
+
+bool ExternalSstFileIngestionJob::IngestedFileFitInLevel(
+ const IngestedFileInfo* file_to_ingest, int level) {
+ if (level == 0) {
+ // Files can always fit in L0
+ return true;
+ }
+
+ auto* vstorage = cfd_->current()->storage_info();
+ Slice file_smallest_user_key(
+ file_to_ingest->smallest_internal_key.user_key());
+ Slice file_largest_user_key(file_to_ingest->largest_internal_key.user_key());
+
+ if (vstorage->OverlapInLevel(level, &file_smallest_user_key,
+ &file_largest_user_key)) {
+ // The file overlaps with other files in this level; we cannot
+ // add it to this level
+ return false;
+ }
+ if (cfd_->RangeOverlapWithCompaction(file_smallest_user_key,
+ file_largest_user_key, level)) {
+ // The file overlaps with the output of a running compaction that will be
+ // stored in this level; we cannot add this file to this level
+ return false;
+ }
+
+ // The file does not overlap with the level's files or with any running
+ // compaction output for this level
+ return true;
+}
+
+template <typename TWritableFile>
+Status ExternalSstFileIngestionJob::SyncIngestedFile(TWritableFile* file) {
+ assert(file != nullptr);
+ if (db_options_.use_fsync) {
+ return file->Fsync(IOOptions(), nullptr);
+ } else {
+ return file->Sync(IOOptions(), nullptr);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/external_sst_file_ingestion_job.h b/src/rocksdb/db/external_sst_file_ingestion_job.h
new file mode 100644
index 000000000..7ddb6f3e8
--- /dev/null
+++ b/src/rocksdb/db/external_sst_file_ingestion_job.h
@@ -0,0 +1,180 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/dbformat.h"
+#include "db/internal_stats.h"
+#include "db/snapshot_impl.h"
+#include "logging/event_logger.h"
+#include "options/db_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/sst_file_writer.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Directories;
+
+struct IngestedFileInfo {
+ // External file path
+ std::string external_file_path;
+ // Smallest internal key in external file
+ InternalKey smallest_internal_key;
+ // Largest internal key in external file
+ InternalKey largest_internal_key;
+ // Sequence number for keys in external file
+ SequenceNumber original_seqno;
+ // Offset of the global sequence number field in the file, will
+ // be zero if version is 1 (global seqno is not supported)
+ size_t global_seqno_offset;
+ // External file size
+ uint64_t file_size;
+ // total number of keys in external file
+ uint64_t num_entries;
+ // total number of range deletions in external file
+ uint64_t num_range_deletions;
+ // Id of column family this file should be ingested into
+ uint32_t cf_id;
+ // TableProperties read from external file
+ TableProperties table_properties;
+ // Version of external file
+ int version;
+
+ // FileDescriptor for the file inside the DB
+ FileDescriptor fd;
+ // file path that we picked for file inside the DB
+ std::string internal_file_path;
+ // Global sequence number that we picked for the file inside the DB
+ SequenceNumber assigned_seqno = 0;
+ // Level inside the DB we picked for the external file.
+ int picked_level = 0;
+ // Whether to copy or link the external sst file. copy_file will be set to
+ // false if ingestion_options.move_files is true and the underlying FS
+ // supports the link operation. A default value is needed to keep llvm's
+ // undefined-behavior sanity check happy; since ingestion_options.move_files
+ // is false by default, copy_file defaults to true.
+ bool copy_file = true;
+};
+
+class ExternalSstFileIngestionJob {
+ public:
+ ExternalSstFileIngestionJob(
+ Env* env, VersionSet* versions, ColumnFamilyData* cfd,
+ const ImmutableDBOptions& db_options, const EnvOptions& env_options,
+ SnapshotList* db_snapshots,
+ const IngestExternalFileOptions& ingestion_options,
+ Directories* directories, EventLogger* event_logger)
+ : env_(env),
+ fs_(db_options.fs.get()),
+ versions_(versions),
+ cfd_(cfd),
+ db_options_(db_options),
+ env_options_(env_options),
+ db_snapshots_(db_snapshots),
+ ingestion_options_(ingestion_options),
+ directories_(directories),
+ event_logger_(event_logger),
+ job_start_time_(env_->NowMicros()),
+ consumed_seqno_count_(0) {
+ assert(directories != nullptr);
+ }
+
+ // Prepare the job by copying external files into the DB.
+ Status Prepare(const std::vector<std::string>& external_files_paths,
+ uint64_t next_file_number, SuperVersion* sv);
+
+ // Check whether we need to flush the memtable before running the ingestion
+ // job. This will be true if the files we are ingesting overlap with any
+ // key range in the memtable.
+ //
+ // @param super_version A referenced SuperVersion that will be held for the
+ // duration of this function.
+ //
+ // Thread-safe
+ Status NeedsFlush(bool* flush_needed, SuperVersion* super_version);
+
+ // Will execute the ingestion job and prepare edit() to be applied.
+ // REQUIRES: Mutex held
+ Status Run();
+
+ // Update column family stats.
+ // REQUIRES: Mutex held
+ void UpdateStats();
+
+ // Cleanup after successful/failed job
+ void Cleanup(const Status& status);
+
+ VersionEdit* edit() { return &edit_; }
+
+ const autovector<IngestedFileInfo>& files_to_ingest() const {
+ return files_to_ingest_;
+ }
+
+ // How many sequence numbers did we consume as part of the ingest job?
+ int ConsumedSequenceNumbersCount() const { return consumed_seqno_count_; }
+
+ private:
+ // Open the external file and populate `file_to_ingest` with all the
+ // external information we need to ingest this file.
+ Status GetIngestedFileInfo(const std::string& external_file,
+ IngestedFileInfo* file_to_ingest,
+ SuperVersion* sv);
+
+ // Assign `file_to_ingest` the appropriate sequence number and the lowest
+ // possible level that it can be ingested into according to compaction_style.
+ // REQUIRES: Mutex held
+ Status AssignLevelAndSeqnoForIngestedFile(SuperVersion* sv,
+ bool force_global_seqno,
+ CompactionStyle compaction_style,
+ SequenceNumber last_seqno,
+ IngestedFileInfo* file_to_ingest,
+ SequenceNumber* assigned_seqno);
+
+ // File that we want to ingest behind always goes to the lowest level;
+ // we just check that it fits in the level, that DB allows ingest_behind,
+ // and that we don't have 0 seqnums at the upper levels.
+ // REQUIRES: Mutex held
+ Status CheckLevelForIngestedBehindFile(IngestedFileInfo* file_to_ingest);
+
+ // Set the file global sequence number to `seqno`
+ Status AssignGlobalSeqnoForIngestedFile(IngestedFileInfo* file_to_ingest,
+ SequenceNumber seqno);
+
+ // Check if `file_to_ingest` can fit in level `level`
+ // REQUIRES: Mutex held
+ bool IngestedFileFitInLevel(const IngestedFileInfo* file_to_ingest,
+ int level);
+
+ // Helper method to sync given file.
+ template <typename TWritableFile>
+ Status SyncIngestedFile(TWritableFile* file);
+
+ Env* env_;
+ FileSystem* fs_;
+ VersionSet* versions_;
+ ColumnFamilyData* cfd_;
+ const ImmutableDBOptions& db_options_;
+ const EnvOptions& env_options_;
+ SnapshotList* db_snapshots_;
+ autovector<IngestedFileInfo> files_to_ingest_;
+ const IngestExternalFileOptions& ingestion_options_;
+ Directories* directories_;
+ EventLogger* event_logger_;
+ VersionEdit edit_;
+ uint64_t job_start_time_;
+ int consumed_seqno_count_;
+ // Set in ExternalSstFileIngestionJob::Prepare(); if true, all files are
+ // ingested into L0
+ bool files_overlap_{false};
+};
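+ // Rough lifecycle sketch (the real driver lives in DBImpl; the variable
+ // names below are only illustrative):
+ //   ExternalSstFileIngestionJob job(env, versions, cfd, db_options,
+ //                                   env_options, snapshots, ifo, dirs, log);
+ //   job.Prepare(files, next_file_number, sv);   // copy/link files into DB
+ //   bool need_flush = false;
+ //   job.NeedsFlush(&need_flush, sv);            // flush memtable if needed
+ //   job.Run();                                  // pick levels and seqnos
+ //   // apply job.edit() via VersionSet::LogAndApply(), then:
+ //   job.UpdateStats();
+ //   job.Cleanup(final_status);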
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/external_sst_file_test.cc b/src/rocksdb/db/external_sst_file_test.cc
new file mode 100644
index 000000000..0b91910a1
--- /dev/null
+++ b/src/rocksdb/db/external_sst_file_test.cc
@@ -0,0 +1,2832 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+#include "db/db_test_util.h"
+#include "file/filename.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/sst_file_writer.h"
+#include "test_util/fault_injection_test_env.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A test environment that can be configured to fail the Link operation.
+class ExternalSSTTestEnv : public EnvWrapper {
+ public:
+ ExternalSSTTestEnv(Env* t, bool fail_link)
+ : EnvWrapper(t), fail_link_(fail_link) {}
+
+ Status LinkFile(const std::string& s, const std::string& t) override {
+ if (fail_link_) {
+ return Status::NotSupported("Link failed");
+ }
+ return target()->LinkFile(s, t);
+ }
+
+ void set_fail_link(bool fail_link) { fail_link_ = fail_link; }
+
+ private:
+ bool fail_link_;
+};
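+ // Failing LinkFile() lets the fallback fixture below simulate a filesystem
+ // without hard-link support, which exercises the
+ // failed_move_fall_back_to_copy path in the ingestion job.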
+
+class ExternSSTFileLinkFailFallbackTest
+ : public DBTestBase,
+ public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+ ExternSSTFileLinkFailFallbackTest()
+ : DBTestBase("/external_sst_file_test"),
+ test_env_(new ExternalSSTTestEnv(env_, true)) {
+ sst_files_dir_ = dbname_ + "/sst_files/";
+ test::DestroyDir(env_, sst_files_dir_);
+ env_->CreateDir(sst_files_dir_);
+ options_ = CurrentOptions();
+ options_.disable_auto_compactions = true;
+ options_.env = test_env_;
+ }
+
+ void TearDown() override {
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(DestroyDB(dbname_, options_));
+ delete test_env_;
+ test_env_ = nullptr;
+ }
+
+ protected:
+ std::string sst_files_dir_;
+ Options options_;
+ ExternalSSTTestEnv* test_env_;
+};
+
+class ExternalSSTFileTest
+ : public DBTestBase,
+ public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+ ExternalSSTFileTest() : DBTestBase("/external_sst_file_test") {
+ sst_files_dir_ = dbname_ + "/sst_files/";
+ DestroyAndRecreateExternalSSTFilesDir();
+ }
+
+ void DestroyAndRecreateExternalSSTFilesDir() {
+ test::DestroyDir(env_, sst_files_dir_);
+ env_->CreateDir(sst_files_dir_);
+ }
+
+ Status GenerateOneExternalFile(
+ const Options& options, ColumnFamilyHandle* cfh,
+ std::vector<std::pair<std::string, std::string>>& data, int file_id,
+ bool sort_data, std::string* external_file_path,
+ std::map<std::string, std::string>* true_data) {
+ // Generate a file id if not provided
+ if (-1 == file_id) {
+ file_id = (++last_file_id_);
+ }
+ // Sort data if asked to do so
+ if (sort_data) {
+ std::sort(data.begin(), data.end(),
+ [&](const std::pair<std::string, std::string>& e1,
+ const std::pair<std::string, std::string>& e2) {
+ return options.comparator->Compare(e1.first, e2.first) < 0;
+ });
+ auto uniq_iter = std::unique(
+ data.begin(), data.end(),
+ [&](const std::pair<std::string, std::string>& e1,
+ const std::pair<std::string, std::string>& e2) {
+ return options.comparator->Compare(e1.first, e2.first) == 0;
+ });
+ data.resize(uniq_iter - data.begin());
+ }
+ std::string file_path = sst_files_dir_ + ToString(file_id);
+ SstFileWriter sst_file_writer(EnvOptions(), options, cfh);
+ Status s = sst_file_writer.Open(file_path);
+ if (!s.ok()) {
+ return s;
+ }
+ for (const auto& entry : data) {
+ s = sst_file_writer.Put(entry.first, entry.second);
+ if (!s.ok()) {
+ sst_file_writer.Finish();
+ return s;
+ }
+ }
+ s = sst_file_writer.Finish();
+ if (s.ok() && external_file_path != nullptr) {
+ *external_file_path = file_path;
+ }
+ if (s.ok() && nullptr != true_data) {
+ for (const auto& entry : data) {
+ true_data->insert({entry.first, entry.second});
+ }
+ }
+ return s;
+ }
+
+ Status GenerateAndAddExternalFile(
+ const Options options,
+ std::vector<std::pair<std::string, std::string>> data, int file_id = -1,
+ bool allow_global_seqno = false, bool write_global_seqno = false,
+ bool verify_checksums_before_ingest = true, bool ingest_behind = false,
+ bool sort_data = false,
+ std::map<std::string, std::string>* true_data = nullptr,
+ ColumnFamilyHandle* cfh = nullptr) {
+ // Generate a file id if not provided
+ if (file_id == -1) {
+ file_id = last_file_id_ + 1;
+ last_file_id_++;
+ }
+
+ // Sort data if asked to do so
+ if (sort_data) {
+ std::sort(data.begin(), data.end(),
+ [&](const std::pair<std::string, std::string>& e1,
+ const std::pair<std::string, std::string>& e2) {
+ return options.comparator->Compare(e1.first, e2.first) < 0;
+ });
+ auto uniq_iter = std::unique(
+ data.begin(), data.end(),
+ [&](const std::pair<std::string, std::string>& e1,
+ const std::pair<std::string, std::string>& e2) {
+ return options.comparator->Compare(e1.first, e2.first) == 0;
+ });
+ data.resize(uniq_iter - data.begin());
+ }
+ std::string file_path = sst_files_dir_ + ToString(file_id);
+ SstFileWriter sst_file_writer(EnvOptions(), options, cfh);
+
+ Status s = sst_file_writer.Open(file_path);
+ if (!s.ok()) {
+ return s;
+ }
+ for (auto& entry : data) {
+ s = sst_file_writer.Put(entry.first, entry.second);
+ if (!s.ok()) {
+ sst_file_writer.Finish();
+ return s;
+ }
+ }
+ s = sst_file_writer.Finish();
+
+ if (s.ok()) {
+ IngestExternalFileOptions ifo;
+ ifo.allow_global_seqno = allow_global_seqno;
+ ifo.write_global_seqno = allow_global_seqno ? write_global_seqno : false;
+ ifo.verify_checksums_before_ingest = verify_checksums_before_ingest;
+ ifo.ingest_behind = ingest_behind;
+ if (cfh) {
+ s = db_->IngestExternalFile(cfh, {file_path}, ifo);
+ } else {
+ s = db_->IngestExternalFile({file_path}, ifo);
+ }
+ }
+
+ if (s.ok() && true_data) {
+ for (auto& entry : data) {
+ (*true_data)[entry.first] = entry.second;
+ }
+ }
+
+ return s;
+ }
+
+ Status GenerateAndAddExternalFiles(
+ const Options& options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ const std::vector<IngestExternalFileOptions>& ifos,
+ std::vector<std::vector<std::pair<std::string, std::string>>>& data,
+ int file_id, bool sort_data,
+ std::vector<std::map<std::string, std::string>>& true_data) {
+ if (-1 == file_id) {
+ file_id = (++last_file_id_);
+ }
+ // Generate external SST files, one for each column family
+ size_t num_cfs = column_families.size();
+ assert(ifos.size() == num_cfs);
+ assert(data.size() == num_cfs);
+ Status s;
+ std::vector<IngestExternalFileArg> args(num_cfs);
+ for (size_t i = 0; i != num_cfs; ++i) {
+ std::string external_file_path;
+ s = GenerateOneExternalFile(
+ options, column_families[i], data[i], file_id, sort_data,
+ &external_file_path,
+ true_data.size() == num_cfs ? &true_data[i] : nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ ++file_id;
+
+ args[i].column_family = column_families[i];
+ args[i].external_files.push_back(external_file_path);
+ args[i].options = ifos[i];
+ }
+ s = db_->IngestExternalFiles(args);
+ return s;
+ }
+
+ Status GenerateAndAddExternalFile(
+ const Options options, std::vector<std::pair<int, std::string>> data,
+ int file_id = -1, bool allow_global_seqno = false,
+ bool write_global_seqno = false,
+ bool verify_checksums_before_ingest = true, bool ingest_behind = false,
+ bool sort_data = false,
+ std::map<std::string, std::string>* true_data = nullptr,
+ ColumnFamilyHandle* cfh = nullptr) {
+ std::vector<std::pair<std::string, std::string>> file_data;
+ for (auto& entry : data) {
+ file_data.emplace_back(Key(entry.first), entry.second);
+ }
+ return GenerateAndAddExternalFile(options, file_data, file_id,
+ allow_global_seqno, write_global_seqno,
+ verify_checksums_before_ingest,
+ ingest_behind, sort_data, true_data, cfh);
+ }
+
+ Status GenerateAndAddExternalFile(
+ const Options options, std::vector<int> keys, int file_id = -1,
+ bool allow_global_seqno = false, bool write_global_seqno = false,
+ bool verify_checksums_before_ingest = true, bool ingest_behind = false,
+ bool sort_data = false,
+ std::map<std::string, std::string>* true_data = nullptr,
+ ColumnFamilyHandle* cfh = nullptr) {
+ std::vector<std::pair<std::string, std::string>> file_data;
+ for (auto& k : keys) {
+ file_data.emplace_back(Key(k), Key(k) + ToString(file_id));
+ }
+ return GenerateAndAddExternalFile(options, file_data, file_id,
+ allow_global_seqno, write_global_seqno,
+ verify_checksums_before_ingest,
+ ingest_behind, sort_data, true_data, cfh);
+ }
+
+ Status DeprecatedAddFile(const std::vector<std::string>& files,
+ bool move_files = false,
+ bool skip_snapshot_check = false,
+ bool skip_write_global_seqno = false) {
+ IngestExternalFileOptions opts;
+ opts.move_files = move_files;
+ opts.snapshot_consistency = !skip_snapshot_check;
+ opts.allow_global_seqno = false;
+ opts.allow_blocking_flush = false;
+ opts.write_global_seqno = !skip_write_global_seqno;
+ return db_->IngestExternalFile(files, opts);
+ }
+
+ ~ExternalSSTFileTest() override { test::DestroyDir(env_, sst_files_dir_); }
+
+ protected:
+ int last_file_id_ = 0;
+ std::string sst_files_dir_;
+};
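+ // The GenerateAndAddExternalFile() overloads above differ only in how the
+ // data is specified (string key/value pairs, (int, value) pairs, or bare
+ // int keys); the latter two build string pairs with Key() and forward to
+ // the first overload.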
+
+TEST_F(ExternalSSTFileTest, Basic) {
+ do {
+ Options options = CurrentOptions();
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // Current file size should be 0 after sst_file_writer init and before opening a file.
+ ASSERT_EQ(sst_file_writer.FileSize(), 0);
+
+ // file1.sst (0 => 99)
+ std::string file1 = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ for (int k = 0; k < 100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file1_info;
+ Status s = sst_file_writer.Finish(&file1_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+
+ // Current file size should be non-zero after a successful write.
+ ASSERT_GT(sst_file_writer.FileSize(), 0);
+
+ ASSERT_EQ(file1_info.file_path, file1);
+ ASSERT_EQ(file1_info.num_entries, 100);
+ ASSERT_EQ(file1_info.smallest_key, Key(0));
+ ASSERT_EQ(file1_info.largest_key, Key(99));
+ ASSERT_EQ(file1_info.num_range_del_entries, 0);
+ ASSERT_EQ(file1_info.smallest_range_del_key, "");
+ ASSERT_EQ(file1_info.largest_range_del_key, "");
+ // sst_file_writer has already finished; cannot add this value
+ s = sst_file_writer.Put(Key(100), "bad_val");
+ ASSERT_FALSE(s.ok()) << s.ToString();
+
+ // file2.sst (100 => 199)
+ std::string file2 = sst_files_dir_ + "file2.sst";
+ ASSERT_OK(sst_file_writer.Open(file2));
+ for (int k = 100; k < 200; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ // Cannot add this key because it's not after last added key
+ s = sst_file_writer.Put(Key(99), "bad_val");
+ ASSERT_FALSE(s.ok()) << s.ToString();
+ ExternalSstFileInfo file2_info;
+ s = sst_file_writer.Finish(&file2_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file2_info.file_path, file2);
+ ASSERT_EQ(file2_info.num_entries, 100);
+ ASSERT_EQ(file2_info.smallest_key, Key(100));
+ ASSERT_EQ(file2_info.largest_key, Key(199));
+
+ // file3.sst (195 => 299)
+ // This file's values overlap with file2's values
+ std::string file3 = sst_files_dir_ + "file3.sst";
+ ASSERT_OK(sst_file_writer.Open(file3));
+ for (int k = 195; k < 300; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+ }
+ ExternalSstFileInfo file3_info;
+ s = sst_file_writer.Finish(&file3_info);
+
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ // Current file size should be non-zero after a successful Finish().
+ ASSERT_GT(sst_file_writer.FileSize(), 0);
+ ASSERT_EQ(file3_info.file_path, file3);
+ ASSERT_EQ(file3_info.num_entries, 105);
+ ASSERT_EQ(file3_info.smallest_key, Key(195));
+ ASSERT_EQ(file3_info.largest_key, Key(299));
+
+ // file4.sst (30 => 39)
+ // This file's values overlap with file1's values
+ std::string file4 = sst_files_dir_ + "file4.sst";
+ ASSERT_OK(sst_file_writer.Open(file4));
+ for (int k = 30; k < 40; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+ }
+ ExternalSstFileInfo file4_info;
+ s = sst_file_writer.Finish(&file4_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file4_info.file_path, file4);
+ ASSERT_EQ(file4_info.num_entries, 10);
+ ASSERT_EQ(file4_info.smallest_key, Key(30));
+ ASSERT_EQ(file4_info.largest_key, Key(39));
+
+ // file5.sst (400 => 499)
+ std::string file5 = sst_files_dir_ + "file5.sst";
+ ASSERT_OK(sst_file_writer.Open(file5));
+ for (int k = 400; k < 500; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file5_info;
+ s = sst_file_writer.Finish(&file5_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file5_info.file_path, file5);
+ ASSERT_EQ(file5_info.num_entries, 100);
+ ASSERT_EQ(file5_info.smallest_key, Key(400));
+ ASSERT_EQ(file5_info.largest_key, Key(499));
+
+ // file6.sst (delete 400 => 500)
+ std::string file6 = sst_files_dir_ + "file6.sst";
+ ASSERT_OK(sst_file_writer.Open(file6));
+ sst_file_writer.DeleteRange(Key(400), Key(500));
+ ExternalSstFileInfo file6_info;
+ s = sst_file_writer.Finish(&file6_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file6_info.file_path, file6);
+ ASSERT_EQ(file6_info.num_entries, 0);
+ ASSERT_EQ(file6_info.smallest_key, "");
+ ASSERT_EQ(file6_info.largest_key, "");
+ ASSERT_EQ(file6_info.num_range_del_entries, 1);
+ ASSERT_EQ(file6_info.smallest_range_del_key, Key(400));
+ ASSERT_EQ(file6_info.largest_range_del_key, Key(500));
+
+ // file7.sst (delete 500 => 575, put even keys 520 => 598)
+ std::string file7 = sst_files_dir_ + "file7.sst";
+ ASSERT_OK(sst_file_writer.Open(file7));
+ sst_file_writer.DeleteRange(Key(500), Key(550));
+ for (int k = 520; k < 560; k += 2) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ sst_file_writer.DeleteRange(Key(525), Key(575));
+ for (int k = 560; k < 600; k += 2) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file7_info;
+ s = sst_file_writer.Finish(&file7_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file7_info.file_path, file7);
+ ASSERT_EQ(file7_info.num_entries, 40);
+ ASSERT_EQ(file7_info.smallest_key, Key(520));
+ ASSERT_EQ(file7_info.largest_key, Key(598));
+ ASSERT_EQ(file7_info.num_range_del_entries, 2);
+ ASSERT_EQ(file7_info.smallest_range_del_key, Key(500));
+ ASSERT_EQ(file7_info.largest_range_del_key, Key(575));
+
+ // file8.sst (delete 600 => 700)
+ std::string file8 = sst_files_dir_ + "file8.sst";
+ ASSERT_OK(sst_file_writer.Open(file8));
+ sst_file_writer.DeleteRange(Key(600), Key(700));
+ ExternalSstFileInfo file8_info;
+ s = sst_file_writer.Finish(&file8_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file8_info.file_path, file8);
+ ASSERT_EQ(file8_info.num_entries, 0);
+ ASSERT_EQ(file8_info.smallest_key, "");
+ ASSERT_EQ(file8_info.largest_key, "");
+ ASSERT_EQ(file8_info.num_range_del_entries, 1);
+ ASSERT_EQ(file8_info.smallest_range_del_key, Key(600));
+ ASSERT_EQ(file8_info.largest_range_del_key, Key(700));
+
+ // Cannot create an empty sst file
+ std::string file_empty = sst_files_dir_ + "file_empty.sst";
+ ExternalSstFileInfo file_empty_info;
+ s = sst_file_writer.Finish(&file_empty_info);
+ ASSERT_NOK(s);
+
+ DestroyAndReopen(options);
+ // Add file using file path
+ s = DeprecatedAddFile({file1});
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+ for (int k = 0; k < 100; k++) {
+ ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+ }
+
+ // Adding a file while holding a snapshot will fail
+ const Snapshot* s1 = db_->GetSnapshot();
+ if (s1 != nullptr) {
+ ASSERT_NOK(DeprecatedAddFile({file2}));
+ db_->ReleaseSnapshot(s1);
+ }
+ // We can add the file after releasing the snapshot
+ ASSERT_OK(DeprecatedAddFile({file2}));
+
+ ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+ for (int k = 0; k < 200; k++) {
+ ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+ }
+
+ // This file has overlapping values with the existing data
+ s = DeprecatedAddFile({file3});
+ ASSERT_FALSE(s.ok()) << s.ToString();
+
+ // This file has overlapping values with the existing data
+ s = DeprecatedAddFile({file4});
+ ASSERT_FALSE(s.ok()) << s.ToString();
+
+ // Overwrite values of keys divisible by 5
+ for (int k = 0; k < 200; k += 5) {
+ ASSERT_OK(Put(Key(k), Key(k) + "_val_new"));
+ }
+ ASSERT_NE(db_->GetLatestSequenceNumber(), 0U);
+
+ // Key range of file5 (400 => 499) doesn't overlap with any keys in the DB
+ ASSERT_OK(DeprecatedAddFile({file5}));
+
+ // This file has overlapping values with the existing data
+ s = DeprecatedAddFile({file6});
+ ASSERT_FALSE(s.ok()) << s.ToString();
+
+ // Key range of file7 (500 => 598) doesn't overlap with any keys in the DB
+ ASSERT_OK(DeprecatedAddFile({file7}));
+
+ // Key range of file8 (600 => 700) doesn't overlap with any keys in the DB
+ ASSERT_OK(DeprecatedAddFile({file8}));
+
+ // Make sure values are correct before and after flush/compaction
+ for (int i = 0; i < 2; i++) {
+ for (int k = 0; k < 200; k++) {
+ std::string value = Key(k) + "_val";
+ if (k % 5 == 0) {
+ value += "_new";
+ }
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ for (int k = 400; k < 500; k++) {
+ std::string value = Key(k) + "_val";
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ for (int k = 500; k < 600; k++) {
+ std::string value = Key(k) + "_val";
+ if (k < 520 || k % 2 == 1) {
+ value = "NOT_FOUND";
+ }
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ }
+
+ Close();
+ options.disable_auto_compactions = true;
+ Reopen(options);
+
+ // Delete keys in range (400 => 499)
+ for (int k = 400; k < 500; k++) {
+ ASSERT_OK(Delete(Key(k)));
+ }
+ // We deleted range (400 => 499) but cannot add file5 because
+ // of the range tombstones
+ ASSERT_NOK(DeprecatedAddFile({file5}));
+
+ // Compacting the DB will remove the tombstones
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // Now we can add the file
+ ASSERT_OK(DeprecatedAddFile({file5}));
+
+ // Verify values of file5 in DB
+ for (int k = 400; k < 500; k++) {
+ std::string value = Key(k) + "_val";
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ DestroyAndRecreateExternalSSTFilesDir();
+ } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction |
+ kRangeDelSkipConfigs));
+}
+
+class SstFileWriterCollector : public TablePropertiesCollector {
+ public:
+ explicit SstFileWriterCollector(const std::string prefix) : prefix_(prefix) {
+ name_ = prefix_ + "_SstFileWriterCollector";
+ }
+
+ const char* Name() const override { return name_.c_str(); }
+
+ Status Finish(UserCollectedProperties* properties) override {
+ std::string count = std::to_string(count_);
+ *properties = UserCollectedProperties{
+ {prefix_ + "_SstFileWriterCollector", "YES"},
+ {prefix_ + "_Count", count},
+ };
+ return Status::OK();
+ }
+
+ Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/,
+ EntryType /*type*/, SequenceNumber /*seq*/,
+ uint64_t /*file_size*/) override {
+ ++count_;
+ return Status::OK();
+ }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties{};
+ }
+
+ private:
+ uint32_t count_ = 0;
+ std::string prefix_;
+ std::string name_;
+};
+
+class SstFileWriterCollectorFactory : public TablePropertiesCollectorFactory {
+ public:
+ explicit SstFileWriterCollectorFactory(std::string prefix)
+ : prefix_(prefix), num_created_(0) {}
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context /*context*/) override {
+ num_created_++;
+ return new SstFileWriterCollector(prefix_);
+ }
+ const char* Name() const override { return "SstFileWriterCollectorFactory"; }
+
+ std::string prefix_;
+ uint32_t num_created_;
+};
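+ // These collectors run while SstFileWriter builds each external file, so
+ // every ingested table carries the abc_/xyz_ user properties that the
+ // AddList test below verifies through GetPropertiesOfAllTables().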
+
+TEST_F(ExternalSSTFileTest, AddList) {
+ do {
+ Options options = CurrentOptions();
+
+ auto abc_collector = std::make_shared<SstFileWriterCollectorFactory>("abc");
+ auto xyz_collector = std::make_shared<SstFileWriterCollectorFactory>("xyz");
+
+ options.table_properties_collector_factories.emplace_back(abc_collector);
+ options.table_properties_collector_factories.emplace_back(xyz_collector);
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // file1.sst (0 => 99)
+ std::string file1 = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ for (int k = 0; k < 100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file1_info;
+ Status s = sst_file_writer.Finish(&file1_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file1_info.file_path, file1);
+ ASSERT_EQ(file1_info.num_entries, 100);
+ ASSERT_EQ(file1_info.smallest_key, Key(0));
+ ASSERT_EQ(file1_info.largest_key, Key(99));
+ // sst_file_writer has already finished; cannot add this value
+ s = sst_file_writer.Put(Key(100), "bad_val");
+ ASSERT_FALSE(s.ok()) << s.ToString();
+
+ // file2.sst (100 => 199)
+ std::string file2 = sst_files_dir_ + "file2.sst";
+ ASSERT_OK(sst_file_writer.Open(file2));
+ for (int k = 100; k < 200; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ // Cannot add this key because it's not after last added key
+ s = sst_file_writer.Put(Key(99), "bad_val");
+ ASSERT_FALSE(s.ok()) << s.ToString();
+ ExternalSstFileInfo file2_info;
+ s = sst_file_writer.Finish(&file2_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file2_info.file_path, file2);
+ ASSERT_EQ(file2_info.num_entries, 100);
+ ASSERT_EQ(file2_info.smallest_key, Key(100));
+ ASSERT_EQ(file2_info.largest_key, Key(199));
+
+ // file3.sst (195 => 199)
+ // This file's values overlap with file2's values
+ std::string file3 = sst_files_dir_ + "file3.sst";
+ ASSERT_OK(sst_file_writer.Open(file3));
+ for (int k = 195; k < 200; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+ }
+ ExternalSstFileInfo file3_info;
+ s = sst_file_writer.Finish(&file3_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file3_info.file_path, file3);
+ ASSERT_EQ(file3_info.num_entries, 5);
+ ASSERT_EQ(file3_info.smallest_key, Key(195));
+ ASSERT_EQ(file3_info.largest_key, Key(199));
+
+ // file4.sst (30 => 39)
+ // This file's values overlap with file1's values
+ std::string file4 = sst_files_dir_ + "file4.sst";
+ ASSERT_OK(sst_file_writer.Open(file4));
+ for (int k = 30; k < 40; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+ }
+ ExternalSstFileInfo file4_info;
+ s = sst_file_writer.Finish(&file4_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file4_info.file_path, file4);
+ ASSERT_EQ(file4_info.num_entries, 10);
+ ASSERT_EQ(file4_info.smallest_key, Key(30));
+ ASSERT_EQ(file4_info.largest_key, Key(39));
+
+ // file5.sst (200 => 299)
+ std::string file5 = sst_files_dir_ + "file5.sst";
+ ASSERT_OK(sst_file_writer.Open(file5));
+ for (int k = 200; k < 300; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file5_info;
+ s = sst_file_writer.Finish(&file5_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file5_info.file_path, file5);
+ ASSERT_EQ(file5_info.num_entries, 100);
+ ASSERT_EQ(file5_info.smallest_key, Key(200));
+ ASSERT_EQ(file5_info.largest_key, Key(299));
+
+ // file6.sst (delete 0 => 100)
+ std::string file6 = sst_files_dir_ + "file6.sst";
+ ASSERT_OK(sst_file_writer.Open(file6));
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(0), Key(75)));
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(25), Key(100)));
+ ExternalSstFileInfo file6_info;
+ s = sst_file_writer.Finish(&file6_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file6_info.file_path, file6);
+ ASSERT_EQ(file6_info.num_entries, 0);
+ ASSERT_EQ(file6_info.smallest_key, "");
+ ASSERT_EQ(file6_info.largest_key, "");
+ ASSERT_EQ(file6_info.num_range_del_entries, 2);
+ ASSERT_EQ(file6_info.smallest_range_del_key, Key(0));
+ ASSERT_EQ(file6_info.largest_range_del_key, Key(100));
+
+ // file7.sst (delete 99 => 201)
+ std::string file7 = sst_files_dir_ + "file7.sst";
+ ASSERT_OK(sst_file_writer.Open(file7));
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(99), Key(201)));
+ ExternalSstFileInfo file7_info;
+ s = sst_file_writer.Finish(&file7_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file7_info.file_path, file7);
+ ASSERT_EQ(file7_info.num_entries, 0);
+ ASSERT_EQ(file7_info.smallest_key, "");
+ ASSERT_EQ(file7_info.largest_key, "");
+ ASSERT_EQ(file7_info.num_range_del_entries, 1);
+ ASSERT_EQ(file7_info.smallest_range_del_key, Key(99));
+ ASSERT_EQ(file7_info.largest_range_del_key, Key(201));
+
+ // file_list1 has an internal key range conflict
+ std::vector<std::string> file_list0({file1, file2});
+ std::vector<std::string> file_list1({file3, file2, file1});
+ std::vector<std::string> file_list2({file5});
+ std::vector<std::string> file_list3({file3, file4});
+ std::vector<std::string> file_list4({file5, file7});
+ std::vector<std::string> file_list5({file6, file7});
+
+ DestroyAndReopen(options);
+
+ // These lists of files have key ranges that overlap with each other
+ s = DeprecatedAddFile(file_list1);
+ ASSERT_FALSE(s.ok()) << s.ToString();
+ // Both of the following overlap on the range deletion tombstone.
+ s = DeprecatedAddFile(file_list4);
+ ASSERT_FALSE(s.ok()) << s.ToString();
+ s = DeprecatedAddFile(file_list5);
+ ASSERT_FALSE(s.ok()) << s.ToString();
+
+ // Add files using file path list
+ s = DeprecatedAddFile(file_list0);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+ for (int k = 0; k < 200; k++) {
+ ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+ }
+
+ TablePropertiesCollection props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+ ASSERT_EQ(props.size(), 2);
+ for (auto file_props : props) {
+ auto user_props = file_props.second->user_collected_properties;
+ ASSERT_EQ(user_props["abc_SstFileWriterCollector"], "YES");
+ ASSERT_EQ(user_props["xyz_SstFileWriterCollector"], "YES");
+ ASSERT_EQ(user_props["abc_Count"], "100");
+ ASSERT_EQ(user_props["xyz_Count"], "100");
+ }
+
+ // Adding a file while holding a snapshot will fail
+ const Snapshot* s1 = db_->GetSnapshot();
+ if (s1 != nullptr) {
+ ASSERT_NOK(DeprecatedAddFile(file_list2));
+ db_->ReleaseSnapshot(s1);
+ }
+ // We can add the file after releasing the snapshot
+ ASSERT_OK(DeprecatedAddFile(file_list2));
+ ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+ for (int k = 0; k < 300; k++) {
+ ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+ }
+
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+ ASSERT_EQ(props.size(), 3);
+ for (auto file_props : props) {
+ auto user_props = file_props.second->user_collected_properties;
+ ASSERT_EQ(user_props["abc_SstFileWriterCollector"], "YES");
+ ASSERT_EQ(user_props["xyz_SstFileWriterCollector"], "YES");
+ ASSERT_EQ(user_props["abc_Count"], "100");
+ ASSERT_EQ(user_props["xyz_Count"], "100");
+ }
+
+ // This file list has keys that overlap with the existing data
+ s = DeprecatedAddFile(file_list3);
+ ASSERT_FALSE(s.ok()) << s.ToString();
+
+ // Overwrite values of keys divisible by 5
+ for (int k = 0; k < 200; k += 5) {
+ ASSERT_OK(Put(Key(k), Key(k) + "_val_new"));
+ }
+ ASSERT_NE(db_->GetLatestSequenceNumber(), 0U);
+
+ // Make sure values are correct before and after flush/compaction
+ for (int i = 0; i < 2; i++) {
+ for (int k = 0; k < 200; k++) {
+ std::string value = Key(k) + "_val";
+ if (k % 5 == 0) {
+ value += "_new";
+ }
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ for (int k = 200; k < 300; k++) {
+ std::string value = Key(k) + "_val";
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ }
+
+ // Delete keys in range (200 => 299)
+ for (int k = 200; k < 300; k++) {
+ ASSERT_OK(Delete(Key(k)));
+ }
+ // We deleted the range (200 => 299), but we still cannot add file5 because
+ // of the tombstones left by those deletes
+ ASSERT_NOK(DeprecatedAddFile(file_list2));
+
+ // Compacting the DB will remove the tombstones
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // Now we can add the file
+ ASSERT_OK(DeprecatedAddFile(file_list2));
+
+ // Verify values of file5 in DB
+ for (int k = 200; k < 300; k++) {
+ std::string value = Key(k) + "_val";
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ DestroyAndRecreateExternalSSTFilesDir();
+ } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction |
+ kRangeDelSkipConfigs));
+}
+
+TEST_F(ExternalSSTFileTest, AddListAtomicity) {
+ do {
+ Options options = CurrentOptions();
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // files[0].sst (0 => 99)
+ // files[1].sst (100 => 199)
+ // ...
+ // files[8].sst (800 => 899)
+ int n = 9;
+ std::vector<std::string> files(n);
+ std::vector<ExternalSstFileInfo> files_info(n);
+ for (int i = 0; i < n; i++) {
+ files[i] = sst_files_dir_ + "file" + std::to_string(i) + ".sst";
+ ASSERT_OK(sst_file_writer.Open(files[i]));
+ for (int k = i * 100; k < (i + 1) * 100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ Status s = sst_file_writer.Finish(&files_info[i]);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(files_info[i].file_path, files[i]);
+ ASSERT_EQ(files_info[i].num_entries, 100);
+ ASSERT_EQ(files_info[i].smallest_key, Key(i * 100));
+ ASSERT_EQ(files_info[i].largest_key, Key((i + 1) * 100 - 1));
+ }
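+ // Append one more path that was never written; the whole AddFile call must
+ // fail and none of the valid files should become visible.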
+ files.push_back(sst_files_dir_ + "file" + std::to_string(n) + ".sst");
+ auto s = DeprecatedAddFile(files);
+ ASSERT_NOK(s) << s.ToString();
+ for (int k = 0; k < n * 100; k++) {
+ ASSERT_EQ("NOT_FOUND", Get(Key(k)));
+ }
+ files.pop_back();
+ ASSERT_OK(DeprecatedAddFile(files));
+ for (int k = 0; k < n * 100; k++) {
+ std::string value = Key(k) + "_val";
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ DestroyAndRecreateExternalSSTFilesDir();
+ } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction));
+}
+// This test reproduces a bug that can happen when the DB starts purging
+// obsolete files while we are adding an external SST file. That situation
+// may result in the file being deleted while it is being added.
+TEST_F(ExternalSSTFileTest, PurgeObsoleteFilesBug) {
+ Options options = CurrentOptions();
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // file1.sst (0 => 500)
+ std::string sst_file_path = sst_files_dir_ + "file1.sst";
+ Status s = sst_file_writer.Open(sst_file_path);
+ ASSERT_OK(s);
+ for (int i = 0; i < 500; i++) {
+ std::string k = Key(i);
+ s = sst_file_writer.Put(k, k + "_val");
+ ASSERT_OK(s);
+ }
+
+ ExternalSstFileInfo sst_file_info;
+ s = sst_file_writer.Finish(&sst_file_info);
+ ASSERT_OK(s);
+
+ options.delete_obsolete_files_period_micros = 0;
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
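+ // While the ingestion job is preparing the file, force flushes and a
+ // compaction so that obsolete file purging runs concurrently with the add.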
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ExternalSstFileIngestionJob::Prepare:FileAdded", [&](void* /* arg */) {
+ ASSERT_OK(Put("aaa", "bbb"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("aaa", "xxx"));
+ ASSERT_OK(Flush());
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ s = DeprecatedAddFile({sst_file_path});
+ ASSERT_OK(s);
+
+ for (int i = 0; i < 500; i++) {
+ std::string k = Key(i);
+ std::string v = k + "_val";
+ ASSERT_EQ(Get(k), v);
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, SkipSnapshot) {
+ Options options = CurrentOptions();
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // file1.sst (0 => 99)
+ std::string file1 = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ for (int k = 0; k < 100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file1_info;
+ Status s = sst_file_writer.Finish(&file1_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file1_info.file_path, file1);
+ ASSERT_EQ(file1_info.num_entries, 100);
+ ASSERT_EQ(file1_info.smallest_key, Key(0));
+ ASSERT_EQ(file1_info.largest_key, Key(99));
+
+ // file2.sst (100 => 299)
+ std::string file2 = sst_files_dir_ + "file2.sst";
+ ASSERT_OK(sst_file_writer.Open(file2));
+ for (int k = 100; k < 300; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file2_info;
+ s = sst_file_writer.Finish(&file2_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file2_info.file_path, file2);
+ ASSERT_EQ(file2_info.num_entries, 200);
+ ASSERT_EQ(file2_info.smallest_key, Key(100));
+ ASSERT_EQ(file2_info.largest_key, Key(299));
+
+ ASSERT_OK(DeprecatedAddFile({file1}));
+
+ // Adding a file will fail while holding a snapshot when the default
+ // skip_snapshot_check (false) is used
+ const Snapshot* s1 = db_->GetSnapshot();
+ if (s1 != nullptr) {
+ ASSERT_NOK(DeprecatedAddFile({file2}));
+ }
+
+ // Adding a file succeeds when skip_snapshot_check is set to true, even
+ // while the DB holds a snapshot
+ if (s1 != nullptr) {
+ ASSERT_OK(DeprecatedAddFile({file2}, false, true));
+ db_->ReleaseSnapshot(s1);
+ }
+
+ // file3.sst (300 => 399)
+ std::string file3 = sst_files_dir_ + "file3.sst";
+ ASSERT_OK(sst_file_writer.Open(file3));
+ for (int k = 300; k < 400; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file3_info;
+ s = sst_file_writer.Finish(&file3_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file3_info.file_path, file3);
+ ASSERT_EQ(file3_info.num_entries, 100);
+ ASSERT_EQ(file3_info.smallest_key, Key(300));
+ ASSERT_EQ(file3_info.largest_key, Key(399));
+
+ // Key(300) should not exist before ingesting file3
+ ASSERT_EQ(Get(Key(300)), "NOT_FOUND");
+ const Snapshot* s2 = db_->GetSnapshot();
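+ // With skip_snapshot_check the ingested value becomes visible even through
+ // the snapshot taken just above.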
+ ASSERT_OK(DeprecatedAddFile({file3}, false, true));
+ ASSERT_EQ(Get(Key(300)), Key(300) + ("_val"));
+ ASSERT_EQ(Get(Key(300), s2), Key(300) + ("_val"));
+
+ db_->ReleaseSnapshot(s2);
+}
+
+TEST_F(ExternalSSTFileTest, MultiThreaded) {
+ // Bulk load 10 files, each containing 1000 keys
+ int num_files = 10;
+ int keys_per_file = 1000;
+
+ // Generate file names
+ std::vector<std::string> file_names;
+ for (int i = 0; i < num_files; i++) {
+ std::string file_name = "file_" + ToString(i) + ".sst";
+ file_names.push_back(sst_files_dir_ + file_name);
+ }
+
+ do {
+ Options options = CurrentOptions();
+
+ std::atomic<int> thread_num(0);
+ std::function<void()> write_file_func = [&]() {
+ int file_idx = thread_num.fetch_add(1);
+ int range_start = file_idx * keys_per_file;
+ int range_end = range_start + keys_per_file;
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ ASSERT_OK(sst_file_writer.Open(file_names[file_idx]));
+
+ for (int k = range_start; k < range_end; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k)));
+ }
+
+ Status s = sst_file_writer.Finish();
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ };
+ // Write num_files files in parallel
+ std::vector<port::Thread> sst_writer_threads;
+ for (int i = 0; i < num_files; ++i) {
+ sst_writer_threads.emplace_back(write_file_func);
+ }
+
+ for (auto& t : sst_writer_threads) {
+ t.join();
+ }
+
+ fprintf(stderr, "Wrote %d files (%d keys)\n", num_files,
+ num_files * keys_per_file);
+
+ thread_num.store(0);
+ std::atomic<int> files_added(0);
+ // Thread 0 -> Load {f0,f1}
+ // Thread 1 -> Load {f0,f1}
+ // Thread 2 -> Load {f2,f3}
+ // Thread 3 -> Load {f2,f3}
+ // Thread 4 -> Load {f4,f5}
+ // Thread 5 -> Load {f4,f5}
+ // ...
+ std::function<void()> load_file_func = [&]() {
+ // We intentionally add every file twice, and assert that it was added
+ // only once and the other add failed
+ int thread_id = thread_num.fetch_add(1);
+ int file_idx = (thread_id / 2) * 2;
+ // Sometimes we copy the file, sometimes we move (link) it; the result
+ // should be the same
+ bool move_file = (thread_id % 3 == 0);
+
+ std::vector<std::string> files_to_add;
+
+ files_to_add = {file_names[file_idx]};
+ if (static_cast<size_t>(file_idx + 1) < file_names.size()) {
+ files_to_add.push_back(file_names[file_idx + 1]);
+ }
+
+ Status s = DeprecatedAddFile(files_to_add, move_file);
+ if (s.ok()) {
+ files_added += static_cast<int>(files_to_add.size());
+ }
+ };
+
+ // Bulk load num_files files in parallel
+ std::vector<port::Thread> add_file_threads;
+ DestroyAndReopen(options);
+ for (int i = 0; i < num_files; ++i) {
+ add_file_threads.emplace_back(load_file_func);
+ }
+
+ for (auto& t : add_file_threads) {
+ t.join();
+ }
+ ASSERT_EQ(files_added.load(), num_files);
+ fprintf(stderr, "Loaded %d files (%d keys)\n", num_files,
+ num_files * keys_per_file);
+
+ // Overwrite values of keys divisible by 100
+ for (int k = 0; k < num_files * keys_per_file; k += 100) {
+ std::string key = Key(k);
+ Status s = Put(key, key + "_new");
+ ASSERT_TRUE(s.ok());
+ }
+
+ for (int i = 0; i < 2; i++) {
+ // Make sure the values are correct before and after flush/compaction
+ for (int k = 0; k < num_files * keys_per_file; ++k) {
+ std::string key = Key(k);
+ std::string value = (k % 100 == 0) ? (key + "_new") : key;
+ ASSERT_EQ(Get(key), value);
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ }
+
+ fprintf(stderr, "Verified %d values\n", num_files * keys_per_file);
+ DestroyAndRecreateExternalSSTFilesDir();
+ } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction));
+}
+
+TEST_F(ExternalSSTFileTest, OverlappingRanges) {
+ Random rnd(301);
+ SequenceNumber assigned_seqno = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ExternalSstFileIngestionJob::Run", [&assigned_seqno](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ assigned_seqno = *(static_cast<SequenceNumber*>(arg));
+ });
+ bool need_flush = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::IngestExternalFile:NeedFlush", [&need_flush](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ need_flush = *(static_cast<bool*>(arg));
+ });
+ bool overlap_with_db = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile",
+ [&overlap_with_db](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ overlap_with_db = *(static_cast<bool*>(arg));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
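+ // The flags recorded above are used below: under universal compaction,
+ // ingestion is expected to fail whenever a flush was needed, a non-zero
+ // global seqno was assigned, or the file overlapped with the DB.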
+ do {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ printf("Option config = %d\n", option_config_);
+ std::vector<std::pair<int, int>> key_ranges;
+ for (int i = 0; i < 100; i++) {
+ int range_start = rnd.Uniform(20000);
+ int keys_per_range = 10 + rnd.Uniform(41);
+
+ key_ranges.emplace_back(range_start, range_start + keys_per_range);
+ }
+
+ int memtable_add = 0;
+ int success_add_file = 0;
+ int failed_add_file = 0;
+ std::map<std::string, std::string> true_data;
+ for (size_t i = 0; i < key_ranges.size(); i++) {
+ int range_start = key_ranges[i].first;
+ int range_end = key_ranges[i].second;
+
+ Status s;
+ std::string range_val = "range_" + ToString(i);
+
+ // For 20% of ranges we use DB::Put, for 80% we use DB::AddFile
+ if (i && i % 5 == 0) {
+ // Use DB::Put to insert range (insert into memtable)
+ range_val += "_put";
+ for (int k = range_start; k <= range_end; k++) {
+ s = Put(Key(k), range_val);
+ ASSERT_OK(s);
+ }
+ memtable_add++;
+ } else {
+ // Use DB::AddFile to insert range
+ range_val += "_add_file";
+
+ // Generate the file containing the range
+ std::string file_name = sst_files_dir_ + env_->GenerateUniqueId();
+ ASSERT_OK(sst_file_writer.Open(file_name));
+ for (int k = range_start; k <= range_end; k++) {
+ s = sst_file_writer.Put(Key(k), range_val);
+ ASSERT_OK(s);
+ }
+ ExternalSstFileInfo file_info;
+ s = sst_file_writer.Finish(&file_info);
+ ASSERT_OK(s);
+
+ // Insert the generated file
+ s = DeprecatedAddFile({file_name});
+ auto it = true_data.lower_bound(Key(range_start));
+ if (option_config_ != kUniversalCompaction &&
+ option_config_ != kUniversalCompactionMultiLevel &&
+ option_config_ != kUniversalSubcompactions) {
+ if (it != true_data.end() && it->first <= Key(range_end)) {
+ // This range overlaps with data that already exists in the DB
+ ASSERT_NOK(s);
+ failed_add_file++;
+ } else {
+ ASSERT_OK(s);
+ success_add_file++;
+ }
+ } else {
+ if ((it != true_data.end() && it->first <= Key(range_end)) ||
+ need_flush || assigned_seqno > 0 || overlap_with_db) {
+ // This range overlaps with data that already exists in the DB
+ ASSERT_NOK(s);
+ failed_add_file++;
+ } else {
+ ASSERT_OK(s);
+ success_add_file++;
+ }
+ }
+ }
+
+ if (s.ok()) {
+ // Update true_data map to include the new inserted data
+ for (int k = range_start; k <= range_end; k++) {
+ true_data[Key(k)] = range_val;
+ }
+ }
+
+ // Flush / Compact the DB
+ if (i && i % 50 == 0) {
+ Flush();
+ }
+ if (i && i % 75 == 0) {
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ }
+ }
+
+ printf("Total: %" ROCKSDB_PRIszt
+ " ranges\n"
+ "AddFile()|Success: %d ranges\n"
+ "AddFile()|RangeConflict: %d ranges\n"
+ "Put(): %d ranges\n",
+ key_ranges.size(), success_add_file, failed_add_file, memtable_add);
+
+ // Verify the correctness of the data
+ for (const auto& kv : true_data) {
+ ASSERT_EQ(Get(kv.first), kv.second);
+ }
+ printf("keys/values verified\n");
+ DestroyAndRecreateExternalSSTFilesDir();
+ } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction));
+}
+
+TEST_P(ExternalSSTFileTest, PickedLevel) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = false;
+ options.level0_file_num_compaction_trigger = 4;
+ options.num_levels = 4;
+ DestroyAndReopen(options);
+
+ std::map<std::string, std::string> true_data;
+
+ // File 0 will go to last level (L3)
+ ASSERT_OK(GenerateAndAddExternalFile(options, {1, 10}, -1, false, false, true,
+ false, false, &true_data));
+ EXPECT_EQ(FilesPerLevel(), "0,0,0,1");
+
+ // File 1 will go to level L2 (since it overlaps with file 0 in L3)
+ ASSERT_OK(GenerateAndAddExternalFile(options, {2, 9}, -1, false, false, true,
+ false, false, &true_data));
+ EXPECT_EQ(FilesPerLevel(), "0,0,1,1");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"ExternalSSTFileTest::PickedLevel:0", "BackgroundCallCompaction:0"},
+ {"DBImpl::BackgroundCompaction:Start",
+ "ExternalSSTFileTest::PickedLevel:1"},
+ {"ExternalSSTFileTest::PickedLevel:2",
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"},
+ });
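+ // With these dependencies the compaction starts only after PickedLevel:0
+ // and is held at NonTrivial:AfterRun until PickedLevel:2, so the ingestions
+ // below happen while the compaction is running.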
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Flush 4 files containing the same keys
+ for (int i = 0; i < 4; i++) {
+ ASSERT_OK(Put(Key(3), Key(3) + "put"));
+ ASSERT_OK(Put(Key(8), Key(8) + "put"));
+ true_data[Key(3)] = Key(3) + "put";
+ true_data[Key(8)] = Key(8) + "put";
+ ASSERT_OK(Flush());
+ }
+
+ // Wait for BackgroundCompaction() to be called
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevel:0");
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevel:1");
+
+ EXPECT_EQ(FilesPerLevel(), "4,0,1,1");
+
+ // This file overlaps with file 0 (L3), file 1 (L2) and the
+ // output of compaction going to L1
+ ASSERT_OK(GenerateAndAddExternalFile(options, {4, 7}, -1, false, false, true,
+ false, false, &true_data));
+ EXPECT_EQ(FilesPerLevel(), "5,0,1,1");
+
+ // This file does not overlap with any file or with the running compaction
+ ASSERT_OK(GenerateAndAddExternalFile(options, {9000, 9001}, -1, false, false,
+ false, false, false, &true_data));
+ EXPECT_EQ(FilesPerLevel(), "5,0,1,2");
+
+ // Release the compaction so it can finish (it was held at NonTrivial:AfterRun)
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevel:2");
+
+ dbfull()->TEST_WaitForCompact();
+ EXPECT_EQ(FilesPerLevel(), "1,1,1,2");
+
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, PickedLevelBug) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = false;
+ options.level0_file_num_compaction_trigger = 3;
+ options.num_levels = 2;
+ DestroyAndReopen(options);
+
+ std::vector<int> file_keys;
+
+ // file #1 in L0
+ file_keys = {0, 5, 7};
+ for (int k : file_keys) {
+ ASSERT_OK(Put(Key(k), Key(k)));
+ }
+ ASSERT_OK(Flush());
+
+ // file #2 in L0
+ file_keys = {4, 6, 8, 9};
+ for (int k : file_keys) {
+ ASSERT_OK(Put(Key(k), Key(k)));
+ }
+ ASSERT_OK(Flush());
+
+ // We have 2 overlapping files in L0
+ EXPECT_EQ(FilesPerLevel(), "2");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::AddFile:MutexLock", "ExternalSSTFileTest::PickedLevelBug:0"},
+ {"ExternalSSTFileTest::PickedLevelBug:1", "DBImpl::AddFile:MutexUnlock"},
+ {"ExternalSSTFileTest::PickedLevelBug:2",
+ "DBImpl::RunManualCompaction:0"},
+ {"ExternalSSTFileTest::PickedLevelBug:3",
+ "DBImpl::RunManualCompaction:1"}});
+
+ std::atomic<bool> bg_compact_started(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:Start",
+ [&](void* /*arg*/) { bg_compact_started.store(true); });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Start a thread that will ask for compaction while AddFile is writing
+ // the MANIFEST
+ ROCKSDB_NAMESPACE::port::Thread bg_compact([&]() {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ });
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:2");
+
+ // Start a thread that will ingest a new file
+ ROCKSDB_NAMESPACE::port::Thread bg_addfile([&]() {
+ file_keys = {1, 2, 3};
+ ASSERT_OK(GenerateAndAddExternalFile(options, file_keys, 1));
+ });
+
+ // Wait for AddFile to start picking levels and writing MANIFEST
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:0");
+
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:3");
+
+ // We need to verify that no compactions can run while AddFile is
+ // ingesting the files into the levels it finds suitable. So we
+ // wait for 2 seconds to give compactions a chance to run during
+ // this period, and then make sure that no compactions were able to run
+ env_->SleepForMicroseconds(1000000 * 2);
+ ASSERT_FALSE(bg_compact_started.load());
+
+ // Release AddFile so it can finish writing the MANIFEST
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:1");
+
+ bg_addfile.join();
+ bg_compact.join();
+
+ dbfull()->TEST_WaitForCompact();
+
+ int total_keys = 0;
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ total_keys++;
+ }
+ ASSERT_EQ(total_keys, 10);
+
+ delete iter;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, IngestNonExistingFile) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+
+ Status s = db_->IngestExternalFile({"non_existing_file"},
+ IngestExternalFileOptions());
+ ASSERT_NOK(s);
+
+ // Verify file deletion is not impacted (verify a bug fix)
+ ASSERT_OK(Put(Key(1), Key(1)));
+ ASSERT_OK(Put(Key(9), Key(9)));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(1), Key(1)));
+ ASSERT_OK(Put(Key(9), Key(9)));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ // After full compaction, there should be only 1 file.
+ std::vector<std::string> files;
+ env_->GetChildren(dbname_, &files);
+ int num_sst_files = 0;
+ for (auto& f : files) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(f, &number, &type) && type == kTableFile) {
+ num_sst_files++;
+ }
+ }
+ ASSERT_EQ(1, num_sst_files);
+}
+
+TEST_F(ExternalSSTFileTest, CompactDuringAddFileRandom) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = false;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 2;
+ DestroyAndReopen(options);
+
+ std::function<void()> bg_compact = [&]() {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ };
+
+ int range_id = 0;
+ std::vector<int> file_keys;
+ std::function<void()> bg_addfile = [&]() {
+ ASSERT_OK(GenerateAndAddExternalFile(options, file_keys, range_id));
+ };
+
+ const int num_of_ranges = 1000;
+ std::vector<port::Thread> threads;
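+ // For each range the boundary keys go through Put/Flush while the interior
+ // keys are ingested in a background thread; every 10th range also races a
+ // manual compaction.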
+ while (range_id < num_of_ranges) {
+ int range_start = range_id * 10;
+ int range_end = range_start + 10;
+
+ file_keys.clear();
+ for (int k = range_start + 1; k < range_end; k++) {
+ file_keys.push_back(k);
+ }
+ ASSERT_OK(Put(Key(range_start), Key(range_start)));
+ ASSERT_OK(Put(Key(range_end), Key(range_end)));
+ ASSERT_OK(Flush());
+
+ if (range_id % 10 == 0) {
+ threads.emplace_back(bg_compact);
+ }
+ threads.emplace_back(bg_addfile);
+
+ for (auto& t : threads) {
+ t.join();
+ }
+ threads.clear();
+
+ range_id++;
+ }
+
+ for (int rid = 0; rid < num_of_ranges; rid++) {
+ int range_start = rid * 10;
+ int range_end = range_start + 10;
+
+ ASSERT_EQ(Get(Key(range_start)), Key(range_start)) << rid;
+ ASSERT_EQ(Get(Key(range_end)), Key(range_end)) << rid;
+ for (int k = range_start + 1; k < range_end; k++) {
+ std::string v = Key(k) + ToString(rid);
+ ASSERT_EQ(Get(Key(k)), v) << rid;
+ }
+ }
+}
+
+TEST_F(ExternalSSTFileTest, PickedLevelDynamic) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = false;
+ options.level0_file_num_compaction_trigger = 4;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.num_levels = 4;
+ DestroyAndReopen(options);
+ std::map<std::string, std::string> true_data;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"ExternalSSTFileTest::PickedLevelDynamic:0",
+ "BackgroundCallCompaction:0"},
+ {"DBImpl::BackgroundCompaction:Start",
+ "ExternalSSTFileTest::PickedLevelDynamic:1"},
+ {"ExternalSSTFileTest::PickedLevelDynamic:2",
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Flush 4 files containing the same keys
+ for (int i = 0; i < 4; i++) {
+ for (int k = 20; k <= 30; k++) {
+ ASSERT_OK(Put(Key(k), Key(k) + "put"));
+ true_data[Key(k)] = Key(k) + "put";
+ }
+ for (int k = 50; k <= 60; k++) {
+ ASSERT_OK(Put(Key(k), Key(k) + "put"));
+ true_data[Key(k)] = Key(k) + "put";
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // Wait for BackgroundCompaction() to be called
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelDynamic:0");
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelDynamic:1");
+
+ // This file overlaps with the output of the compaction (going to L3)
+ // so the file will be added to L0 since L3 is the base level
+ ASSERT_OK(GenerateAndAddExternalFile(options, {31, 32, 33, 34}, -1, false,
+ false, true, false, false, &true_data));
+ EXPECT_EQ(FilesPerLevel(), "5");
+
+ // This file does not overlap with the currently running compaction
+ ASSERT_OK(GenerateAndAddExternalFile(options, {9000, 9001}, -1, false, false,
+ true, false, false, &true_data));
+ EXPECT_EQ(FilesPerLevel(), "5,0,0,1");
+
+ // Release the compaction so it can finish (it was held at NonTrivial:AfterRun)
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelDynamic:2");
+
+ // Output of the compaction will go to L3
+ dbfull()->TEST_WaitForCompact();
+ EXPECT_EQ(FilesPerLevel(), "1,0,0,2");
+
+ Close();
+ options.disable_auto_compactions = true;
+ Reopen(options);
+
+ ASSERT_OK(GenerateAndAddExternalFile(options, {1, 15, 19}, -1, false, false,
+ true, false, false, &true_data));
+ ASSERT_EQ(FilesPerLevel(), "1,0,0,3");
+
+ ASSERT_OK(GenerateAndAddExternalFile(options, {1000, 1001, 1002}, -1, false,
+ false, true, false, false, &true_data));
+ ASSERT_EQ(FilesPerLevel(), "1,0,0,4");
+
+ ASSERT_OK(GenerateAndAddExternalFile(options, {500, 600, 700}, -1, false,
+ false, true, false, false, &true_data));
+ ASSERT_EQ(FilesPerLevel(), "1,0,0,5");
+
+ // File 5 overlaps with file 2 (L3 / base level)
+ ASSERT_OK(GenerateAndAddExternalFile(options, {2, 10}, -1, false, false, true,
+ false, false, &true_data));
+ ASSERT_EQ(FilesPerLevel(), "2,0,0,5");
+
+ // File 6 overlaps with file 2 (L3 / base level) and file 5 (L0)
+ ASSERT_OK(GenerateAndAddExternalFile(options, {3, 9}, -1, false, false, true,
+ false, false, &true_data));
+ ASSERT_EQ(FilesPerLevel(), "3,0,0,5");
+
+ // Verify data in files
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+
+ // Write range [5 => 10] to L0
+ for (int i = 5; i <= 10; i++) {
+ std::string k = Key(i);
+ std::string v = k + "put";
+ ASSERT_OK(Put(k, v));
+ true_data[k] = v;
+ }
+ ASSERT_OK(Flush());
+ ASSERT_EQ(FilesPerLevel(), "4,0,0,5");
+
+ // File 7 overlaps with file 4 (L3)
+ ASSERT_OK(GenerateAndAddExternalFile(options, {650, 651, 652}, -1, false,
+ false, true, false, false, &true_data));
+ ASSERT_EQ(FilesPerLevel(), "5,0,0,5");
+
+ VerifyDBFromMap(true_data, &kcnt, false);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, AddExternalSstFileWithCustomCompartor) {
+ Options options = CurrentOptions();
+ options.comparator = ReverseBytewiseComparator();
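+ // With the reverse comparator, a file's smallest key is the numerically
+ // largest key it contains.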
+ DestroyAndReopen(options);
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // Generate files with these key ranges
+ // {14 -> 0}
+ // {24 -> 10}
+ // {34 -> 20}
+ // {44 -> 30}
+ // ..
+ std::vector<std::string> generated_files;
+ for (int i = 0; i < 10; i++) {
+ std::string file_name = sst_files_dir_ + env_->GenerateUniqueId();
+ ASSERT_OK(sst_file_writer.Open(file_name));
+
+ int range_end = i * 10;
+ int range_start = range_end + 15;
+ for (int k = (range_start - 1); k >= range_end; k--) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k)));
+ }
+ ExternalSstFileInfo file_info;
+ ASSERT_OK(sst_file_writer.Finish(&file_info));
+ generated_files.push_back(file_name);
+ }
+
+ std::vector<std::string> in_files;
+
+ // The 2nd and 3rd files in this list overlap with each other
+ in_files = {generated_files[0], generated_files[4], generated_files[5],
+ generated_files[7]};
+ ASSERT_NOK(DeprecatedAddFile(in_files));
+
+ // These 2 files don't overlap with each other
+ in_files = {generated_files[0], generated_files[2]};
+ ASSERT_OK(DeprecatedAddFile(in_files));
+
+ // These 2 files don't overlap with each other but overlap with keys in the DB
+ in_files = {generated_files[3], generated_files[7]};
+ ASSERT_NOK(DeprecatedAddFile(in_files));
+
+ // These files don't overlap with each other or with the DB key range
+ in_files = {generated_files[4], generated_files[6], generated_files[8]};
+ ASSERT_OK(DeprecatedAddFile(in_files));
+
+ for (int i = 0; i < 100; i++) {
+ if (i % 20 <= 14) {
+ ASSERT_EQ(Get(Key(i)), Key(i));
+ } else {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ }
+}
+
+TEST_F(ExternalSSTFileTest, AddFileTrivialMoveBug) {
+ Options options = CurrentOptions();
+ options.num_levels = 3;
+ options.IncreaseParallelism(20);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(GenerateAndAddExternalFile(options, {1, 4}, 1)); // L3
+ ASSERT_OK(GenerateAndAddExternalFile(options, {2, 3}, 2)); // L2
+
+ ASSERT_OK(GenerateAndAddExternalFile(options, {10, 14}, 3)); // L3
+ ASSERT_OK(GenerateAndAddExternalFile(options, {12, 13}, 4)); // L2
+
+ ASSERT_OK(GenerateAndAddExternalFile(options, {20, 24}, 5)); // L3
+ ASSERT_OK(GenerateAndAddExternalFile(options, {22, 23}, 6)); // L2
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():Start", [&](void* /*arg*/) {
+ // The file fits in L3 but overlaps with the running compaction, so it
+ // will be added to L2; a trivial-move compaction could then move it to
+ // L3 and break LSM consistency
+ static std::atomic<bool> called = {false};
+ if (!called) {
+ called = true;
+ ASSERT_OK(dbfull()->SetOptions({{"max_bytes_for_level_base", "1"}}));
+ ASSERT_OK(GenerateAndAddExternalFile(options, {15, 16}, 7));
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = false;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ dbfull()->TEST_WaitForCompact();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, CompactAddedFiles) {
+ Options options = CurrentOptions();
+ options.num_levels = 3;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(GenerateAndAddExternalFile(options, {1, 10}, 1)); // L3
+ ASSERT_OK(GenerateAndAddExternalFile(options, {2, 9}, 2)); // L2
+ ASSERT_OK(GenerateAndAddExternalFile(options, {3, 8}, 3)); // L1
+ ASSERT_OK(GenerateAndAddExternalFile(options, {4, 7}, 4)); // L0
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+}
+
+TEST_F(ExternalSSTFileTest, SstFileWriterNonSharedKeys) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ std::string file_path = sst_files_dir_ + "/not_shared";
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ std::string suffix(100, 'X');
+ ASSERT_OK(sst_file_writer.Open(file_path));
+ ASSERT_OK(sst_file_writer.Put("A" + suffix, "VAL"));
+ ASSERT_OK(sst_file_writer.Put("BB" + suffix, "VAL"));
+ ASSERT_OK(sst_file_writer.Put("CC" + suffix, "VAL"));
+ ASSERT_OK(sst_file_writer.Put("CXD" + suffix, "VAL"));
+ ASSERT_OK(sst_file_writer.Put("CZZZ" + suffix, "VAL"));
+ ASSERT_OK(sst_file_writer.Put("ZAAAX" + suffix, "VAL"));
+
+ ASSERT_OK(sst_file_writer.Finish());
+ ASSERT_OK(DeprecatedAddFile({file_path}));
+}
+
+TEST_F(ExternalSSTFileTest, WithUnorderedWrite) {
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL",
+ "ExternalSSTFileTest::WithUnorderedWrite:WaitWriteWAL"},
+ {"DBImpl::WaitForPendingWrites:BeforeBlock",
+ "DBImpl::WriteImpl:BeforeUnorderedWriteMemtable"}});
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::IngestExternalFile:NeedFlush", [&](void* need_flush) {
+ ASSERT_TRUE(*reinterpret_cast<bool*>(need_flush));
+ });
+
+ Options options = CurrentOptions();
+ options.unordered_write = true;
+ DestroyAndReopen(options);
+ Put("foo", "v1");
+ SyncPoint::GetInstance()->EnableProcessing();
+ port::Thread writer([&]() { Put("bar", "v2"); });
+
+ TEST_SYNC_POINT("ExternalSSTFileTest::WithUnorderedWrite:WaitWriteWAL");
+ ASSERT_OK(GenerateAndAddExternalFile(options, {{"bar", "v3"}}, -1,
+ true /* allow_global_seqno */));
+ ASSERT_EQ(Get("bar"), "v3");
+
+ writer.join();
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoRandomized) {
+ Options options = CurrentOptions();
+ options.IncreaseParallelism(20);
+ options.level0_slowdown_writes_trigger = 256;
+ options.level0_stop_writes_trigger = 256;
+
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
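+ // Iteration 0 mixes memtable writes with ingestion; iteration 1 ingests
+ // everything through external files.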
+ for (int iter = 0; iter < 2; iter++) {
+ bool write_to_memtable = (iter == 0);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ std::map<std::string, std::string> true_data;
+ for (int i = 0; i < 500; i++) {
+ std::vector<std::pair<std::string, std::string>> random_data;
+ for (int j = 0; j < 100; j++) {
+ std::string k;
+ std::string v;
+ test::RandomString(&rnd, rnd.Next() % 20, &k);
+ test::RandomString(&rnd, rnd.Next() % 50, &v);
+ random_data.emplace_back(k, v);
+ }
+
+ if (write_to_memtable && rnd.OneIn(4)) {
+ // 25% of writes go through memtable
+ for (auto& entry : random_data) {
+ ASSERT_OK(Put(entry.first, entry.second));
+ true_data[entry.first] = entry.second;
+ }
+ } else {
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, random_data, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, true, &true_data));
+ }
+ }
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ VerifyDBFromMap(true_data, &kcnt, false);
+ }
+}
+
+TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoAssignedLevel) {
+ Options options = CurrentOptions();
+ options.num_levels = 5;
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+ std::vector<std::pair<std::string, std::string>> file_data;
+ std::map<std::string, std::string> true_data;
+
+ // Insert 100 -> 200 into the memtable
+ for (int i = 100; i <= 200; i++) {
+ ASSERT_OK(Put(Key(i), "memtable"));
+ true_data[Key(i)] = "memtable";
+ }
+
+ // Insert 0 -> 20 using AddFile
+ file_data.clear();
+ for (int i = 0; i <= 20; i++) {
+ file_data.emplace_back(Key(i), "L4");
+ }
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, file_data, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+
+ // This file doesn't overlap with anything in the DB, so it will go to L4
+ ASSERT_EQ("0,0,0,0,1", FilesPerLevel());
+
+ // Insert 80 -> 130 using AddFile
+ file_data.clear();
+ for (int i = 80; i <= 130; i++) {
+ file_data.emplace_back(Key(i), "L0");
+ }
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, file_data, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+
+ // This file overlaps with the memtable, so ingestion will flush the
+ // memtable and add the file itself to L0
+ ASSERT_EQ("2,0,0,0,1", FilesPerLevel());
+
+ // Insert 30 -> 50 using AddFile
+ file_data.clear();
+ for (int i = 30; i <= 50; i++) {
+ file_data.emplace_back(Key(i), "L4");
+ }
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, file_data, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+
+ // This file doesn't overlap with anything in the DB and also fits in L4
+ ASSERT_EQ("2,0,0,0,2", FilesPerLevel());
+
+ // Insert 10 -> 40 using AddFile
+ file_data.clear();
+ for (int i = 10; i <= 40; i++) {
+ file_data.emplace_back(Key(i), "L3");
+ }
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, file_data, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+
+ // This file overlaps with files in L4, so we will ingest it into L3
+ ASSERT_EQ("2,0,0,1,2", FilesPerLevel());
+
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+}
+
+TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoMemtableFlush) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ uint64_t entries_in_memtable;
+ std::map<std::string, std::string> true_data;
+
+ for (int k : {10, 20, 40, 80}) {
+ ASSERT_OK(Put(Key(k), "memtable"));
+ true_data[Key(k)] = "memtable";
+ }
+ db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &entries_in_memtable);
+ ASSERT_GE(entries_in_memtable, 1);
+
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ // No need for flush
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {90, 100, 110}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+ db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &entries_in_memtable);
+ ASSERT_GE(entries_in_memtable, 1);
+
+ // This file will flush the memtable
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {19, 20, 21}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+ db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &entries_in_memtable);
+ ASSERT_EQ(entries_in_memtable, 0);
+
+ for (int k : {200, 201, 205, 206}) {
+ ASSERT_OK(Put(Key(k), "memtable"));
+ true_data[Key(k)] = "memtable";
+ }
+ db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &entries_in_memtable);
+ ASSERT_GE(entries_in_memtable, 1);
+
+ // No need for flush, this file's keys fit between the memtable keys
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {202, 203, 204}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+ db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &entries_in_memtable);
+ ASSERT_GE(entries_in_memtable, 1);
+
+ // This file will flush the memtable
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {206, 207}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+ db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &entries_in_memtable);
+ ASSERT_EQ(entries_in_memtable, 0);
+
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+}
+
+TEST_P(ExternalSSTFileTest, L0SortingIssue) {
+ Options options = CurrentOptions();
+ options.num_levels = 2;
+ DestroyAndReopen(options);
+ std::map<std::string, std::string> true_data;
+
+ ASSERT_OK(Put(Key(1), "memtable"));
+ ASSERT_OK(Put(Key(10), "memtable"));
+
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ // No Flush needed, No global seqno needed, Ingest in L1
+ ASSERT_OK(
+ GenerateAndAddExternalFile(options, {7, 8}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false));
+ // No Flush needed, but need a global seqno, Ingest in L0
+ ASSERT_OK(
+ GenerateAndAddExternalFile(options, {7, 8}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false));
+ printf("%s\n", FilesPerLevel().c_str());
+
+ // Overwrite what we added using external files
+ ASSERT_OK(Put(Key(7), "memtable"));
+ ASSERT_OK(Put(Key(8), "memtable"));
+
+ // Read values from memtable
+ ASSERT_EQ(Get(Key(7)), "memtable");
+ ASSERT_EQ(Get(Key(8)), "memtable");
+
+ // Flush and read from L0
+ ASSERT_OK(Flush());
+ printf("%s\n", FilesPerLevel().c_str());
+ ASSERT_EQ(Get(Key(7)), "memtable");
+ ASSERT_EQ(Get(Key(8)), "memtable");
+}
+
+TEST_F(ExternalSSTFileTest, CompactionDeadlock) {
+ Options options = CurrentOptions();
+ options.num_levels = 2;
+ options.level0_file_num_compaction_trigger = 4;
+ options.level0_slowdown_writes_trigger = 4;
+ options.level0_stop_writes_trigger = 4;
+ DestroyAndReopen(options);
+
+ // Atomic counter of currently running bg threads
+ std::atomic<int> running_threads(0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::DelayWrite:Wait", "ExternalSSTFileTest::DeadLock:0"},
+ {"ExternalSSTFileTest::DeadLock:1", "DBImpl::AddFile:Start"},
+ {"DBImpl::AddFile:MutexLock", "ExternalSSTFileTest::DeadLock:2"},
+ {"ExternalSSTFileTest::DeadLock:3", "BackgroundCallCompaction:0"},
+ });
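+ // Enforced order: the blocked Put reaches DelayWrite:Wait first, then
+ // AddFile starts and acquires the mutex, and only then is the compaction
+ // allowed to run.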
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Start ingesting an external file in the background
+ ROCKSDB_NAMESPACE::port::Thread bg_ingest_file([&]() {
+ running_threads += 1;
+ ASSERT_OK(GenerateAndAddExternalFile(options, {5, 6}));
+ running_threads -= 1;
+ });
+
+ ASSERT_OK(Put(Key(1), "memtable"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(2), "memtable"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(3), "memtable"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(4), "memtable"));
+ ASSERT_OK(Flush());
+
+ // This thread will try to insert into the memtable, but since we have 4 L0
+ // files it will be blocked, holding the writer thread
+ ROCKSDB_NAMESPACE::port::Thread bg_block_put([&]() {
+ running_threads += 1;
+ ASSERT_OK(Put(Key(10), "memtable"));
+ running_threads -= 1;
+ });
+
+ // Make sure DelayWrite is called first
+ TEST_SYNC_POINT("ExternalSSTFileTest::DeadLock:0");
+
+ // `DBImpl::AddFile:Start` will wait until we reach here
+ TEST_SYNC_POINT("ExternalSSTFileTest::DeadLock:1");
+
+ // Wait for IngestExternalFile() to start and acquire the mutex
+ TEST_SYNC_POINT("ExternalSSTFileTest::DeadLock:2");
+
+ // Now let compaction start
+ TEST_SYNC_POINT("ExternalSSTFileTest::DeadLock:3");
+
+ // Wait for at most 5 seconds; if the bg threads did not all finish by
+ // then, we hit the deadlock bug
+ for (int i = 0; i < 10; i++) {
+ if (running_threads.load() == 0) {
+ break;
+ }
+ env_->SleepForMicroseconds(500000);
+ }
+
+ ASSERT_EQ(running_threads.load(), 0);
+
+ bg_ingest_file.join();
+ bg_block_put.join();
+}
+
+TEST_F(ExternalSSTFileTest, DirtyExit) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ std::string file_path = sst_files_dir_ + "/dirty_exit";
+ std::unique_ptr<SstFileWriter> sst_file_writer;
+
+ // Destruct SstFileWriter without calling Finish()
+ sst_file_writer.reset(new SstFileWriter(EnvOptions(), options));
+ ASSERT_OK(sst_file_writer->Open(file_path));
+ sst_file_writer.reset();
+
+ // Destruct SstFileWriter with a failing Finish
+ sst_file_writer.reset(new SstFileWriter(EnvOptions(), options));
+ ASSERT_OK(sst_file_writer->Open(file_path));
+ ASSERT_NOK(sst_file_writer->Finish());
+}
+
+TEST_F(ExternalSSTFileTest, FileWithCFInfo) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"koko", "toto"}, options);
+
+ SstFileWriter sfw_default(EnvOptions(), options, handles_[0]);
+ SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]);
+ SstFileWriter sfw_cf2(EnvOptions(), options, handles_[2]);
+ SstFileWriter sfw_unknown(EnvOptions(), options);
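+ // sfw_unknown is not tied to a column family, so files it writes can be
+ // ingested into any CF.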
+
+ // default_cf.sst
+ const std::string cf_default_sst = sst_files_dir_ + "/default_cf.sst";
+ ASSERT_OK(sfw_default.Open(cf_default_sst));
+ ASSERT_OK(sfw_default.Put("K1", "V1"));
+ ASSERT_OK(sfw_default.Put("K2", "V2"));
+ ASSERT_OK(sfw_default.Finish());
+
+ // cf1.sst
+ const std::string cf1_sst = sst_files_dir_ + "/cf1.sst";
+ ASSERT_OK(sfw_cf1.Open(cf1_sst));
+ ASSERT_OK(sfw_cf1.Put("K3", "V1"));
+ ASSERT_OK(sfw_cf1.Put("K4", "V2"));
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // cf_unknown.sst
+ const std::string unknown_sst = sst_files_dir_ + "/cf_unknown.sst";
+ ASSERT_OK(sfw_unknown.Open(unknown_sst));
+ ASSERT_OK(sfw_unknown.Put("K5", "V1"));
+ ASSERT_OK(sfw_unknown.Put("K6", "V2"));
+ ASSERT_OK(sfw_unknown.Finish());
+
+ IngestExternalFileOptions ifo;
+
+ // SST CF doesn't match
+ ASSERT_NOK(db_->IngestExternalFile(handles_[0], {cf1_sst}, ifo));
+ // SST CF doesn't match
+ ASSERT_NOK(db_->IngestExternalFile(handles_[2], {cf1_sst}, ifo));
+ // SST CF match
+ ASSERT_OK(db_->IngestExternalFile(handles_[1], {cf1_sst}, ifo));
+
+ // SST CF doesn't match
+ ASSERT_NOK(db_->IngestExternalFile(handles_[1], {cf_default_sst}, ifo));
+ // SST CF doesn't match
+ ASSERT_NOK(db_->IngestExternalFile(handles_[2], {cf_default_sst}, ifo));
+ // SST CF match
+ ASSERT_OK(db_->IngestExternalFile(handles_[0], {cf_default_sst}, ifo));
+
+ // SST CF unknown
+ ASSERT_OK(db_->IngestExternalFile(handles_[1], {unknown_sst}, ifo));
+ // SST CF unknown
+ ASSERT_OK(db_->IngestExternalFile(handles_[2], {unknown_sst}, ifo));
+ // SST CF unknown
+ ASSERT_OK(db_->IngestExternalFile(handles_[0], {unknown_sst}, ifo));
+
+ // Cannot ingest a file into a dropped CF
+ ASSERT_OK(db_->DropColumnFamily(handles_[1]));
+ ASSERT_NOK(db_->IngestExternalFile(handles_[1], {unknown_sst}, ifo));
+
+ // This CF was not dropped, so it is OK to ingest
+ ASSERT_OK(db_->IngestExternalFile(handles_[2], {unknown_sst}, ifo));
+}
+
+/*
+ * Test and verify the functionality of ingestion_options.move_files and
+ * ingestion_options.failed_move_fall_back_to_copy
+ */
+TEST_P(ExternSSTFileLinkFailFallbackTest, LinkFailFallBackExternalSst) {
+ const bool fail_link = std::get<0>(GetParam());
+ const bool failed_move_fall_back_to_copy = std::get<1>(GetParam());
+ test_env_->set_fail_link(fail_link);
+ const EnvOptions env_options;
+ DestroyAndReopen(options_);
+ const int kNumKeys = 10000;
+ IngestExternalFileOptions ifo;
+ ifo.move_files = true;
+ ifo.failed_move_fall_back_to_copy = failed_move_fall_back_to_copy;
+
+ std::string file_path = sst_files_dir_ + "file1.sst";
+ // Create SstFileWriter for default column family
+ SstFileWriter sst_file_writer(env_options, options_);
+ ASSERT_OK(sst_file_writer.Open(file_path));
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(sst_file_writer.Put(Key(i), Key(i) + "_value"));
+ }
+ ASSERT_OK(sst_file_writer.Finish());
+ uint64_t file_size = 0;
+ ASSERT_OK(env_->GetFileSize(file_path, &file_size));
+
+ bool copyfile = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ExternalSstFileIngestionJob::Prepare:CopyFile",
+ [&](void* /* arg */) { copyfile = true; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ const Status s = db_->IngestExternalFile({file_path}, ifo);
+
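+ // Inspect the compaction stats to tell whether the file was moved (linked)
+ // or physically copied during ingestion.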
+ ColumnFamilyHandleImpl* cfh =
+ static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
+ ColumnFamilyData* cfd = cfh->cfd();
+ const InternalStats* internal_stats_ptr = cfd->internal_stats();
+ const std::vector<InternalStats::CompactionStats>& comp_stats =
+ internal_stats_ptr->TEST_GetCompactionStats();
+ uint64_t bytes_copied = 0;
+ uint64_t bytes_moved = 0;
+ for (const auto& stats : comp_stats) {
+ bytes_copied += stats.bytes_written;
+ bytes_moved += stats.bytes_moved;
+ }
+
+ if (!fail_link) {
+ // Link operation succeeds. External SST should be moved.
+ ASSERT_OK(s);
+ ASSERT_EQ(0, bytes_copied);
+ ASSERT_EQ(file_size, bytes_moved);
+ ASSERT_FALSE(copyfile);
+ } else {
+ // Link operation fails.
+ ASSERT_EQ(0, bytes_moved);
+ if (failed_move_fall_back_to_copy) {
+ ASSERT_OK(s);
+ // Copy file is true since a failed link falls back to copy file.
+ ASSERT_TRUE(copyfile);
+ ASSERT_EQ(file_size, bytes_copied);
+ } else {
+ ASSERT_TRUE(s.IsNotSupported());
+ // Copy file is false since a failed link does not fall back to copy file.
+ ASSERT_FALSE(copyfile);
+ ASSERT_EQ(0, bytes_copied);
+ }
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+class TestIngestExternalFileListener : public EventListener {
+ public:
+ void OnExternalFileIngested(DB* /*db*/,
+ const ExternalFileIngestionInfo& info) override {
+ ingested_files.push_back(info);
+ }
+
+ std::vector<ExternalFileIngestionInfo> ingested_files;
+};
+
+TEST_P(ExternalSSTFileTest, IngestionListener) {
+ Options options = CurrentOptions();
+ TestIngestExternalFileListener* listener =
+ new TestIngestExternalFileListener();
+ options.listeners.emplace_back(listener);
+ CreateAndReopenWithCF({"koko", "toto"}, options);
+
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ // Ingest into default cf
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 2}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, true, nullptr, handles_[0]));
+ ASSERT_EQ(listener->ingested_files.size(), 1);
+ ASSERT_EQ(listener->ingested_files.back().cf_name, "default");
+ ASSERT_EQ(listener->ingested_files.back().global_seqno, 0);
+ ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_id,
+ 0);
+ ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_name,
+ "default");
+
+ // Ingest into cf1
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 2}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, true, nullptr, handles_[1]));
+ ASSERT_EQ(listener->ingested_files.size(), 2);
+ ASSERT_EQ(listener->ingested_files.back().cf_name, "koko");
+ ASSERT_EQ(listener->ingested_files.back().global_seqno, 0);
+ ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_id,
+ 1);
+ ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_name,
+ "koko");
+
+ // Ingest into cf2
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 2}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, true, nullptr, handles_[2]));
+ ASSERT_EQ(listener->ingested_files.size(), 3);
+ ASSERT_EQ(listener->ingested_files.back().cf_name, "toto");
+ ASSERT_EQ(listener->ingested_files.back().global_seqno, 0);
+ ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_id,
+ 2);
+ ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_name,
+ "toto");
+}
+
+TEST_F(ExternalSSTFileTest, SnapshotInconsistencyBug) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ const int kNumKeys = 10000;
+
+ // Insert keys using normal path and take a snapshot
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(i), Key(i) + "_V1"));
+ }
+ const Snapshot* snap = db_->GetSnapshot();
+
+ // Overwrite all keys using IngestExternalFile
+ std::string sst_file_path = sst_files_dir_ + "file1.sst";
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+ ASSERT_OK(sst_file_writer.Open(sst_file_path));
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(sst_file_writer.Put(Key(i), Key(i) + "_V2"));
+ }
+ ASSERT_OK(sst_file_writer.Finish());
+
+ IngestExternalFileOptions ifo;
+ ifo.move_files = true;
+ ASSERT_OK(db_->IngestExternalFile({sst_file_path}, ifo));
+
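+ // The snapshot must keep seeing the pre-ingestion values while normal reads
+ // see the ingested ones.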
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_EQ(Get(Key(i), snap), Key(i) + "_V1");
+ ASSERT_EQ(Get(Key(i)), Key(i) + "_V2");
+ }
+
+ db_->ReleaseSnapshot(snap);
+}
+
+TEST_P(ExternalSSTFileTest, IngestBehind) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 3;
+ options.disable_auto_compactions = false;
+ DestroyAndReopen(options);
+ std::vector<std::pair<std::string, std::string>> file_data;
+ std::map<std::string, std::string> true_data;
+
+ // Insert 100 -> 200 into the memtable
+ for (int i = 100; i <= 200; i++) {
+ ASSERT_OK(Put(Key(i), "memtable"));
+ true_data[Key(i)] = "memtable";
+ }
+
+ // Insert 0 -> 20 using IngestExternalFile
+ file_data.clear();
+ for (int i = 0; i <= 20; i++) {
+ file_data.emplace_back(Key(i), "ingest_behind");
+ }
+
+ bool allow_global_seqno = true;
+ bool ingest_behind = true;
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+
+ // Can't ingest behind since allow_ingest_behind isn't set to true
+ ASSERT_NOK(GenerateAndAddExternalFile(
+ options, file_data, -1, allow_global_seqno, write_global_seqno,
+ verify_checksums_before_ingest, ingest_behind, false /*sort_data*/,
+ &true_data));
+
+ options.allow_ingest_behind = true;
+ // Check that we can still open the DB, as num_levels should be
+ // sanitized to 3
+ options.num_levels = 2;
+ DestroyAndReopen(options);
+
+ options.num_levels = 3;
+ DestroyAndReopen(options);
+ // Insert 100 -> 200 into the memtable
+ for (int i = 100; i <= 200; i++) {
+ ASSERT_OK(Put(Key(i), "memtable"));
+ true_data[Key(i)] = "memtable";
+ }
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ // The universal picker should place the data at the second level from
+ // the bottom (the bottom level is reserved for ingest-behind)
+ ASSERT_EQ("0,1", FilesPerLevel());
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, file_data, -1, allow_global_seqno, write_global_seqno,
+ verify_checksums_before_ingest, true /*ingest_behind*/,
+ false /*sort_data*/, &true_data));
+ ASSERT_EQ("0,1,1", FilesPerLevel());
+ // This time ingestion should fail: the file doesn't fit in the bottom level
+ ASSERT_NOK(GenerateAndAddExternalFile(
+ options, file_data, -1, allow_global_seqno, write_global_seqno,
+ verify_checksums_before_ingest, true /*ingest_behind*/,
+ false /*sort_data*/, &true_data));
+ ASSERT_EQ("0,1,1", FilesPerLevel());
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ // The bottom level should now be empty
+ ASSERT_EQ("0,1", FilesPerLevel());
+
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+}
+
+TEST_F(ExternalSSTFileTest, SkipBloomFilter) {
+ Options options = CurrentOptions();
+
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+ table_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
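+ // With cache_index_and_filter_blocks enabled, adding a filter block to the
+ // block cache is visible through the BLOCK_CACHE_FILTER_ADD ticker.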
+
+ // Create external SST file and include bloom filters
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+ {
+ std::string file_path = sst_files_dir_ + "sst_with_bloom.sst";
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+ ASSERT_OK(sst_file_writer.Open(file_path));
+ ASSERT_OK(sst_file_writer.Put("Key1", "Value1"));
+ ASSERT_OK(sst_file_writer.Finish());
+
+ ASSERT_OK(
+ db_->IngestExternalFile({file_path}, IngestExternalFileOptions()));
+
+ ASSERT_EQ(Get("Key1"), "Value1");
+ ASSERT_GE(
+ options.statistics->getTickerCount(Tickers::BLOCK_CACHE_FILTER_ADD), 1);
+ }
+
+ // Create external SST file but skip bloom filters
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+ {
+ std::string file_path = sst_files_dir_ + "sst_with_no_bloom.sst";
+ SstFileWriter sst_file_writer(EnvOptions(), options, nullptr, true,
+ Env::IOPriority::IO_TOTAL,
+ true /* skip_filters */);
+ ASSERT_OK(sst_file_writer.Open(file_path));
+ ASSERT_OK(sst_file_writer.Put("Key1", "Value1"));
+ ASSERT_OK(sst_file_writer.Finish());
+
+ ASSERT_OK(
+ db_->IngestExternalFile({file_path}, IngestExternalFileOptions()));
+
+ ASSERT_EQ(Get("Key1"), "Value1");
+ ASSERT_EQ(
+ options.statistics->getTickerCount(Tickers::BLOCK_CACHE_FILTER_ADD), 0);
+ }
+}
+
+TEST_F(ExternalSSTFileTest, IngestFileWrittenWithCompressionDictionary) {
+ if (!ZSTD_Supported()) {
+ return;
+ }
+ const int kNumEntries = 1 << 10;
+ const int kNumBytesPerEntry = 1 << 10;
+ Options options = CurrentOptions();
+ options.compression = kZSTD;
+ options.compression_opts.max_dict_bytes = 1 << 14; // 16KB
+ options.compression_opts.zstd_max_train_bytes = 1 << 18; // 256KB
+ DestroyAndReopen(options);
+
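+ // The sync point below counts compression dictionary blocks written;
+ // SstFileWriter is expected to train and write exactly one.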
+ std::atomic<int> num_compression_dicts(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
+ [&](void* /* arg */) { ++num_compression_dicts; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ std::vector<std::pair<std::string, std::string>> random_data;
+ for (int i = 0; i < kNumEntries; i++) {
+ std::string val;
+ test::RandomString(&rnd, kNumBytesPerEntry, &val);
+ random_data.emplace_back(Key(i), std::move(val));
+ }
+ ASSERT_OK(GenerateAndAddExternalFile(options, std::move(random_data)));
+ ASSERT_EQ(1, num_compression_dicts);
+}
+
+TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_Success) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = CurrentOptions();
+ options.env = fault_injection_env.get();
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ std::vector<ColumnFamilyHandle*> column_families;
+ column_families.push_back(handles_[0]);
+ column_families.push_back(handles_[1]);
+ column_families.push_back(handles_[2]);
+ std::vector<IngestExternalFileOptions> ifos(column_families.size());
+ for (auto& ifo : ifos) {
+ ifo.allow_global_seqno = true; // Always allow global_seqno
+ // May or may not write global_seqno
+ ifo.write_global_seqno = std::get<0>(GetParam());
+ // Whether to verify checksums before ingestion
+ ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+ }
+ std::vector<std::vector<std::pair<std::string, std::string>>> data;
+ data.push_back(
+ {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")});
+ data.push_back(
+ {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")});
+ data.push_back(
+ {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")});
+
+ // Resize the true_data vector upon construction to avoid re-alloc
+ std::vector<std::map<std::string, std::string>> true_data(
+ column_families.size());
+ Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data,
+ -1, true, true_data);
+ ASSERT_OK(s);
+ Close();
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
+ options);
+ ASSERT_EQ(3, handles_.size());
+ int cf = 0;
+ for (const auto& verify_map : true_data) {
+ for (const auto& elem : verify_map) {
+ const std::string& key = elem.first;
+ const std::string& value = elem.second;
+ ASSERT_EQ(value, Get(cf, key));
+ }
+ ++cf;
+ }
+ Close();
+ Destroy(options, true /* delete_cf_paths */);
+}
+
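+// While an atomic ingestion into multiple column families is in flight, a
+// reader that took a snapshot beforehand must see only the pre-ingestion
+// data; once the ingestion finishes, all column families must expose the
+// ingested data consistently.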
+TEST_P(ExternalSSTFileTest,
+ IngestFilesIntoMultipleColumnFamilies_NoMixedStateWithSnapshot) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::IngestExternalFiles:InstallSVForFirstCF:0",
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:"
+ "BeforeRead"},
+ {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:"
+ "AfterRead",
+ "DBImpl::IngestExternalFiles:InstallSVForFirstCF:1"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.env = fault_injection_env.get();
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ const std::vector<std::map<std::string, std::string>> data_before_ingestion =
+ {{{"foo1", "fv1_0"}, {"foo2", "fv2_0"}, {"foo3", "fv3_0"}},
+ {{"bar1", "bv1_0"}, {"bar2", "bv2_0"}, {"bar3", "bv3_0"}},
+ {{"bar4", "bv4_0"}, {"bar5", "bv5_0"}, {"bar6", "bv6_0"}}};
+ for (size_t i = 0; i != handles_.size(); ++i) {
+ int cf = static_cast<int>(i);
+ const auto& orig_data = data_before_ingestion[i];
+ for (const auto& kv : orig_data) {
+ ASSERT_OK(Put(cf, kv.first, kv.second));
+ }
+ ASSERT_OK(Flush(cf));
+ }
+
+ std::vector<ColumnFamilyHandle*> column_families;
+ column_families.push_back(handles_[0]);
+ column_families.push_back(handles_[1]);
+ column_families.push_back(handles_[2]);
+ std::vector<IngestExternalFileOptions> ifos(column_families.size());
+ for (auto& ifo : ifos) {
+ ifo.allow_global_seqno = true; // Always allow global_seqno
+ // May or may not write global_seqno
+ ifo.write_global_seqno = std::get<0>(GetParam());
+ // Whether to verify checksums before ingestion
+ ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+ }
+ std::vector<std::vector<std::pair<std::string, std::string>>> data;
+ data.push_back(
+ {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")});
+ data.push_back(
+ {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")});
+ data.push_back(
+ {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")});
+ // Resize the true_data vector upon construction to avoid re-alloc
+ std::vector<std::map<std::string, std::string>> true_data(
+ column_families.size());
+ // Take snapshot before ingestion starts
+ ReadOptions read_opts;
+ read_opts.total_order_seek = true;
+ read_opts.snapshot = dbfull()->GetSnapshot();
+ std::vector<Iterator*> iters(handles_.size());
+
+ // Range scan checks first kv of each CF before ingestion starts.
+ for (size_t i = 0; i != handles_.size(); ++i) {
+ iters[i] = dbfull()->NewIterator(read_opts, handles_[i]);
+ iters[i]->SeekToFirst();
+ ASSERT_TRUE(iters[i]->Valid());
+ const std::string& key = iters[i]->key().ToString();
+ const std::string& value = iters[i]->value().ToString();
+ const std::map<std::string, std::string>& orig_data =
+ data_before_ingestion[i];
+ std::map<std::string, std::string>::const_iterator it = orig_data.find(key);
+ ASSERT_NE(orig_data.end(), it);
+ ASSERT_EQ(it->second, value);
+ iters[i]->Next();
+ }
+ port::Thread ingest_thread([&]() {
+ ASSERT_OK(GenerateAndAddExternalFiles(options, column_families, ifos, data,
+ -1, true, true_data));
+ });
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:"
+ "BeforeRead");
+ // Should see only data before ingestion
+ for (size_t i = 0; i != handles_.size(); ++i) {
+ const auto& orig_data = data_before_ingestion[i];
+ for (; iters[i]->Valid(); iters[i]->Next()) {
+ const std::string& key = iters[i]->key().ToString();
+ const std::string& value = iters[i]->value().ToString();
+ std::map<std::string, std::string>::const_iterator it =
+ orig_data.find(key);
+ ASSERT_NE(orig_data.end(), it);
+ ASSERT_EQ(it->second, value);
+ }
+ }
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:"
+ "AfterRead");
+ ingest_thread.join();
+ for (auto* iter : iters) {
+ delete iter;
+ }
+ iters.clear();
+ dbfull()->ReleaseSnapshot(read_opts.snapshot);
+
+ Close();
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
+ options);
+ // Should see consistent state after ingestion for all column families even
+ // without snapshot.
+ ASSERT_EQ(3, handles_.size());
+ int cf = 0;
+ for (const auto& verify_map : true_data) {
+ for (const auto& elem : verify_map) {
+ const std::string& key = elem.first;
+ const std::string& value = elem.second;
+ ASSERT_EQ(value, Get(cf, key));
+ }
+ ++cf;
+ }
+ Close();
+ Destroy(options, true /* delete_cf_paths */);
+}
+
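+// The next three tests make the filesystem fail at different points during a
+// multi-column-family ingestion (see the test names) and verify that, after
+// reopening, none of the ingested keys are visible in any column family.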
+TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_PrepareFail) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = CurrentOptions();
+ options.env = fault_injection_env.get();
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::IngestExternalFiles:BeforeLastJobPrepare:0",
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_PrepareFail:"
+ "0"},
+ {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies:PrepareFail:"
+ "1",
+ "DBImpl::IngestExternalFiles:BeforeLastJobPrepare:1"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ std::vector<ColumnFamilyHandle*> column_families;
+ column_families.push_back(handles_[0]);
+ column_families.push_back(handles_[1]);
+ column_families.push_back(handles_[2]);
+ std::vector<IngestExternalFileOptions> ifos(column_families.size());
+ for (auto& ifo : ifos) {
+ ifo.allow_global_seqno = true; // Always allow global_seqno
+ // May or may not write global_seqno
+ ifo.write_global_seqno = std::get<0>(GetParam());
+ // Whether to verify block checksums before ingest
+ ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+ }
+ std::vector<std::vector<std::pair<std::string, std::string>>> data;
+ data.push_back(
+ {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")});
+ data.push_back(
+ {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")});
+ data.push_back(
+ {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")});
+
+ // Resize the true_data vector upon construction to avoid re-alloc
+ std::vector<std::map<std::string, std::string>> true_data(
+ column_families.size());
+ port::Thread ingest_thread([&]() {
+ Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data,
+ -1, true, true_data);
+ ASSERT_NOK(s);
+ });
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_PrepareFail:"
+ "0");
+ fault_injection_env->SetFilesystemActive(false);
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies:PrepareFail:"
+ "1");
+ ingest_thread.join();
+
+ fault_injection_env->SetFilesystemActive(true);
+ Close();
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
+ options);
+ ASSERT_EQ(3, handles_.size());
+ int cf = 0;
+ for (const auto& verify_map : true_data) {
+ for (const auto& elem : verify_map) {
+ const std::string& key = elem.first;
+ ASSERT_EQ("NOT_FOUND", Get(cf, key));
+ }
+ ++cf;
+ }
+ Close();
+ Destroy(options, true /* delete_cf_paths */);
+}
+
+TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_CommitFail) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = CurrentOptions();
+ options.env = fault_injection_env.get();
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::IngestExternalFiles:BeforeJobsRun:0",
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:"
+ "0"},
+ {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:"
+ "1",
+ "DBImpl::IngestExternalFiles:BeforeJobsRun:1"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ std::vector<ColumnFamilyHandle*> column_families;
+ column_families.push_back(handles_[0]);
+ column_families.push_back(handles_[1]);
+ column_families.push_back(handles_[2]);
+ std::vector<IngestExternalFileOptions> ifos(column_families.size());
+ for (auto& ifo : ifos) {
+ ifo.allow_global_seqno = true; // Always allow global_seqno
+ // May or may not write global_seqno
+ ifo.write_global_seqno = std::get<0>(GetParam());
+ // Whether to verify block checksums before ingestion
+ ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+ }
+ std::vector<std::vector<std::pair<std::string, std::string>>> data;
+ data.push_back(
+ {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")});
+ data.push_back(
+ {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")});
+ data.push_back(
+ {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")});
+ // Resize the true_data vector upon construction to avoid re-alloc
+ std::vector<std::map<std::string, std::string>> true_data(
+ column_families.size());
+ port::Thread ingest_thread([&]() {
+ Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data,
+ -1, true, true_data);
+ ASSERT_NOK(s);
+ });
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:"
+ "0");
+ fault_injection_env->SetFilesystemActive(false);
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:"
+ "1");
+ ingest_thread.join();
+
+ fault_injection_env->SetFilesystemActive(true);
+ Close();
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
+ options);
+ ASSERT_EQ(3, handles_.size());
+ int cf = 0;
+ for (const auto& verify_map : true_data) {
+ for (const auto& elem : verify_map) {
+ const std::string& key = elem.first;
+ ASSERT_EQ("NOT_FOUND", Get(cf, key));
+ }
+ ++cf;
+ }
+ Close();
+ Destroy(options, true /* delete_cf_paths */);
+}
+
+TEST_P(ExternalSSTFileTest,
+ IngestFilesIntoMultipleColumnFamilies_PartialManifestWriteFail) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = CurrentOptions();
+ options.env = fault_injection_env.get();
+
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+
+ SyncPoint::GetInstance()->ClearTrace();
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0",
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_"
+ "PartialManifestWriteFail:0"},
+ {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_"
+ "PartialManifestWriteFail:1",
+ "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:1"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<ColumnFamilyHandle*> column_families;
+ column_families.push_back(handles_[0]);
+ column_families.push_back(handles_[1]);
+ column_families.push_back(handles_[2]);
+ std::vector<IngestExternalFileOptions> ifos(column_families.size());
+ for (auto& ifo : ifos) {
+ ifo.allow_global_seqno = true; // Always allow global_seqno
+ // May or may not write global_seqno
+ ifo.write_global_seqno = std::get<0>(GetParam());
+ // Whether to verify block checksums before ingestion
+ ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+ }
+ std::vector<std::vector<std::pair<std::string, std::string>>> data;
+ data.push_back(
+ {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")});
+ data.push_back(
+ {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")});
+ data.push_back(
+ {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")});
+ // Resize the true_data vector upon construction to avoid re-alloc
+ std::vector<std::map<std::string, std::string>> true_data(
+ column_families.size());
+ port::Thread ingest_thread([&]() {
+ Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data,
+ -1, true, true_data);
+ ASSERT_NOK(s);
+ });
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_"
+ "PartialManifestWriteFail:0");
+ fault_injection_env->SetFilesystemActive(false);
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_"
+ "PartialManifestWriteFail:1");
+ ingest_thread.join();
+
+ fault_injection_env->DropUnsyncedFileData();
+ fault_injection_env->SetFilesystemActive(true);
+ Close();
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
+ options);
+ ASSERT_EQ(3, handles_.size());
+ int cf = 0;
+ for (const auto& verify_map : true_data) {
+ for (const auto& elem : verify_map) {
+ const std::string& key = elem.first;
+ ASSERT_EQ("NOT_FOUND", Get(cf, key));
+ }
+ ++cf;
+ }
+ Close();
+ Destroy(options, true /* delete_cf_paths */);
+}
+
+TEST_P(ExternalSSTFileTest, IngestFilesTriggerFlushingWithTwoWriteQueue) {
+ Options options = CurrentOptions();
+ // Use large buffer to avoid memtable flush
+ options.write_buffer_size = 1024 * 1024;
+ options.two_write_queues = true;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(dbfull()->Put(WriteOptions(), "1000", "v1"));
+ ASSERT_OK(dbfull()->Put(WriteOptions(), "1001", "v1"));
+ ASSERT_OK(dbfull()->Put(WriteOptions(), "9999", "v1"));
+
+  // Put one key that overlaps with keys already in the memtable.
+  // Ingesting it will trigger a memtable flush, which requires this thread
+  // to be at the front of the 2nd writer queue. We must make sure that it
+  // won't enter the 2nd writer queue a second time.
+ std::vector<std::pair<std::string, std::string>> data;
+ data.push_back(std::make_pair("1001", "v2"));
+ GenerateAndAddExternalFile(options, data);
+}
+
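+// The parameter tuple is (write_global_seqno, verify_checksums_before_ingest);
+// the first instantiation below covers all four combinations.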
+INSTANTIATE_TEST_CASE_P(ExternalSSTFileTest, ExternalSSTFileTest,
+ testing::Values(std::make_tuple(false, false),
+ std::make_tuple(false, true),
+ std::make_tuple(true, false),
+ std::make_tuple(true, true)));
+
+INSTANTIATE_TEST_CASE_P(ExternSSTFileLinkFailFallbackTest,
+ ExternSSTFileLinkFailFallbackTest,
+ testing::Values(std::make_tuple(true, false),
+ std::make_tuple(true, true),
+ std::make_tuple(false, false)));
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as External SST File Writer and Ingestion are not supported "
+ "in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/fault_injection_test.cc b/src/rocksdb/db/fault_injection_test.cc
new file mode 100644
index 000000000..f4ca3458a
--- /dev/null
+++ b/src/rocksdb/db/fault_injection_test.cc
@@ -0,0 +1,555 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// This test uses a custom Env to keep track of the state of a filesystem as of
+// the last "sync". It then checks for data loss errors by purposely dropping
+// file data (or entire files) not protected by a "sync".
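+//
+// Each test below roughly follows the same cycle: write some keys (optionally
+// forcing persistence via a synced WAL write or a CompactRange()), write more
+// keys without syncing, deactivate the filesystem and drop or delete the
+// unsynced data via ResetDBState(), then reopen the DB and verify that synced
+// keys are present and that unsynced keys produce no error other than
+// NotFound.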
+
+#include "db/db_impl/db_impl.h"
+#include "db/log_format.h"
+#include "db/version_set.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/table.h"
+#include "rocksdb/write_batch.h"
+#include "test_util/fault_injection_test_env.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static const int kValueSize = 1000;
+static const int kMaxNumValues = 2000;
+static const size_t kNumIterations = 3;
+
+enum FaultInjectionOptionConfig {
+ kDefault,
+ kDifferentDataDir,
+ kWalDir,
+ kSyncWal,
+ kWalDirSyncWal,
+ kMultiLevels,
+ kEnd,
+};
+class FaultInjectionTest
+ : public testing::Test,
+ public testing::WithParamInterface<std::tuple<
+ bool, FaultInjectionOptionConfig, FaultInjectionOptionConfig>> {
+ protected:
+ int option_config_;
+  int non_inclusive_end_range_;  // kEnd or an equivalent end marker
+  // When data needs to be made persistent, sync the WAL
+  bool sync_use_wal_;
+  // When data needs to be made persistent, call DB::CompactRange()
+  bool sync_use_compact_;
+
+ bool sequential_order_;
+
+ public:
+ enum ExpectedVerifResult { kValExpectFound, kValExpectNoError };
+ enum ResetMethod {
+ kResetDropUnsyncedData,
+ kResetDropRandomUnsyncedData,
+ kResetDeleteUnsyncedFiles,
+ kResetDropAndDeleteUnsynced
+ };
+
+ std::unique_ptr<Env> base_env_;
+ FaultInjectionTestEnv* env_;
+ std::string dbname_;
+ std::shared_ptr<Cache> tiny_cache_;
+ Options options_;
+ DB* db_;
+
+ FaultInjectionTest()
+ : option_config_(std::get<1>(GetParam())),
+ non_inclusive_end_range_(std::get<2>(GetParam())),
+ sync_use_wal_(false),
+ sync_use_compact_(true),
+ base_env_(nullptr),
+ env_(nullptr),
+ db_(nullptr) {}
+
+ ~FaultInjectionTest() override {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ }
+
+ bool ChangeOptions() {
+ option_config_++;
+ if (option_config_ >= non_inclusive_end_range_) {
+ return false;
+ } else {
+ if (option_config_ == kMultiLevels) {
+ base_env_.reset(new MockEnv(Env::Default()));
+ }
+ return true;
+ }
+ }
+
+ // Return the current option configuration.
+ Options CurrentOptions() {
+ sync_use_wal_ = false;
+ sync_use_compact_ = true;
+ Options options;
+ switch (option_config_) {
+ case kWalDir:
+ options.wal_dir = test::PerThreadDBPath(env_, "fault_test_wal");
+ break;
+ case kDifferentDataDir:
+ options.db_paths.emplace_back(
+ test::PerThreadDBPath(env_, "fault_test_data"), 1000000U);
+ break;
+ case kSyncWal:
+ sync_use_wal_ = true;
+ sync_use_compact_ = false;
+ break;
+ case kWalDirSyncWal:
+ options.wal_dir = test::PerThreadDBPath(env_, "/fault_test_wal");
+ sync_use_wal_ = true;
+ sync_use_compact_ = false;
+ break;
+ case kMultiLevels:
+ options.write_buffer_size = 64 * 1024;
+ options.target_file_size_base = 64 * 1024;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 4;
+ options.max_bytes_for_level_base = 128 * 1024;
+ options.max_write_buffer_number = 2;
+ options.max_background_compactions = 8;
+ options.max_background_flushes = 8;
+ sync_use_wal_ = true;
+ sync_use_compact_ = false;
+ break;
+ default:
+ break;
+ }
+ return options;
+ }
+
+ Status NewDB() {
+ assert(db_ == nullptr);
+ assert(tiny_cache_ == nullptr);
+ assert(env_ == nullptr);
+
+ env_ =
+ new FaultInjectionTestEnv(base_env_ ? base_env_.get() : Env::Default());
+
+ options_ = CurrentOptions();
+ options_.env = env_;
+ options_.paranoid_checks = true;
+
+ BlockBasedTableOptions table_options;
+ tiny_cache_ = NewLRUCache(100);
+ table_options.block_cache = tiny_cache_;
+ options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ dbname_ = test::PerThreadDBPath("fault_test");
+
+ EXPECT_OK(DestroyDB(dbname_, options_));
+
+ options_.create_if_missing = true;
+ Status s = OpenDB();
+ options_.create_if_missing = false;
+ return s;
+ }
+
+ void SetUp() override {
+ sequential_order_ = std::get<0>(GetParam());
+ ASSERT_OK(NewDB());
+ }
+
+ void TearDown() override {
+ CloseDB();
+
+ Status s = DestroyDB(dbname_, options_);
+
+ delete env_;
+ env_ = nullptr;
+
+ tiny_cache_.reset();
+
+ ASSERT_OK(s);
+ }
+
+ void Build(const WriteOptions& write_options, int start_idx, int num_vals) {
+ std::string key_space, value_space;
+ WriteBatch batch;
+ for (int i = start_idx; i < start_idx + num_vals; i++) {
+ Slice key = Key(i, &key_space);
+ batch.Clear();
+ batch.Put(key, Value(i, &value_space));
+ ASSERT_OK(db_->Write(write_options, &batch));
+ }
+ }
+
+ Status ReadValue(int i, std::string* val) const {
+ std::string key_space, value_space;
+ Slice key = Key(i, &key_space);
+ Value(i, &value_space);
+ ReadOptions options;
+ return db_->Get(options, key, val);
+ }
+
+ Status Verify(int start_idx, int num_vals,
+ ExpectedVerifResult expected) const {
+ std::string val;
+ std::string value_space;
+ Status s;
+ for (int i = start_idx; i < start_idx + num_vals && s.ok(); i++) {
+ Value(i, &value_space);
+ s = ReadValue(i, &val);
+ if (s.ok()) {
+ EXPECT_EQ(value_space, val);
+ }
+ if (expected == kValExpectFound) {
+ if (!s.ok()) {
+ fprintf(stderr, "Error when read %dth record (expect found): %s\n", i,
+ s.ToString().c_str());
+ return s;
+ }
+ } else if (!s.ok() && !s.IsNotFound()) {
+ fprintf(stderr, "Error when read %dth record: %s\n", i,
+ s.ToString().c_str());
+ return s;
+ }
+ }
+ return Status::OK();
+ }
+
+ // Return the ith key
+ Slice Key(int i, std::string* storage) const {
+ unsigned long long num = i;
+ if (!sequential_order_) {
+      // Apply a pseudo-random transform so keys are not in sequential order
+ const int m = 0x5bd1e995;
+ num *= m;
+ num ^= num << 24;
+ }
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%016d", static_cast<int>(num));
+ storage->assign(buf, strlen(buf));
+ return Slice(*storage);
+ }
+
+ // Return the value to associate with the specified key
+ Slice Value(int k, std::string* storage) const {
+ Random r(k);
+ return test::RandomString(&r, kValueSize, storage);
+ }
+
+ void CloseDB() {
+ delete db_;
+ db_ = nullptr;
+ }
+
+ Status OpenDB() {
+ CloseDB();
+ env_->ResetState();
+ Status s = DB::Open(options_, dbname_, &db_);
+ assert(db_ != nullptr);
+ return s;
+ }
+
+ void DeleteAllData() {
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ WriteOptions options;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(db_->Delete(WriteOptions(), iter->key()));
+ }
+
+ delete iter;
+
+ FlushOptions flush_options;
+ flush_options.wait = true;
+ db_->Flush(flush_options);
+ }
+
+ // rnd cannot be null for kResetDropRandomUnsyncedData
+ void ResetDBState(ResetMethod reset_method, Random* rnd = nullptr) {
+ env_->AssertNoOpenFile();
+ switch (reset_method) {
+ case kResetDropUnsyncedData:
+ ASSERT_OK(env_->DropUnsyncedFileData());
+ break;
+ case kResetDropRandomUnsyncedData:
+ ASSERT_OK(env_->DropRandomUnsyncedFileData(rnd));
+ break;
+ case kResetDeleteUnsyncedFiles:
+ ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync());
+ break;
+ case kResetDropAndDeleteUnsynced:
+ ASSERT_OK(env_->DropUnsyncedFileData());
+ ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync());
+ break;
+ default:
+ assert(false);
+ }
+ }
+
+ void PartialCompactTestPreFault(int num_pre_sync, int num_post_sync) {
+ DeleteAllData();
+
+ WriteOptions write_options;
+ write_options.sync = sync_use_wal_;
+
+ Build(write_options, 0, num_pre_sync);
+ if (sync_use_compact_) {
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ }
+ write_options.sync = false;
+ Build(write_options, num_pre_sync, num_post_sync);
+ }
+
+ void PartialCompactTestReopenWithFault(ResetMethod reset_method,
+ int num_pre_sync, int num_post_sync,
+ Random* rnd = nullptr) {
+ env_->SetFilesystemActive(false);
+ CloseDB();
+ ResetDBState(reset_method, rnd);
+ ASSERT_OK(OpenDB());
+ ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::kValExpectFound));
+ ASSERT_OK(Verify(num_pre_sync, num_post_sync,
+ FaultInjectionTest::kValExpectNoError));
+ WaitCompactionFinish();
+ ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::kValExpectFound));
+ ASSERT_OK(Verify(num_pre_sync, num_post_sync,
+ FaultInjectionTest::kValExpectNoError));
+ }
+
+  void NoWriteTestPreFault() {}
+
+ void NoWriteTestReopenWithFault(ResetMethod reset_method) {
+ CloseDB();
+ ResetDBState(reset_method);
+ ASSERT_OK(OpenDB());
+ }
+
+ void WaitCompactionFinish() {
+ static_cast<DBImpl*>(db_->GetRootDB())->TEST_WaitForCompact();
+ ASSERT_OK(db_->Put(WriteOptions(), "", ""));
+ }
+};
+
+class FaultInjectionTestSplitted : public FaultInjectionTest {};
+
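+// For each option configuration in [option_config_, non_inclusive_end_range_),
+// run kNumIterations rounds of write / simulated crash / reopen / verify,
+// exercising each of the ResetMethod variants.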
+TEST_P(FaultInjectionTestSplitted, FaultTest) {
+ do {
+ Random rnd(301);
+
+ for (size_t idx = 0; idx < kNumIterations; idx++) {
+ int num_pre_sync = rnd.Uniform(kMaxNumValues);
+ int num_post_sync = rnd.Uniform(kMaxNumValues);
+
+ PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+ PartialCompactTestReopenWithFault(kResetDropUnsyncedData, num_pre_sync,
+ num_post_sync);
+ NoWriteTestPreFault();
+ NoWriteTestReopenWithFault(kResetDropUnsyncedData);
+
+ PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+ PartialCompactTestReopenWithFault(kResetDropRandomUnsyncedData,
+ num_pre_sync, num_post_sync, &rnd);
+ NoWriteTestPreFault();
+ NoWriteTestReopenWithFault(kResetDropUnsyncedData);
+
+      // Setting a separate data path won't pass the test, as we don't sync
+      // it after creating new files.
+ PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+ PartialCompactTestReopenWithFault(kResetDropAndDeleteUnsynced,
+ num_pre_sync, num_post_sync);
+ NoWriteTestPreFault();
+ NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced);
+
+ PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+      // No new files are created, so no files will be dropped and we expect
+      // to find all values.
+ PartialCompactTestReopenWithFault(kResetDeleteUnsyncedFiles, num_pre_sync,
+ num_post_sync);
+ NoWriteTestPreFault();
+ NoWriteTestReopenWithFault(kResetDeleteUnsyncedFiles);
+ }
+ } while (ChangeOptions());
+}
+
+// Previous log file is not fsynced if sync is forced after log rolling.
+TEST_P(FaultInjectionTest, WriteOptionSyncTest) {
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ // Block the job queue to prevent flush job from running.
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::HIGH);
+ sleeping_task_low.WaitUntilSleeping();
+
+ WriteOptions write_options;
+ write_options.sync = false;
+
+ std::string key_space, value_space;
+ ASSERT_OK(
+ db_->Put(write_options, Key(1, &key_space), Value(1, &value_space)));
+ FlushOptions flush_options;
+ flush_options.wait = false;
+ ASSERT_OK(db_->Flush(flush_options));
+ write_options.sync = true;
+ ASSERT_OK(
+ db_->Put(write_options, Key(2, &key_space), Value(2, &value_space)));
+ db_->FlushWAL(false);
+
+ env_->SetFilesystemActive(false);
+ NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced);
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+
+ ASSERT_OK(OpenDB());
+ std::string val;
+ Value(2, &value_space);
+ ASSERT_OK(ReadValue(2, &val));
+ ASSERT_EQ(value_space, val);
+
+ Value(1, &value_space);
+ ASSERT_OK(ReadValue(1, &val));
+ ASSERT_EQ(value_space, val);
+}
+
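+// Simulate a crash after a compaction has finished running but before its
+// result has been installed and persisted, then verify that reopening the DB
+// recovers every key and that no background compaction is scheduled before
+// DB::Open() has completed.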
+TEST_P(FaultInjectionTest, UninstalledCompaction) {
+ options_.target_file_size_base = 32 * 1024;
+ options_.write_buffer_size = 100 << 10; // 100KB
+ options_.level0_file_num_compaction_trigger = 6;
+ options_.level0_stop_writes_trigger = 1 << 10;
+ options_.level0_slowdown_writes_trigger = 1 << 10;
+ options_.max_background_compactions = 1;
+ OpenDB();
+
+ if (!sequential_order_) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"FaultInjectionTest::FaultTest:0", "DBImpl::BGWorkCompaction"},
+ {"CompactionJob::Run():End", "FaultInjectionTest::FaultTest:1"},
+ {"FaultInjectionTest::FaultTest:2",
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"},
+ });
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ int kNumKeys = 1000;
+ Build(WriteOptions(), 0, kNumKeys);
+ FlushOptions flush_options;
+ flush_options.wait = true;
+ db_->Flush(flush_options);
+ ASSERT_OK(db_->Put(WriteOptions(), "", ""));
+ TEST_SYNC_POINT("FaultInjectionTest::FaultTest:0");
+ TEST_SYNC_POINT("FaultInjectionTest::FaultTest:1");
+ env_->SetFilesystemActive(false);
+ TEST_SYNC_POINT("FaultInjectionTest::FaultTest:2");
+ CloseDB();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ResetDBState(kResetDropUnsyncedData);
+
+ std::atomic<bool> opened(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::Open:Opened", [&](void* /*arg*/) { opened.store(true); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BGWorkCompaction",
+ [&](void* /*arg*/) { ASSERT_TRUE(opened.load()); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(OpenDB());
+ ASSERT_OK(Verify(0, kNumKeys, FaultInjectionTest::kValExpectFound));
+ WaitCompactionFinish();
+ ASSERT_OK(Verify(0, kNumKeys, FaultInjectionTest::kValExpectFound));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
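+// Same scenario as WriteOptionSyncTest above, except that the WAL is
+// persisted explicitly with FlushWAL(true) instead of relying on a synced
+// write.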
+TEST_P(FaultInjectionTest, ManualLogSyncTest) {
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ // Block the job queue to prevent flush job from running.
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::HIGH);
+ sleeping_task_low.WaitUntilSleeping();
+
+ WriteOptions write_options;
+ write_options.sync = false;
+
+ std::string key_space, value_space;
+ ASSERT_OK(
+ db_->Put(write_options, Key(1, &key_space), Value(1, &value_space)));
+ FlushOptions flush_options;
+ flush_options.wait = false;
+ ASSERT_OK(db_->Flush(flush_options));
+ ASSERT_OK(
+ db_->Put(write_options, Key(2, &key_space), Value(2, &value_space)));
+ ASSERT_OK(db_->FlushWAL(true));
+
+ env_->SetFilesystemActive(false);
+ NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced);
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+
+ ASSERT_OK(OpenDB());
+ std::string val;
+ Value(2, &value_space);
+ ASSERT_OK(ReadValue(2, &val));
+ ASSERT_EQ(value_space, val);
+
+ Value(1, &value_space);
+ ASSERT_OK(ReadValue(1, &val));
+ ASSERT_EQ(value_space, val);
+}
+
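+// Entries added to a WriteBatch after MarkWalTerminationPoint() are applied
+// to the memtable but not written to the WAL, so they must not survive a
+// crash that drops unsynced data.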
+TEST_P(FaultInjectionTest, WriteBatchWalTerminationTest) {
+ ReadOptions ro;
+ Options options = CurrentOptions();
+ options.env = env_;
+
+ WriteOptions wo;
+ wo.sync = true;
+ wo.disableWAL = false;
+ WriteBatch batch;
+ batch.Put("cats", "dogs");
+ batch.MarkWalTerminationPoint();
+ batch.Put("boys", "girls");
+ ASSERT_OK(db_->Write(wo, &batch));
+
+ env_->SetFilesystemActive(false);
+ NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced);
+ ASSERT_OK(OpenDB());
+
+ std::string val;
+ ASSERT_OK(db_->Get(ro, "cats", &val));
+ ASSERT_EQ("dogs", val);
+ ASSERT_EQ(db_->Get(ro, "boys", &val), Status::NotFound());
+}
+
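+// The parameter tuple is (sequential_order, first option config,
+// non-inclusive end config). The "Splitted" fixture below runs the FaultTest
+// option sweep over two separate config ranges instead of one.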
+INSTANTIATE_TEST_CASE_P(
+ FaultTest, FaultInjectionTest,
+ ::testing::Values(std::make_tuple(false, kDefault, kEnd),
+ std::make_tuple(true, kDefault, kEnd)));
+
+INSTANTIATE_TEST_CASE_P(
+ FaultTest, FaultInjectionTestSplitted,
+ ::testing::Values(std::make_tuple(false, kDefault, kSyncWal),
+ std::make_tuple(true, kDefault, kSyncWal),
+ std::make_tuple(false, kSyncWal, kEnd),
+ std::make_tuple(true, kSyncWal, kEnd)));
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/file_indexer.cc b/src/rocksdb/db/file_indexer.cc
new file mode 100644
index 000000000..523cb3c16
--- /dev/null
+++ b/src/rocksdb/db/file_indexer.cc
@@ -0,0 +1,216 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/file_indexer.h"
+#include <algorithm>
+#include <functional>
+#include "db/version_edit.h"
+#include "rocksdb/comparator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+FileIndexer::FileIndexer(const Comparator* ucmp)
+ : num_levels_(0), ucmp_(ucmp), level_rb_(nullptr) {}
+
+size_t FileIndexer::NumLevelIndex() const { return next_level_index_.size(); }
+
+size_t FileIndexer::LevelIndexSize(size_t level) const {
+ if (level >= next_level_index_.size()) {
+ return 0;
+ }
+ return next_level_index_[level].num_index;
+}
+
+void FileIndexer::GetNextLevelIndex(const size_t level, const size_t file_index,
+ const int cmp_smallest,
+ const int cmp_largest, int32_t* left_bound,
+ int32_t* right_bound) const {
+ assert(level > 0);
+
+ // Last level, no hint
+ if (level == num_levels_ - 1) {
+ *left_bound = 0;
+ *right_bound = -1;
+ return;
+ }
+
+ assert(level < num_levels_ - 1);
+ assert(static_cast<int32_t>(file_index) <= level_rb_[level]);
+
+ const IndexUnit* index_units = next_level_index_[level].index_units;
+ const auto& index = index_units[file_index];
+
+ if (cmp_smallest < 0) {
+ *left_bound = (level > 0 && file_index > 0)
+ ? index_units[file_index - 1].largest_lb
+ : 0;
+ *right_bound = index.smallest_rb;
+ } else if (cmp_smallest == 0) {
+ *left_bound = index.smallest_lb;
+ *right_bound = index.smallest_rb;
+ } else if (cmp_smallest > 0 && cmp_largest < 0) {
+ *left_bound = index.smallest_lb;
+ *right_bound = index.largest_rb;
+ } else if (cmp_largest == 0) {
+ *left_bound = index.largest_lb;
+ *right_bound = index.largest_rb;
+ } else if (cmp_largest > 0) {
+ *left_bound = index.largest_lb;
+ *right_bound = level_rb_[level + 1];
+ } else {
+ assert(false);
+ }
+
+ assert(*left_bound >= 0);
+ assert(*left_bound <= *right_bound + 1);
+ assert(*right_bound <= level_rb_[level + 1]);
+}
+
+void FileIndexer::UpdateIndex(Arena* arena, const size_t num_levels,
+ std::vector<FileMetaData*>* const files) {
+ if (files == nullptr) {
+ return;
+ }
+  if (num_levels == 0) {  // avoid size_t underflow in (num_levels - 1) below
+ num_levels_ = num_levels;
+ return;
+ }
+  assert(level_rb_ == nullptr);  // level_rb_ should be initialized here
+
+ num_levels_ = num_levels;
+ next_level_index_.resize(num_levels);
+
+ char* mem = arena->AllocateAligned(num_levels_ * sizeof(int32_t));
+ level_rb_ = new (mem) int32_t[num_levels_];
+ for (size_t i = 0; i < num_levels_; i++) {
+ level_rb_[i] = -1;
+ }
+
+ // L1 - Ln-1
+ for (size_t level = 1; level < num_levels_ - 1; ++level) {
+ const auto& upper_files = files[level];
+ const int32_t upper_size = static_cast<int32_t>(upper_files.size());
+ const auto& lower_files = files[level + 1];
+ level_rb_[level] = static_cast<int32_t>(upper_files.size()) - 1;
+ if (upper_size == 0) {
+ continue;
+ }
+ IndexLevel& index_level = next_level_index_[level];
+ index_level.num_index = upper_size;
+ mem = arena->AllocateAligned(upper_size * sizeof(IndexUnit));
+ index_level.index_units = new (mem) IndexUnit[upper_size];
+
+ CalculateLB(
+ upper_files, lower_files, &index_level,
+ [this](const FileMetaData* a, const FileMetaData* b) -> int {
+ return ucmp_->CompareWithoutTimestamp(a->smallest.user_key(),
+ b->largest.user_key());
+ },
+ [](IndexUnit* index, int32_t f_idx) { index->smallest_lb = f_idx; });
+ CalculateLB(
+ upper_files, lower_files, &index_level,
+ [this](const FileMetaData* a, const FileMetaData* b) -> int {
+ return ucmp_->CompareWithoutTimestamp(a->largest.user_key(),
+ b->largest.user_key());
+ },
+ [](IndexUnit* index, int32_t f_idx) { index->largest_lb = f_idx; });
+ CalculateRB(
+ upper_files, lower_files, &index_level,
+ [this](const FileMetaData* a, const FileMetaData* b) -> int {
+ return ucmp_->CompareWithoutTimestamp(a->smallest.user_key(),
+ b->smallest.user_key());
+ },
+ [](IndexUnit* index, int32_t f_idx) { index->smallest_rb = f_idx; });
+ CalculateRB(
+ upper_files, lower_files, &index_level,
+ [this](const FileMetaData* a, const FileMetaData* b) -> int {
+ return ucmp_->CompareWithoutTimestamp(a->largest.user_key(),
+ b->smallest.user_key());
+ },
+ [](IndexUnit* index, int32_t f_idx) { index->largest_rb = f_idx; });
+ }
+
+ level_rb_[num_levels_ - 1] =
+ static_cast<int32_t>(files[num_levels_ - 1].size()) - 1;
+}
+
+void FileIndexer::CalculateLB(
+ const std::vector<FileMetaData*>& upper_files,
+ const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
+ std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+ std::function<void(IndexUnit*, int32_t)> set_index) {
+ const int32_t upper_size = static_cast<int32_t>(upper_files.size());
+ const int32_t lower_size = static_cast<int32_t>(lower_files.size());
+ int32_t upper_idx = 0;
+ int32_t lower_idx = 0;
+
+ IndexUnit* index = index_level->index_units;
+ while (upper_idx < upper_size && lower_idx < lower_size) {
+ int cmp = cmp_op(upper_files[upper_idx], lower_files[lower_idx]);
+
+ if (cmp == 0) {
+ set_index(&index[upper_idx], lower_idx);
+ ++upper_idx;
+ } else if (cmp > 0) {
+      // The lower level file's largest key is smaller, so a key won't hit
+      // in that file. Move to the next lower file.
+ ++lower_idx;
+ } else {
+      // The lower level's file becomes larger; update the index and
+      // move to the next upper file.
+ set_index(&index[upper_idx], lower_idx);
+ ++upper_idx;
+ }
+ }
+
+ while (upper_idx < upper_size) {
+    // Lower files are exhausted, which means the remaining upper files are
+    // greater than any lower file. Set the index to the lower level size.
+ set_index(&index[upper_idx], lower_size);
+ ++upper_idx;
+ }
+}
+
+void FileIndexer::CalculateRB(
+ const std::vector<FileMetaData*>& upper_files,
+ const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
+ std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+ std::function<void(IndexUnit*, int32_t)> set_index) {
+ const int32_t upper_size = static_cast<int32_t>(upper_files.size());
+ const int32_t lower_size = static_cast<int32_t>(lower_files.size());
+ int32_t upper_idx = upper_size - 1;
+ int32_t lower_idx = lower_size - 1;
+
+ IndexUnit* index = index_level->index_units;
+ while (upper_idx >= 0 && lower_idx >= 0) {
+ int cmp = cmp_op(upper_files[upper_idx], lower_files[lower_idx]);
+
+ if (cmp == 0) {
+ set_index(&index[upper_idx], lower_idx);
+ --upper_idx;
+ } else if (cmp < 0) {
+      // The lower level file's smallest key is larger, so a key won't hit
+      // in that file. Move to the next lower file.
+ --lower_idx;
+ } else {
+      // The lower level's file becomes smaller; update the index and move
+      // to the next upper file.
+ set_index(&index[upper_idx], lower_idx);
+ --upper_idx;
+ }
+ }
+ while (upper_idx >= 0) {
+    // Lower files are exhausted, which means the remaining upper files are
+    // smaller than any lower file. Set the index to -1.
+ set_index(&index[upper_idx], -1);
+ --upper_idx;
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/file_indexer.h b/src/rocksdb/db/file_indexer.h
new file mode 100644
index 000000000..ad7553f2c
--- /dev/null
+++ b/src/rocksdb/db/file_indexer.h
@@ -0,0 +1,142 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <cstdint>
+#include <functional>
+#include <limits>
+#include <vector>
+#include "memory/arena.h"
+#include "port/port.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Comparator;
+struct FileMetaData;
+struct FdWithKeyRange;
+struct FileLevel;
+
+// The file tree structure in Version is prebuilt and the range of each file
+// is known. On Version::Get(), it uses binary search to find a potential file
+// and then check if a target key can be found in the file by comparing the key
+// to each file's smallest and largest key. The results of these comparisons
+// can be reused beyond checking if a key falls into a file's range.
+// With some pre-calculated knowledge, each key comparison that has been done
+// can serve as a hint to narrow down further searches: if a key compares
+// smaller than a file's smallest or largest key, that comparison can be used
+// to determine the right bound of the next binary search. Similarly, if a key
+// compares larger than a file's smallest or largest key, it can be used to
+// determine the left bound of the next binary search.
+// With these hints: it can greatly reduce the range of binary search,
+// especially for bottom levels, given that one file most likely overlaps with
+// only N files from level below (where N is max_bytes_for_level_multiplier).
+// So on level L, we will only look at ~N files instead of N^L files on the
+// naive approach.
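+//
+// Rough usage sketch (illustrative; the comment above points to the
+// Version::Get() file lookup as the consumer): after comparing a key against
+// a file on level L and obtaining cmp_smallest / cmp_largest from the user
+// comparator, pass both results to GetNextLevelIndex() to get the
+// [left_bound, right_bound] range of candidate files on level L + 1, instead
+// of binary-searching the whole level.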
+class FileIndexer {
+ public:
+ explicit FileIndexer(const Comparator* ucmp);
+
+ size_t NumLevelIndex() const;
+
+ size_t LevelIndexSize(size_t level) const;
+
+  // Return a file index range in the next level to search for a key, based
+  // on the smallest- and largest-key comparisons for the current file
+  // specified by level and file_index. When *left_bound <= *right_bound,
+  // both bounds are valid indices into the next level's file vector.
+ void GetNextLevelIndex(const size_t level, const size_t file_index,
+ const int cmp_smallest, const int cmp_largest,
+ int32_t* left_bound, int32_t* right_bound) const;
+
+ void UpdateIndex(Arena* arena, const size_t num_levels,
+ std::vector<FileMetaData*>* const files);
+
+ enum {
+ // MSVC version 1800 still does not have constexpr for ::max()
+ kLevelMaxIndex = ROCKSDB_NAMESPACE::port::kMaxInt32
+ };
+
+ private:
+ size_t num_levels_;
+ const Comparator* ucmp_;
+
+ struct IndexUnit {
+ IndexUnit()
+ : smallest_lb(0), largest_lb(0), smallest_rb(-1), largest_rb(-1) {}
+ // During file search, a key is compared against smallest and largest
+ // from a FileMetaData. It can have 3 possible outcomes:
+    // (1) key is smaller than smallest, implying it is also smaller than
+    //     largest. Precalculated index based on "smallest < smallest" can
+    //     be used to provide right bound.
+    // (2) key is in between smallest and largest.
+    //     Precalculated index based on "smallest > largest" can be used to
+ // provide left bound.
+ // Precalculated index based on "largest < smallest" can be used to
+ // provide right bound.
+ // (3) key is larger than largest, implying it is also larger than smallest.
+ // Precalculated index based on "largest > largest" can be used to
+ // provide left bound.
+ //
+ // As a result, we will need to do:
+ // Compare smallest (<=) and largest keys from upper level file with
+ // smallest key from lower level to get a right bound.
+ // Compare smallest (>=) and largest keys from upper level file with
+ // largest key from lower level to get a left bound.
+ //
+ // Example:
+ // level 1: [50 - 60]
+ // level 2: [1 - 40], [45 - 55], [58 - 80]
+    // A key 35 compares less than 50, so the 3rd file on level 2 can be
+    // skipped according to rule (1). LB = 0, RB = 1.
+    // A key 53 sits in between 50 and 60. The 1st file on level 2 can be
+ // skipped according to rule (2)-a, but the 3rd file cannot be skipped
+ // because 60 is greater than 58. LB = 1, RB = 2.
+    // A key 70 compares larger than 60, so the 1st and 2nd files can be
+    // skipped according to rule (3). LB = 2, RB = 2.
+ //
+ // Point to a left most file in a lower level that may contain a key,
+ // which compares greater than smallest of a FileMetaData (upper level)
+ int32_t smallest_lb;
+ // Point to a left most file in a lower level that may contain a key,
+ // which compares greater than largest of a FileMetaData (upper level)
+ int32_t largest_lb;
+ // Point to a right most file in a lower level that may contain a key,
+ // which compares smaller than smallest of a FileMetaData (upper level)
+ int32_t smallest_rb;
+ // Point to a right most file in a lower level that may contain a key,
+ // which compares smaller than largest of a FileMetaData (upper level)
+ int32_t largest_rb;
+ };
+
+ // Data structure to store IndexUnits in a whole level
+ struct IndexLevel {
+ size_t num_index;
+ IndexUnit* index_units;
+
+ IndexLevel() : num_index(0), index_units(nullptr) {}
+ };
+
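+  // Both helpers perform a merge-style sweep over an upper level and the
+  // level below it: cmp_op compares an upper file with a lower file, and
+  // set_index stores the resulting bound (left bound for CalculateLB, right
+  // bound for CalculateRB) into the corresponding upper file's IndexUnit.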
+ void CalculateLB(
+ const std::vector<FileMetaData*>& upper_files,
+ const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
+ std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+ std::function<void(IndexUnit*, int32_t)> set_index);
+
+ void CalculateRB(
+ const std::vector<FileMetaData*>& upper_files,
+ const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
+ std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+ std::function<void(IndexUnit*, int32_t)> set_index);
+
+ autovector<IndexLevel> next_level_index_;
+ int32_t* level_rb_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/file_indexer_test.cc b/src/rocksdb/db/file_indexer_test.cc
new file mode 100644
index 000000000..99ce93993
--- /dev/null
+++ b/src/rocksdb/db/file_indexer_test.cc
@@ -0,0 +1,350 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/file_indexer.h"
+#include <string>
+#include "db/dbformat.h"
+#include "db/version_edit.h"
+#include "port/stack_trace.h"
+#include "rocksdb/comparator.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class IntComparator : public Comparator {
+ public:
+ int Compare(const Slice& a, const Slice& b) const override {
+ assert(a.size() == 8);
+ assert(b.size() == 8);
+ int64_t diff = *reinterpret_cast<const int64_t*>(a.data()) -
+ *reinterpret_cast<const int64_t*>(b.data());
+ if (diff < 0) {
+ return -1;
+ } else if (diff == 0) {
+ return 0;
+ } else {
+ return 1;
+ }
+ }
+
+ const char* Name() const override { return "IntComparator"; }
+
+ void FindShortestSeparator(std::string* /*start*/,
+ const Slice& /*limit*/) const override {}
+
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+class FileIndexerTest : public testing::Test {
+ public:
+ FileIndexerTest()
+ : kNumLevels(4), files(new std::vector<FileMetaData*>[kNumLevels]) {}
+
+ ~FileIndexerTest() override {
+ ClearFiles();
+ delete[] files;
+ }
+
+ void AddFile(int level, int64_t smallest, int64_t largest) {
+ auto* f = new FileMetaData();
+ f->smallest = IntKey(smallest);
+ f->largest = IntKey(largest);
+ files[level].push_back(f);
+ }
+
+ InternalKey IntKey(int64_t v) {
+ return InternalKey(Slice(reinterpret_cast<char*>(&v), 8), 0, kTypeValue);
+ }
+
+ void ClearFiles() {
+ for (uint32_t i = 0; i < kNumLevels; ++i) {
+ for (auto* f : files[i]) {
+ delete f;
+ }
+ files[i].clear();
+ }
+ }
+
+ void GetNextLevelIndex(const uint32_t level, const uint32_t file_index,
+ const int cmp_smallest, const int cmp_largest, int32_t* left_index,
+ int32_t* right_index) {
+ *left_index = 100;
+ *right_index = 100;
+ indexer->GetNextLevelIndex(level, file_index, cmp_smallest, cmp_largest,
+ left_index, right_index);
+ }
+
+ int32_t left = 100;
+ int32_t right = 100;
+ const uint32_t kNumLevels;
+ IntComparator ucmp;
+ FileIndexer* indexer;
+
+ std::vector<FileMetaData*>* files;
+};
+
+// Case 0: Empty
+TEST_F(FileIndexerTest, Empty) {
+ Arena arena;
+ indexer = new FileIndexer(&ucmp);
+ indexer->UpdateIndex(&arena, 0, files);
+ delete indexer;
+}
+
+// Case 1: no overlap, files are on the left of next level files
+TEST_F(FileIndexerTest, no_overlap_left) {
+ Arena arena;
+ indexer = new FileIndexer(&ucmp);
+ // level 1
+ AddFile(1, 100, 200);
+ AddFile(1, 300, 400);
+ AddFile(1, 500, 600);
+ // level 2
+ AddFile(2, 1500, 1600);
+ AddFile(2, 1601, 1699);
+ AddFile(2, 1700, 1800);
+ // level 3
+ AddFile(3, 2500, 2600);
+ AddFile(3, 2601, 2699);
+ AddFile(3, 2700, 2800);
+ indexer->UpdateIndex(&arena, kNumLevels, files);
+ for (uint32_t level = 1; level < 3; ++level) {
+ for (uint32_t f = 0; f < 3; ++f) {
+ GetNextLevelIndex(level, f, -1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(level, f, 0, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(level, f, 1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(level, f, 1, 0, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(level, f, 1, 1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(2, right);
+ }
+ }
+ delete indexer;
+ ClearFiles();
+}
+
+// Case 2: no overlap, files are on the right of next level files
+TEST_F(FileIndexerTest, no_overlap_right) {
+ Arena arena;
+ indexer = new FileIndexer(&ucmp);
+ // level 1
+ AddFile(1, 2100, 2200);
+ AddFile(1, 2300, 2400);
+ AddFile(1, 2500, 2600);
+ // level 2
+ AddFile(2, 1500, 1600);
+ AddFile(2, 1501, 1699);
+ AddFile(2, 1700, 1800);
+ // level 3
+ AddFile(3, 500, 600);
+ AddFile(3, 501, 699);
+ AddFile(3, 700, 800);
+ indexer->UpdateIndex(&arena, kNumLevels, files);
+ for (uint32_t level = 1; level < 3; ++level) {
+ for (uint32_t f = 0; f < 3; ++f) {
+ GetNextLevelIndex(level, f, -1, -1, &left, &right);
+ ASSERT_EQ(f == 0 ? 0 : 3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(level, f, 0, -1, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(level, f, 1, -1, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(level, f, 1, -1, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(level, f, 1, 0, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(level, f, 1, 1, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ }
+ }
+ delete indexer;
+}
+
+// Case 3: empty L2
+TEST_F(FileIndexerTest, empty_L2) {
+ Arena arena;
+ indexer = new FileIndexer(&ucmp);
+ for (uint32_t i = 1; i < kNumLevels; ++i) {
+ ASSERT_EQ(0U, indexer->LevelIndexSize(i));
+ }
+ // level 1
+ AddFile(1, 2100, 2200);
+ AddFile(1, 2300, 2400);
+ AddFile(1, 2500, 2600);
+ // level 3
+ AddFile(3, 500, 600);
+ AddFile(3, 501, 699);
+ AddFile(3, 700, 800);
+ indexer->UpdateIndex(&arena, kNumLevels, files);
+ for (uint32_t f = 0; f < 3; ++f) {
+ GetNextLevelIndex(1, f, -1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(1, f, 0, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(1, f, 1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(1, f, 1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(1, f, 1, 0, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(1, f, 1, 1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ }
+ delete indexer;
+ ClearFiles();
+}
+
+// Case 4: mixed
+TEST_F(FileIndexerTest, mixed) {
+ Arena arena;
+ indexer = new FileIndexer(&ucmp);
+ // level 1
+ AddFile(1, 100, 200);
+ AddFile(1, 250, 400);
+ AddFile(1, 450, 500);
+ // level 2
+ AddFile(2, 100, 150); // 0
+ AddFile(2, 200, 250); // 1
+ AddFile(2, 251, 300); // 2
+ AddFile(2, 301, 350); // 3
+ AddFile(2, 500, 600); // 4
+ // level 3
+ AddFile(3, 0, 50);
+ AddFile(3, 100, 200);
+ AddFile(3, 201, 250);
+ indexer->UpdateIndex(&arena, kNumLevels, files);
+ // level 1, 0
+ GetNextLevelIndex(1, 0, -1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(0, right);
+ GetNextLevelIndex(1, 0, 0, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(0, right);
+ GetNextLevelIndex(1, 0, 1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(1, 0, 1, 0, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(1, 0, 1, 1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(4, right);
+ // level 1, 1
+ GetNextLevelIndex(1, 1, -1, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(1, 1, 0, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(1, 1, 1, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(3, right);
+ GetNextLevelIndex(1, 1, 1, 0, &left, &right);
+ ASSERT_EQ(4, left);
+ ASSERT_EQ(3, right);
+ GetNextLevelIndex(1, 1, 1, 1, &left, &right);
+ ASSERT_EQ(4, left);
+ ASSERT_EQ(4, right);
+ // level 1, 2
+ GetNextLevelIndex(1, 2, -1, -1, &left, &right);
+ ASSERT_EQ(4, left);
+ ASSERT_EQ(3, right);
+ GetNextLevelIndex(1, 2, 0, -1, &left, &right);
+ ASSERT_EQ(4, left);
+ ASSERT_EQ(3, right);
+ GetNextLevelIndex(1, 2, 1, -1, &left, &right);
+ ASSERT_EQ(4, left);
+ ASSERT_EQ(4, right);
+ GetNextLevelIndex(1, 2, 1, 0, &left, &right);
+ ASSERT_EQ(4, left);
+ ASSERT_EQ(4, right);
+ GetNextLevelIndex(1, 2, 1, 1, &left, &right);
+ ASSERT_EQ(4, left);
+ ASSERT_EQ(4, right);
+ // level 2, 0
+ GetNextLevelIndex(2, 0, -1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(2, 0, 0, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(2, 0, 1, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(2, 0, 1, 0, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(2, 0, 1, 1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(2, right);
+ // level 2, 1
+ GetNextLevelIndex(2, 1, -1, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(2, 1, 0, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(2, 1, 1, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(2, 1, 1, 0, &left, &right);
+ ASSERT_EQ(2, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(2, 1, 1, 1, &left, &right);
+ ASSERT_EQ(2, left);
+ ASSERT_EQ(2, right);
+ // level 2, [2 - 4], no overlap
+ for (uint32_t f = 2; f <= 4; ++f) {
+ GetNextLevelIndex(2, f, -1, -1, &left, &right);
+ ASSERT_EQ(f == 2 ? 2 : 3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(2, f, 0, -1, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(2, f, 1, -1, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(2, f, 1, 0, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(2, f, 1, 1, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ }
+ delete indexer;
+ ClearFiles();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/filename_test.cc b/src/rocksdb/db/filename_test.cc
new file mode 100644
index 000000000..9a04542f6
--- /dev/null
+++ b/src/rocksdb/db/filename_test.cc
@@ -0,0 +1,180 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/filename.h"
+
+#include "db/dbformat.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class FileNameTest : public testing::Test {};
+
+TEST_F(FileNameTest, Parse) {
+ Slice db;
+ FileType type;
+ uint64_t number;
+
+ char kDefautInfoLogDir = 1;
+ char kDifferentInfoLogDir = 2;
+ char kNoCheckLogDir = 4;
+ char kAllMode = kDefautInfoLogDir | kDifferentInfoLogDir | kNoCheckLogDir;
+
+ // Successful parses
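+ // Each case carries a bitmask of the parsing modes under which it is
+ // expected to parse successfully; kAllMode means every mode accepts it.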
+ static struct {
+ const char* fname;
+ uint64_t number;
+ FileType type;
+ char mode;
+ } cases[] = {
+ {"100.log", 100, kLogFile, kAllMode},
+ {"0.log", 0, kLogFile, kAllMode},
+ {"0.sst", 0, kTableFile, kAllMode},
+ {"CURRENT", 0, kCurrentFile, kAllMode},
+ {"LOCK", 0, kDBLockFile, kAllMode},
+ {"MANIFEST-2", 2, kDescriptorFile, kAllMode},
+ {"MANIFEST-7", 7, kDescriptorFile, kAllMode},
+ {"METADB-2", 2, kMetaDatabase, kAllMode},
+ {"METADB-7", 7, kMetaDatabase, kAllMode},
+ {"LOG", 0, kInfoLogFile, kDefautInfoLogDir},
+ {"LOG.old", 0, kInfoLogFile, kDefautInfoLogDir},
+ {"LOG.old.6688", 6688, kInfoLogFile, kDefautInfoLogDir},
+ {"rocksdb_dir_LOG", 0, kInfoLogFile, kDifferentInfoLogDir},
+ {"rocksdb_dir_LOG.old", 0, kInfoLogFile, kDifferentInfoLogDir},
+ {"rocksdb_dir_LOG.old.6688", 6688, kInfoLogFile, kDifferentInfoLogDir},
+ {"18446744073709551615.log", 18446744073709551615ull, kLogFile,
+ kAllMode},
+ };
+ for (char mode : {kDifferentInfoLogDir, kDefautInfoLogDir, kNoCheckLogDir}) {
+ for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
+ InfoLogPrefix info_log_prefix(mode != kDefautInfoLogDir, "/rocksdb/dir");
+ if (cases[i].mode & mode) {
+ std::string f = cases[i].fname;
+ if (mode == kNoCheckLogDir) {
+ ASSERT_TRUE(ParseFileName(f, &number, &type)) << f;
+ } else {
+ ASSERT_TRUE(ParseFileName(f, &number, info_log_prefix.prefix, &type))
+ << f;
+ }
+ ASSERT_EQ(cases[i].type, type) << f;
+ ASSERT_EQ(cases[i].number, number) << f;
+ }
+ }
+ }
+
+ // Errors
+ static const char* errors[] = {
+ "",
+ "foo",
+ "foo-dx-100.log",
+ ".log",
+ "",
+ "manifest",
+ "CURREN",
+ "CURRENTX",
+ "MANIFES",
+ "MANIFEST",
+ "MANIFEST-",
+ "XMANIFEST-3",
+ "MANIFEST-3x",
+ "META",
+ "METADB",
+ "METADB-",
+ "XMETADB-3",
+ "METADB-3x",
+ "LOC",
+ "LOCKx",
+ "LO",
+ "LOGx",
+ "18446744073709551616.log",
+ "184467440737095516150.log",
+ "100",
+ "100.",
+ "100.lop"
+ };
+ for (unsigned int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
+ std::string f = errors[i];
+ ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f;
+ }
+}
+
+TEST_F(FileNameTest, InfoLogFileName) {
+ std::string dbname = ("/data/rocksdb");
+ std::string db_absolute_path;
+ Env::Default()->GetAbsolutePath(dbname, &db_absolute_path);
+
+ ASSERT_EQ("/data/rocksdb/LOG", InfoLogFileName(dbname, db_absolute_path, ""));
+ ASSERT_EQ("/data/rocksdb/LOG.old.666",
+ OldInfoLogFileName(dbname, 666u, db_absolute_path, ""));
+
+ ASSERT_EQ("/data/rocksdb_log/data_rocksdb_LOG",
+ InfoLogFileName(dbname, db_absolute_path, "/data/rocksdb_log"));
+ ASSERT_EQ(
+ "/data/rocksdb_log/data_rocksdb_LOG.old.666",
+ OldInfoLogFileName(dbname, 666u, db_absolute_path, "/data/rocksdb_log"));
+}
+
+TEST_F(FileNameTest, Construction) {
+ uint64_t number;
+ FileType type;
+ std::string fname;
+
+ fname = CurrentFileName("foo");
+ ASSERT_EQ("foo/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(0U, number);
+ ASSERT_EQ(kCurrentFile, type);
+
+ fname = LockFileName("foo");
+ ASSERT_EQ("foo/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(0U, number);
+ ASSERT_EQ(kDBLockFile, type);
+
+ fname = LogFileName("foo", 192);
+ ASSERT_EQ("foo/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(192U, number);
+ ASSERT_EQ(kLogFile, type);
+
+ fname = TableFileName({DbPath("bar", 0)}, 200, 0);
+ std::string fname1 =
+ TableFileName({DbPath("foo", 0), DbPath("bar", 0)}, 200, 1);
+ ASSERT_EQ(fname, fname1);
+ ASSERT_EQ("bar/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(200U, number);
+ ASSERT_EQ(kTableFile, type);
+
+ fname = DescriptorFileName("bar", 100);
+ ASSERT_EQ("bar/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(100U, number);
+ ASSERT_EQ(kDescriptorFile, type);
+
+ fname = TempFileName("tmp", 999);
+ ASSERT_EQ("tmp/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(999U, number);
+ ASSERT_EQ(kTempFile, type);
+
+ fname = MetaDatabaseName("met", 100);
+ ASSERT_EQ("met/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(100U, number);
+ ASSERT_EQ(kMetaDatabase, type);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/flush_job.cc b/src/rocksdb/db/flush_job.cc
new file mode 100644
index 000000000..997bd8080
--- /dev/null
+++ b/src/rocksdb/db/flush_job.cc
@@ -0,0 +1,466 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/flush_job.h"
+
+#include <cinttypes>
+
+#include <algorithm>
+#include <vector>
+
+#include "db/builder.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/event_helpers.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/memtable_list.h"
+#include "db/merge_context.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/version_set.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "logging/event_logger.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/merging_iterator.h"
+#include "table/table_builder.h"
+#include "table/two_level_iterator.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/mutexlock.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const char* GetFlushReasonString(FlushReason flush_reason) {
+ switch (flush_reason) {
+ case FlushReason::kOthers:
+ return "Other Reasons";
+ case FlushReason::kGetLiveFiles:
+ return "Get Live Files";
+ case FlushReason::kShutDown:
+ return "Shut down";
+ case FlushReason::kExternalFileIngestion:
+ return "External File Ingestion";
+ case FlushReason::kManualCompaction:
+ return "Manual Compaction";
+ case FlushReason::kWriteBufferManager:
+ return "Write Buffer Manager";
+ case FlushReason::kWriteBufferFull:
+ return "Write Buffer Full";
+ case FlushReason::kTest:
+ return "Test";
+ case FlushReason::kDeleteFiles:
+ return "Delete Files";
+ case FlushReason::kAutoCompaction:
+ return "Auto Compaction";
+ case FlushReason::kManualFlush:
+ return "Manual Flush";
+ case FlushReason::kErrorRecovery:
+ return "Error Recovery";
+ default:
+ return "Invalid";
+ }
+}
+
+FlushJob::FlushJob(const std::string& dbname, ColumnFamilyData* cfd,
+ const ImmutableDBOptions& db_options,
+ const MutableCFOptions& mutable_cf_options,
+ const uint64_t* max_memtable_id,
+ const FileOptions& file_options, VersionSet* versions,
+ InstrumentedMutex* db_mutex,
+ std::atomic<bool>* shutting_down,
+ std::vector<SequenceNumber> existing_snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SnapshotChecker* snapshot_checker, JobContext* job_context,
+ LogBuffer* log_buffer, Directory* db_directory,
+ Directory* output_file_directory,
+ CompressionType output_compression, Statistics* stats,
+ EventLogger* event_logger, bool measure_io_stats,
+ const bool sync_output_directory, const bool write_manifest,
+ Env::Priority thread_pri)
+ : dbname_(dbname),
+ cfd_(cfd),
+ db_options_(db_options),
+ mutable_cf_options_(mutable_cf_options),
+ max_memtable_id_(max_memtable_id),
+ file_options_(file_options),
+ versions_(versions),
+ db_mutex_(db_mutex),
+ shutting_down_(shutting_down),
+ existing_snapshots_(std::move(existing_snapshots)),
+ earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
+ snapshot_checker_(snapshot_checker),
+ job_context_(job_context),
+ log_buffer_(log_buffer),
+ db_directory_(db_directory),
+ output_file_directory_(output_file_directory),
+ output_compression_(output_compression),
+ stats_(stats),
+ event_logger_(event_logger),
+ measure_io_stats_(measure_io_stats),
+ sync_output_directory_(sync_output_directory),
+ write_manifest_(write_manifest),
+ edit_(nullptr),
+ base_(nullptr),
+ pick_memtable_called(false),
+ thread_pri_(thread_pri) {
+ // Update the thread status to indicate flush.
+ ReportStartedFlush();
+ TEST_SYNC_POINT("FlushJob::FlushJob()");
+}
+
+FlushJob::~FlushJob() {
+ ThreadStatusUtil::ResetThreadStatus();
+}
+
+void FlushJob::ReportStartedFlush() {
+ ThreadStatusUtil::SetColumnFamily(cfd_, cfd_->ioptions()->env,
+ db_options_.enable_thread_tracking);
+ ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_FLUSH);
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_JOB_ID,
+ job_context_->job_id);
+ IOSTATS_RESET(bytes_written);
+}
+
+void FlushJob::ReportFlushInputSize(const autovector<MemTable*>& mems) {
+ uint64_t input_size = 0;
+ for (auto* mem : mems) {
+ input_size += mem->ApproximateMemoryUsage();
+ }
+ ThreadStatusUtil::IncreaseThreadOperationProperty(
+ ThreadStatus::FLUSH_BYTES_MEMTABLES,
+ input_size);
+}
+
+void FlushJob::RecordFlushIOStats() {
+ RecordTick(stats_, FLUSH_WRITE_BYTES, IOSTATS(bytes_written));
+ ThreadStatusUtil::IncreaseThreadOperationProperty(
+ ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written));
+ IOSTATS_RESET(bytes_written);
+}
+
+void FlushJob::PickMemTable() {
+ db_mutex_->AssertHeld();
+ assert(!pick_memtable_called);
+ pick_memtable_called = true;
+ // Save the contents of the earliest memtable as a new Table
+ cfd_->imm()->PickMemtablesToFlush(max_memtable_id_, &mems_);
+ if (mems_.empty()) {
+ return;
+ }
+
+ ReportFlushInputSize(mems_);
+
+ // The memtables in `mems_` are (implicitly) sorted in ascending order by
+ // their creation time. We will use the first memtable's `edit` to keep the
+ // meta info for this flush.
+ MemTable* m = mems_[0];
+ edit_ = m->GetEdits();
+ edit_->SetPrevLogNumber(0);
+ // SetLogNumber(log_num) indicates logs with number smaller than log_num
+ // will no longer be picked up for recovery.
+ edit_->SetLogNumber(mems_.back()->GetNextLogNumber());
+ edit_->SetColumnFamily(cfd_->GetID());
+
+ // path 0 for level 0 file.
+ meta_.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
+
+ base_ = cfd_->current();
+ base_->Ref(); // it is likely that we do not need this reference
+}
+
+Status FlushJob::Run(LogsWithPrepTracker* prep_tracker,
+ FileMetaData* file_meta) {
+ TEST_SYNC_POINT("FlushJob::Start");
+ db_mutex_->AssertHeld();
+ assert(pick_memtable_called);
+ AutoThreadOperationStageUpdater stage_run(
+ ThreadStatus::STAGE_FLUSH_RUN);
+ if (mems_.empty()) {
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Nothing in memtable to flush",
+ cfd_->GetName().c_str());
+ return Status::OK();
+ }
+
+ // I/O measurement variables
+ PerfLevel prev_perf_level = PerfLevel::kEnableTime;
+ uint64_t prev_write_nanos = 0;
+ uint64_t prev_fsync_nanos = 0;
+ uint64_t prev_range_sync_nanos = 0;
+ uint64_t prev_prepare_write_nanos = 0;
+ uint64_t prev_cpu_write_nanos = 0;
+ uint64_t prev_cpu_read_nanos = 0;
+ if (measure_io_stats_) {
+ prev_perf_level = GetPerfLevel();
+ SetPerfLevel(PerfLevel::kEnableTime);
+ prev_write_nanos = IOSTATS(write_nanos);
+ prev_fsync_nanos = IOSTATS(fsync_nanos);
+ prev_range_sync_nanos = IOSTATS(range_sync_nanos);
+ prev_prepare_write_nanos = IOSTATS(prepare_write_nanos);
+ prev_cpu_write_nanos = IOSTATS(cpu_write_nanos);
+ prev_cpu_read_nanos = IOSTATS(cpu_read_nanos);
+ }
+
+ // This will release and re-acquire the mutex.
+ Status s = WriteLevel0Table();
+
+ if (s.ok() && cfd_->IsDropped()) {
+ s = Status::ColumnFamilyDropped("Column family dropped during compaction");
+ }
+ if ((s.ok() || s.IsColumnFamilyDropped()) &&
+ shutting_down_->load(std::memory_order_acquire)) {
+ s = Status::ShutdownInProgress("Database shutdown");
+ }
+
+ if (!s.ok()) {
+ cfd_->imm()->RollbackMemtableFlush(mems_, meta_.fd.GetNumber());
+ } else if (write_manifest_) {
+ TEST_SYNC_POINT("FlushJob::InstallResults");
+ // Replace immutable memtable with the generated Table
+ s = cfd_->imm()->TryInstallMemtableFlushResults(
+ cfd_, mutable_cf_options_, mems_, prep_tracker, versions_, db_mutex_,
+ meta_.fd.GetNumber(), &job_context_->memtables_to_free, db_directory_,
+ log_buffer_, &committed_flush_jobs_info_);
+ }
+
+ if (s.ok() && file_meta != nullptr) {
+ *file_meta = meta_;
+ }
+ RecordFlushIOStats();
+
+ // When measure_io_stats_ is true, the default 512 bytes is not enough.
+ auto stream = event_logger_->LogToBuffer(log_buffer_, 1024);
+ stream << "job" << job_context_->job_id << "event"
+ << "flush_finished";
+ stream << "output_compression"
+ << CompressionTypeToString(output_compression_);
+ stream << "lsm_state";
+ stream.StartArray();
+ auto vstorage = cfd_->current()->storage_info();
+ for (int level = 0; level < vstorage->num_levels(); ++level) {
+ stream << vstorage->NumLevelFiles(level);
+ }
+ stream.EndArray();
+ stream << "immutable_memtables" << cfd_->imm()->NumNotFlushed();
+
+ if (measure_io_stats_) {
+ if (prev_perf_level != PerfLevel::kEnableTime) {
+ SetPerfLevel(prev_perf_level);
+ }
+ stream << "file_write_nanos" << (IOSTATS(write_nanos) - prev_write_nanos);
+ stream << "file_range_sync_nanos"
+ << (IOSTATS(range_sync_nanos) - prev_range_sync_nanos);
+ stream << "file_fsync_nanos" << (IOSTATS(fsync_nanos) - prev_fsync_nanos);
+ stream << "file_prepare_write_nanos"
+ << (IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos);
+ stream << "file_cpu_write_nanos"
+ << (IOSTATS(cpu_write_nanos) - prev_cpu_write_nanos);
+ stream << "file_cpu_read_nanos"
+ << (IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos);
+ }
+
+ return s;
+}
+
+void FlushJob::Cancel() {
+ db_mutex_->AssertHeld();
+ assert(base_ != nullptr);
+ base_->Unref();
+}
+
+Status FlushJob::WriteLevel0Table() {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_FLUSH_WRITE_L0);
+ db_mutex_->AssertHeld();
+ const uint64_t start_micros = db_options_.env->NowMicros();
+ const uint64_t start_cpu_micros = db_options_.env->NowCPUNanos() / 1000;
+ Status s;
+ {
+ auto write_hint = cfd_->CalculateSSTWriteHint(0);
+ db_mutex_->Unlock();
+ if (log_buffer_) {
+ log_buffer_->FlushBufferToLog();
+ }
+ // memtables and range_del_iters store internal iterators over each data
+ // memtable and its associated range deletion memtable, respectively, at
+ // corresponding indexes.
+ std::vector<InternalIterator*> memtables;
+ std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ range_del_iters;
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ Arena arena;
+ uint64_t total_num_entries = 0, total_num_deletes = 0;
+ uint64_t total_data_size = 0;
+ size_t total_memory_usage = 0;
+ for (MemTable* m : mems_) {
+ ROCKS_LOG_INFO(
+ db_options_.info_log,
+ "[%s] [JOB %d] Flushing memtable with next log file: %" PRIu64 "\n",
+ cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber());
+ memtables.push_back(m->NewIterator(ro, &arena));
+ auto* range_del_iter =
+ m->NewRangeTombstoneIterator(ro, kMaxSequenceNumber);
+ if (range_del_iter != nullptr) {
+ range_del_iters.emplace_back(range_del_iter);
+ }
+ total_num_entries += m->num_entries();
+ total_num_deletes += m->num_deletes();
+ total_data_size += m->get_data_size();
+ total_memory_usage += m->ApproximateMemoryUsage();
+ }
+
+ event_logger_->Log() << "job" << job_context_->job_id << "event"
+ << "flush_started"
+ << "num_memtables" << mems_.size() << "num_entries"
+ << total_num_entries << "num_deletes"
+ << total_num_deletes << "total_data_size"
+ << total_data_size << "memory_usage"
+ << total_memory_usage << "flush_reason"
+ << GetFlushReasonString(cfd_->GetFlushReason());
+
+ {
+ ScopedArenaIterator iter(
+ NewMergingIterator(&cfd_->internal_comparator(), &memtables[0],
+ static_cast<int>(memtables.size()), &arena));
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": started",
+ cfd_->GetName().c_str(), job_context_->job_id,
+ meta_.fd.GetNumber());
+
+ TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:output_compression",
+ &output_compression_);
+ int64_t _current_time = 0;
+ auto status = db_options_.env->GetCurrentTime(&_current_time);
+ // It is safe to proceed even if GetCurrentTime fails, so just log a
+ // warning and continue.
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "Failed to get current time to populate creation_time property. "
+ "Status: %s",
+ status.ToString().c_str());
+ }
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+
+ uint64_t oldest_key_time =
+ mems_.front()->ApproximateOldestKeyTime();
+
+ // It's not clear whether oldest_key_time is always available. In case
+ // it is not available, use current_time.
+ meta_.oldest_ancester_time = std::min(current_time, oldest_key_time);
+ meta_.file_creation_time = current_time;
+
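+ // For tables created by flush, FIFO compaction uses the wall-clock
+ // creation time as the table's creation_time property; other compaction
+ // styles use the oldest ancestor time computed above.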
+ uint64_t creation_time = (cfd_->ioptions()->compaction_style ==
+ CompactionStyle::kCompactionStyleFIFO)
+ ? current_time
+ : meta_.oldest_ancester_time;
+
+ s = BuildTable(
+ dbname_, db_options_.env, db_options_.fs.get(), *cfd_->ioptions(),
+ mutable_cf_options_, file_options_, cfd_->table_cache(), iter.get(),
+ std::move(range_del_iters), &meta_, cfd_->internal_comparator(),
+ cfd_->int_tbl_prop_collector_factories(), cfd_->GetID(),
+ cfd_->GetName(), existing_snapshots_,
+ earliest_write_conflict_snapshot_, snapshot_checker_,
+ output_compression_, mutable_cf_options_.sample_for_compression,
+ cfd_->ioptions()->compression_opts,
+ mutable_cf_options_.paranoid_file_checks, cfd_->internal_stats(),
+ TableFileCreationReason::kFlush, event_logger_, job_context_->job_id,
+ Env::IO_HIGH, &table_properties_, 0 /* level */,
+ creation_time, oldest_key_time, write_hint, current_time);
+ LogFlush(db_options_.info_log);
+ }
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": %" PRIu64
+ " bytes %s"
+ "%s",
+ cfd_->GetName().c_str(), job_context_->job_id,
+ meta_.fd.GetNumber(), meta_.fd.GetFileSize(),
+ s.ToString().c_str(),
+ meta_.marked_for_compaction ? " (needs compaction)" : "");
+
+ if (s.ok() && output_file_directory_ != nullptr && sync_output_directory_) {
+ s = output_file_directory_->Fsync();
+ }
+ TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table", &mems_);
+ db_mutex_->Lock();
+ }
+ base_->Unref();
+
+ // Note that if file_size is zero, the file has been deleted and
+ // should not be added to the manifest.
+ if (s.ok() && meta_.fd.GetFileSize() > 0) {
+ // if we have more than 1 background thread, then we cannot
+ // insert files directly into higher levels because some other
+ // threads could be concurrently producing compacted files for
+ // that key range.
+ // Add file to L0
+ edit_->AddFile(0 /* level */, meta_.fd.GetNumber(), meta_.fd.GetPathId(),
+ meta_.fd.GetFileSize(), meta_.smallest, meta_.largest,
+ meta_.fd.smallest_seqno, meta_.fd.largest_seqno,
+ meta_.marked_for_compaction, meta_.oldest_blob_file_number,
+ meta_.oldest_ancester_time, meta_.file_creation_time,
+ meta_.file_checksum, meta_.file_checksum_func_name);
+ }
+#ifndef ROCKSDB_LITE
+ // Piggyback FlushJobInfo on the first flushed memtable.
+ mems_[0]->SetFlushJobInfo(GetFlushJobInfo());
+#endif // !ROCKSDB_LITE
+
+ // Note that here we treat flush as level 0 compaction in internal stats
+ InternalStats::CompactionStats stats(CompactionReason::kFlush, 1);
+ stats.micros = db_options_.env->NowMicros() - start_micros;
+ stats.cpu_micros = db_options_.env->NowCPUNanos() / 1000 - start_cpu_micros;
+ stats.bytes_written = meta_.fd.GetFileSize();
+ RecordTimeToHistogram(stats_, FLUSH_TIME, stats.micros);
+ cfd_->internal_stats()->AddCompactionStats(0 /* level */, thread_pri_, stats);
+ cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED,
+ meta_.fd.GetFileSize());
+ RecordFlushIOStats();
+ return s;
+}
+
+#ifndef ROCKSDB_LITE
+std::unique_ptr<FlushJobInfo> FlushJob::GetFlushJobInfo() const {
+ db_mutex_->AssertHeld();
+ std::unique_ptr<FlushJobInfo> info(new FlushJobInfo{});
+ info->cf_id = cfd_->GetID();
+ info->cf_name = cfd_->GetName();
+
+ const uint64_t file_number = meta_.fd.GetNumber();
+ info->file_path =
+ MakeTableFileName(cfd_->ioptions()->cf_paths[0].path, file_number);
+ info->file_number = file_number;
+ info->oldest_blob_file_number = meta_.oldest_blob_file_number;
+ info->thread_id = db_options_.env->GetThreadID();
+ info->job_id = job_context_->job_id;
+ info->smallest_seqno = meta_.fd.smallest_seqno;
+ info->largest_seqno = meta_.fd.largest_seqno;
+ info->table_properties = table_properties_;
+ info->flush_reason = cfd_->GetFlushReason();
+ return info;
+}
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/flush_job.h b/src/rocksdb/db/flush_job.h
new file mode 100644
index 000000000..1f4435f4c
--- /dev/null
+++ b/src/rocksdb/db/flush_job.h
@@ -0,0 +1,158 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <limits>
+#include <list>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/dbformat.h"
+#include "db/flush_scheduler.h"
+#include "db/internal_stats.h"
+#include "db/job_context.h"
+#include "db/log_writer.h"
+#include "db/logs_with_prep_tracker.h"
+#include "db/memtable_list.h"
+#include "db/snapshot_impl.h"
+#include "db/version_edit.h"
+#include "db/write_controller.h"
+#include "db/write_thread.h"
+#include "logging/event_logger.h"
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/transaction_log.h"
+#include "table/scoped_arena_iterator.h"
+#include "util/autovector.h"
+#include "util/stop_watch.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+class MemTable;
+class SnapshotChecker;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+class Arena;
+
+class FlushJob {
+ public:
+ // TODO(icanadi) make effort to reduce number of parameters here
+ // IMPORTANT: mutable_cf_options needs to be alive while FlushJob is alive
+ FlushJob(const std::string& dbname, ColumnFamilyData* cfd,
+ const ImmutableDBOptions& db_options,
+ const MutableCFOptions& mutable_cf_options,
+ const uint64_t* max_memtable_id, const FileOptions& file_options,
+ VersionSet* versions, InstrumentedMutex* db_mutex,
+ std::atomic<bool>* shutting_down,
+ std::vector<SequenceNumber> existing_snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SnapshotChecker* snapshot_checker, JobContext* job_context,
+ LogBuffer* log_buffer, Directory* db_directory,
+ Directory* output_file_directory, CompressionType output_compression,
+ Statistics* stats, EventLogger* event_logger, bool measure_io_stats,
+ const bool sync_output_directory, const bool write_manifest,
+ Env::Priority thread_pri);
+
+ ~FlushJob();
+
+ // Require db_mutex held.
+ // Once PickMemTable() is called, either Run() or Cancel() has to be called.
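+ //
+ // A typical call sequence (sketch only; mirrors flush_job_test.cc):
+ //   db_mutex->Lock();
+ //   flush_job.PickMemTable();
+ //   Status s = flush_job.Run(/*prep_tracker=*/nullptr, &file_meta);
+ //   db_mutex->Unlock();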
+ void PickMemTable();
+ Status Run(LogsWithPrepTracker* prep_tracker = nullptr,
+ FileMetaData* file_meta = nullptr);
+ void Cancel();
+ const autovector<MemTable*>& GetMemTables() const { return mems_; }
+
+#ifndef ROCKSDB_LITE
+ std::list<std::unique_ptr<FlushJobInfo>>* GetCommittedFlushJobsInfo() {
+ return &committed_flush_jobs_info_;
+ }
+#endif // !ROCKSDB_LITE
+
+ private:
+ void ReportStartedFlush();
+ void ReportFlushInputSize(const autovector<MemTable*>& mems);
+ void RecordFlushIOStats();
+ Status WriteLevel0Table();
+#ifndef ROCKSDB_LITE
+ std::unique_ptr<FlushJobInfo> GetFlushJobInfo() const;
+#endif // !ROCKSDB_LITE
+
+ const std::string& dbname_;
+ ColumnFamilyData* cfd_;
+ const ImmutableDBOptions& db_options_;
+ const MutableCFOptions& mutable_cf_options_;
+ // Pointer to a variable storing the largest memtable id to flush in this
+ // flush job. RocksDB uses this variable to select the memtables to flush in
+ // this job. All memtables in this column family with an ID smaller than or
+ // equal to *max_memtable_id_ will be selected for flush. If null, then all
+ // memtables in the column family will be selected.
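+ // For example, if the immutable memtables have IDs {3, 4, 5} and
+ // *max_memtable_id_ == 4, only memtables 3 and 4 are selected.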
+ const uint64_t* max_memtable_id_;
+ const FileOptions file_options_;
+ VersionSet* versions_;
+ InstrumentedMutex* db_mutex_;
+ std::atomic<bool>* shutting_down_;
+ std::vector<SequenceNumber> existing_snapshots_;
+ SequenceNumber earliest_write_conflict_snapshot_;
+ SnapshotChecker* snapshot_checker_;
+ JobContext* job_context_;
+ LogBuffer* log_buffer_;
+ Directory* db_directory_;
+ Directory* output_file_directory_;
+ CompressionType output_compression_;
+ Statistics* stats_;
+ EventLogger* event_logger_;
+ TableProperties table_properties_;
+ bool measure_io_stats_;
+ // True if this flush job should call fsync on the output directory. False
+ // otherwise.
+ // Usually sync_output_directory_ is true. A flush job needs to call sync on
+ // the output directory before committing to the MANIFEST.
+ // However, an individual flush job does not have to call sync on the output
+ // directory if it is part of an atomic flush. After all flush jobs in the
+ // atomic flush succeed, call sync once on each distinct output directory.
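+ // (See FlushJobTest::FlushMemtablesMultipleColumnFamilies, where each job
+ // of the simulated atomic flush is built with sync_output_directory =
+ // false and write_manifest = false.)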
+ const bool sync_output_directory_;
+ // True if this flush job should write to MANIFEST after successfully
+ // flushing memtables. False otherwise.
+ // Usually write_manifest_ is true. A flush job commits to the MANIFEST after
+ // flushing the memtables.
+ // However, an individual flush job cannot rashly write to the MANIFEST
+ // immediately after it finishes the flush if it is part of an atomic flush.
+ // In this case, only after all flush jobs succeed in flush can RocksDB
+ // commit to the MANIFEST.
+ const bool write_manifest_;
+ // The current flush job can commit flush result of a concurrent flush job.
+ // We collect FlushJobInfo of all jobs committed by current job and fire
+ // OnFlushCompleted for them.
+ std::list<std::unique_ptr<FlushJobInfo>> committed_flush_jobs_info_;
+
+ // Variables below are set by PickMemTable():
+ FileMetaData meta_;
+ autovector<MemTable*> mems_;
+ VersionEdit* edit_;
+ Version* base_;
+ bool pick_memtable_called;
+ Env::Priority thread_pri_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/flush_job_test.cc b/src/rocksdb/db/flush_job_test.cc
new file mode 100644
index 000000000..b77a4a2a9
--- /dev/null
+++ b/src/rocksdb/db/flush_job_test.cc
@@ -0,0 +1,498 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <algorithm>
+#include <array>
+#include <map>
+#include <string>
+
+#include "db/blob_index.h"
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/flush_job.h"
+#include "db/version_set.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/mock_table.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO(icanadi) Mock out everything else:
+// 1. VersionSet
+// 2. Memtable
+class FlushJobTest : public testing::Test {
+ public:
+ FlushJobTest()
+ : env_(Env::Default()),
+ fs_(std::make_shared<LegacyFileSystemWrapper>(env_)),
+ dbname_(test::PerThreadDBPath("flush_job_test")),
+ options_(),
+ db_options_(options_),
+ column_family_names_({kDefaultColumnFamilyName, "foo", "bar"}),
+ table_cache_(NewLRUCache(50000, 16)),
+ write_buffer_manager_(db_options_.db_write_buffer_size),
+ shutting_down_(false),
+ mock_table_factory_(new mock::MockTableFactory()) {
+ EXPECT_OK(env_->CreateDirIfMissing(dbname_));
+ db_options_.db_paths.emplace_back(dbname_,
+ std::numeric_limits<uint64_t>::max());
+ db_options_.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ // TODO(icanadi) Remove this once we mock out VersionSet
+ NewDB();
+ std::vector<ColumnFamilyDescriptor> column_families;
+ cf_options_.table_factory = mock_table_factory_;
+ for (const auto& cf_name : column_family_names_) {
+ column_families.emplace_back(cf_name, cf_options_);
+ }
+
+ db_options_.env = env_;
+ db_options_.fs = fs_;
+ versions_.reset(new VersionSet(dbname_, &db_options_, env_options_,
+ table_cache_.get(), &write_buffer_manager_,
+ &write_controller_,
+ /*block_cache_tracer=*/nullptr));
+ EXPECT_OK(versions_->Recover(column_families, false));
+ }
+
+ void NewDB() {
+ SetIdentityFile(env_, dbname_);
+ VersionEdit new_db;
+ if (db_options_.write_dbid_to_manifest) {
+ DBImpl* impl = new DBImpl(DBOptions(), dbname_);
+ std::string db_id;
+ impl->GetDbIdentityFromIdentityFile(&db_id);
+ new_db.SetDBId(db_id);
+ }
+ new_db.SetLogNumber(0);
+ new_db.SetNextFile(2);
+ new_db.SetLastSequence(0);
+
+ autovector<VersionEdit> new_cfs;
+ SequenceNumber last_seq = 1;
+ uint32_t cf_id = 1;
+ for (size_t i = 1; i != column_family_names_.size(); ++i) {
+ VersionEdit new_cf;
+ new_cf.AddColumnFamily(column_family_names_[i]);
+ new_cf.SetColumnFamily(cf_id++);
+ new_cf.SetLogNumber(0);
+ new_cf.SetNextFile(2);
+ new_cf.SetLastSequence(last_seq++);
+ new_cfs.emplace_back(new_cf);
+ }
+
+ const std::string manifest = DescriptorFileName(dbname_, 1);
+ std::unique_ptr<WritableFile> file;
+ Status s = env_->NewWritableFile(
+ manifest, &file, env_->OptimizeForManifestWrite(env_options_));
+ ASSERT_OK(s);
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ NewLegacyWritableFileWrapper(std::move(file)), manifest, EnvOptions()));
+ {
+ log::Writer log(std::move(file_writer), 0, false);
+ std::string record;
+ new_db.EncodeTo(&record);
+ s = log.AddRecord(record);
+
+ for (const auto& e : new_cfs) {
+ record.clear();
+ e.EncodeTo(&record);
+ s = log.AddRecord(record);
+ ASSERT_OK(s);
+ }
+ }
+ ASSERT_OK(s);
+ // Make "CURRENT" file that points to the new manifest file.
+ s = SetCurrentFile(env_, dbname_, 1, nullptr);
+ }
+
+ Env* env_;
+ std::shared_ptr<FileSystem> fs_;
+ std::string dbname_;
+ EnvOptions env_options_;
+ Options options_;
+ ImmutableDBOptions db_options_;
+ const std::vector<std::string> column_family_names_;
+ std::shared_ptr<Cache> table_cache_;
+ WriteController write_controller_;
+ WriteBufferManager write_buffer_manager_;
+ ColumnFamilyOptions cf_options_;
+ std::unique_ptr<VersionSet> versions_;
+ InstrumentedMutex mutex_;
+ std::atomic<bool> shutting_down_;
+ std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
+};
+
+TEST_F(FlushJobTest, Empty) {
+ JobContext job_context(0);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ EventLogger event_logger(db_options_.info_log.get());
+ SnapshotChecker* snapshot_checker = nullptr; // not relevant
+ FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
+ db_options_, *cfd->GetLatestMutableCFOptions(),
+ nullptr /* memtable_id */, env_options_, versions_.get(),
+ &mutex_, &shutting_down_, {}, kMaxSequenceNumber,
+ snapshot_checker, &job_context, nullptr, nullptr, nullptr,
+ kNoCompression, nullptr, &event_logger, false,
+ true /* sync_output_directory */,
+ true /* write_manifest */, Env::Priority::USER);
+ {
+ InstrumentedMutexLock l(&mutex_);
+ flush_job.PickMemTable();
+ ASSERT_OK(flush_job.Run());
+ }
+ job_context.Clean();
+}
+
+TEST_F(FlushJobTest, NonEmpty) {
+ JobContext job_context(0);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ auto new_mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ kMaxSequenceNumber);
+ new_mem->Ref();
+ auto inserted_keys = mock::MakeMockFile();
+ // Test data:
+ // seqno [ 1, 2 ... 8998, 8999, 9000, 9001, 9002 ... 9999 ]
+ // key [ 1001, 1002 ... 9998, 9999, 0, 1, 2 ... 999 ]
+ // range-delete "9995" -> "9999" at seqno 10000
+ // blob references with seqnos 10001..10006
+ for (int i = 1; i < 10000; ++i) {
+ std::string key(ToString((i + 1000) % 10000));
+ std::string value("value" + key);
+ new_mem->Add(SequenceNumber(i), kTypeValue, key, value);
+ if ((i + 1000) % 10000 < 9995) {
+ InternalKey internal_key(key, SequenceNumber(i), kTypeValue);
+ inserted_keys.insert({internal_key.Encode().ToString(), value});
+ }
+ }
+
+ {
+ new_mem->Add(SequenceNumber(10000), kTypeRangeDeletion, "9995", "9999a");
+ InternalKey internal_key("9995", SequenceNumber(10000), kTypeRangeDeletion);
+ inserted_keys.insert({internal_key.Encode().ToString(), "9999a"});
+ }
+
+#ifndef ROCKSDB_LITE
+ // Note: the first two blob references will not be considered when resolving
+ // the oldest blob file referenced (the first one is inlined TTL, while the
+ // second one is TTL and thus points to a TTL blob file).
+ constexpr std::array<uint64_t, 6> blob_file_numbers{
+ kInvalidBlobFileNumber, 5, 103, 17, 102, 101};
+ for (size_t i = 0; i < blob_file_numbers.size(); ++i) {
+ std::string key(ToString(i + 10001));
+ std::string blob_index;
+ if (i == 0) {
+ BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 1234567890ULL,
+ "foo");
+ } else if (i == 1) {
+ BlobIndex::EncodeBlobTTL(&blob_index, /* expiration */ 1234567890ULL,
+ blob_file_numbers[i], /* offset */ i << 10,
+ /* size */ i << 20, kNoCompression);
+ } else {
+ BlobIndex::EncodeBlob(&blob_index, blob_file_numbers[i],
+ /* offset */ i << 10, /* size */ i << 20,
+ kNoCompression);
+ }
+
+ const SequenceNumber seq(i + 10001);
+ new_mem->Add(seq, kTypeBlobIndex, key, blob_index);
+
+ InternalKey internal_key(key, seq, kTypeBlobIndex);
+ inserted_keys.emplace_hint(inserted_keys.end(),
+ internal_key.Encode().ToString(), blob_index);
+ }
+#endif
+
+ autovector<MemTable*> to_delete;
+ cfd->imm()->Add(new_mem, &to_delete);
+ for (auto& m : to_delete) {
+ delete m;
+ }
+
+ EventLogger event_logger(db_options_.info_log.get());
+ SnapshotChecker* snapshot_checker = nullptr; // not relevant
+ FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
+ db_options_, *cfd->GetLatestMutableCFOptions(),
+ nullptr /* memtable_id */, env_options_, versions_.get(),
+ &mutex_, &shutting_down_, {}, kMaxSequenceNumber,
+ snapshot_checker, &job_context, nullptr, nullptr, nullptr,
+ kNoCompression, db_options_.statistics.get(),
+ &event_logger, true, true /* sync_output_directory */,
+ true /* write_manifest */, Env::Priority::USER);
+
+ HistogramData hist;
+ FileMetaData file_meta;
+ mutex_.Lock();
+ flush_job.PickMemTable();
+ ASSERT_OK(flush_job.Run(nullptr, &file_meta));
+ mutex_.Unlock();
+ db_options_.statistics->histogramData(FLUSH_TIME, &hist);
+ ASSERT_GT(hist.average, 0.0);
+
+ ASSERT_EQ(ToString(0), file_meta.smallest.user_key().ToString());
+ ASSERT_EQ("9999a", file_meta.largest.user_key().ToString());
+ ASSERT_EQ(1, file_meta.fd.smallest_seqno);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ(10006, file_meta.fd.largest_seqno);
+ ASSERT_EQ(17, file_meta.oldest_blob_file_number);
+#else
+ ASSERT_EQ(10000, file_meta.fd.largest_seqno);
+#endif
+ mock_table_factory_->AssertSingleFile(inserted_keys);
+ job_context.Clean();
+}
+
+TEST_F(FlushJobTest, FlushMemTablesSingleColumnFamily) {
+ const size_t num_mems = 2;
+ const size_t num_mems_to_flush = 1;
+ const size_t num_keys_per_table = 100;
+ JobContext job_context(0);
+ ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault();
+ std::vector<uint64_t> memtable_ids;
+ std::vector<MemTable*> new_mems;
+ for (size_t i = 0; i != num_mems; ++i) {
+ MemTable* mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ kMaxSequenceNumber);
+ mem->SetID(i);
+ mem->Ref();
+ new_mems.emplace_back(mem);
+ memtable_ids.push_back(mem->GetID());
+
+ for (size_t j = 0; j < num_keys_per_table; ++j) {
+ std::string key(ToString(j + i * num_keys_per_table));
+ std::string value("value" + key);
+ mem->Add(SequenceNumber(j + i * num_keys_per_table), kTypeValue, key,
+ value);
+ }
+ }
+
+ autovector<MemTable*> to_delete;
+ for (auto mem : new_mems) {
+ cfd->imm()->Add(mem, &to_delete);
+ }
+
+ EventLogger event_logger(db_options_.info_log.get());
+ SnapshotChecker* snapshot_checker = nullptr; // not relevant
+
+ assert(memtable_ids.size() == num_mems);
+ uint64_t smallest_memtable_id = memtable_ids.front();
+ uint64_t flush_memtable_id = smallest_memtable_id + num_mems_to_flush - 1;
+
+ FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
+ db_options_, *cfd->GetLatestMutableCFOptions(),
+ &flush_memtable_id, env_options_, versions_.get(), &mutex_,
+ &shutting_down_, {}, kMaxSequenceNumber, snapshot_checker,
+ &job_context, nullptr, nullptr, nullptr, kNoCompression,
+ db_options_.statistics.get(), &event_logger, true,
+ true /* sync_output_directory */,
+ true /* write_manifest */, Env::Priority::USER);
+ HistogramData hist;
+ FileMetaData file_meta;
+ mutex_.Lock();
+ flush_job.PickMemTable();
+ ASSERT_OK(flush_job.Run(nullptr /* prep_tracker */, &file_meta));
+ mutex_.Unlock();
+ db_options_.statistics->histogramData(FLUSH_TIME, &hist);
+ ASSERT_GT(hist.average, 0.0);
+
+ ASSERT_EQ(ToString(0), file_meta.smallest.user_key().ToString());
+ ASSERT_EQ("99", file_meta.largest.user_key().ToString());
+ ASSERT_EQ(0, file_meta.fd.smallest_seqno);
+ ASSERT_EQ(SequenceNumber(num_mems_to_flush * num_keys_per_table - 1),
+ file_meta.fd.largest_seqno);
+ ASSERT_EQ(kInvalidBlobFileNumber, file_meta.oldest_blob_file_number);
+
+ for (auto m : to_delete) {
+ delete m;
+ }
+ to_delete.clear();
+ job_context.Clean();
+}
+
+TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) {
+ autovector<ColumnFamilyData*> all_cfds;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ all_cfds.push_back(cfd);
+ }
+ const std::vector<size_t> num_memtables = {2, 1, 3};
+ assert(num_memtables.size() == column_family_names_.size());
+ const size_t num_keys_per_memtable = 1000;
+ JobContext job_context(0);
+ std::vector<uint64_t> memtable_ids;
+ std::vector<SequenceNumber> smallest_seqs;
+ std::vector<SequenceNumber> largest_seqs;
+ autovector<MemTable*> to_delete;
+ SequenceNumber curr_seqno = 0;
+ size_t k = 0;
+ for (auto cfd : all_cfds) {
+ smallest_seqs.push_back(curr_seqno);
+ for (size_t i = 0; i != num_memtables[k]; ++i) {
+ MemTable* mem = cfd->ConstructNewMemtable(
+ *cfd->GetLatestMutableCFOptions(), kMaxSequenceNumber);
+ mem->SetID(i);
+ mem->Ref();
+
+ for (size_t j = 0; j != num_keys_per_memtable; ++j) {
+ std::string key(ToString(j + i * num_keys_per_memtable));
+ std::string value("value" + key);
+ mem->Add(curr_seqno++, kTypeValue, key, value);
+ }
+
+ cfd->imm()->Add(mem, &to_delete);
+ }
+ largest_seqs.push_back(curr_seqno - 1);
+ memtable_ids.push_back(num_memtables[k++] - 1);
+ }
+
+ EventLogger event_logger(db_options_.info_log.get());
+ SnapshotChecker* snapshot_checker = nullptr; // not relevant
+ std::vector<std::unique_ptr<FlushJob>> flush_jobs;
+ k = 0;
+ for (auto cfd : all_cfds) {
+ std::vector<SequenceNumber> snapshot_seqs;
+ flush_jobs.emplace_back(new FlushJob(
+ dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(),
+ &memtable_ids[k], env_options_, versions_.get(), &mutex_,
+ &shutting_down_, snapshot_seqs, kMaxSequenceNumber, snapshot_checker,
+ &job_context, nullptr, nullptr, nullptr, kNoCompression,
+ db_options_.statistics.get(), &event_logger, true,
+ false /* sync_output_directory */, false /* write_manifest */,
+ Env::Priority::USER));
+ k++;
+ }
+ HistogramData hist;
+ std::vector<FileMetaData> file_metas;
+ // Call reserve to avoid auto-resizing
+ file_metas.reserve(flush_jobs.size());
+ mutex_.Lock();
+ for (auto& job : flush_jobs) {
+ job->PickMemTable();
+ }
+ for (auto& job : flush_jobs) {
+ FileMetaData meta;
+ // Run will release and re-acquire mutex
+ ASSERT_OK(job->Run(nullptr /* prep_tracker */, &meta));
+ file_metas.emplace_back(meta);
+ }
+ autovector<FileMetaData*> file_meta_ptrs;
+ for (auto& meta : file_metas) {
+ file_meta_ptrs.push_back(&meta);
+ }
+ autovector<const autovector<MemTable*>*> mems_list;
+ for (size_t i = 0; i != all_cfds.size(); ++i) {
+ const auto& mems = flush_jobs[i]->GetMemTables();
+ mems_list.push_back(&mems);
+ }
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ for (auto cfd : all_cfds) {
+ mutable_cf_options_list.push_back(cfd->GetLatestMutableCFOptions());
+ }
+
+ Status s = InstallMemtableAtomicFlushResults(
+ nullptr /* imm_lists */, all_cfds, mutable_cf_options_list, mems_list,
+ versions_.get(), &mutex_, file_meta_ptrs, &job_context.memtables_to_free,
+ nullptr /* db_directory */, nullptr /* log_buffer */);
+ ASSERT_OK(s);
+
+ mutex_.Unlock();
+ db_options_.statistics->histogramData(FLUSH_TIME, &hist);
+ ASSERT_GT(hist.average, 0.0);
+ k = 0;
+ for (const auto& file_meta : file_metas) {
+ ASSERT_EQ(ToString(0), file_meta.smallest.user_key().ToString());
+ ASSERT_EQ("999", file_meta.largest.user_key()
+ .ToString()); // max key by bytewise comparator
+ ASSERT_EQ(smallest_seqs[k], file_meta.fd.smallest_seqno);
+ ASSERT_EQ(largest_seqs[k], file_meta.fd.largest_seqno);
+ // Verify that imm is empty
+ ASSERT_EQ(std::numeric_limits<uint64_t>::max(),
+ all_cfds[k]->imm()->GetEarliestMemTableID());
+ ASSERT_EQ(0, all_cfds[k]->imm()->GetLatestMemTableID());
+ ++k;
+ }
+
+ for (auto m : to_delete) {
+ delete m;
+ }
+ to_delete.clear();
+ job_context.Clean();
+}
+
+TEST_F(FlushJobTest, Snapshots) {
+ JobContext job_context(0);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ auto new_mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ kMaxSequenceNumber);
+
+ std::set<SequenceNumber> snapshots_set;
+ int keys = 10000;
+ int max_inserts_per_keys = 8;
+
+ Random rnd(301);
+ for (int i = 0; i < keys / 2; ++i) {
+ snapshots_set.insert(rnd.Uniform(keys * (max_inserts_per_keys / 2)) + 1);
+ }
+ // set has already removed the duplicate snapshots
+ std::vector<SequenceNumber> snapshots(snapshots_set.begin(),
+ snapshots_set.end());
+
+ new_mem->Ref();
+ SequenceNumber current_seqno = 0;
+ auto inserted_keys = mock::MakeMockFile();
+ for (int i = 1; i < keys; ++i) {
+ std::string key(ToString(i));
+ int insertions = rnd.Uniform(max_inserts_per_keys);
+ for (int j = 0; j < insertions; ++j) {
+ std::string value(test::RandomHumanReadableString(&rnd, 10));
+ auto seqno = ++current_seqno;
+ new_mem->Add(SequenceNumber(seqno), kTypeValue, key, value);
+ // a key is visible only if:
+ // 1. it's the last one written (j == insertions - 1)
+ // 2. there's a snapshot pointing at it
+ bool visible = (j == insertions - 1) ||
+ (snapshots_set.find(seqno) != snapshots_set.end());
+ if (visible) {
+ InternalKey internal_key(key, seqno, kTypeValue);
+ inserted_keys.insert({internal_key.Encode().ToString(), value});
+ }
+ }
+ }
+
+ autovector<MemTable*> to_delete;
+ cfd->imm()->Add(new_mem, &to_delete);
+ for (auto& m : to_delete) {
+ delete m;
+ }
+
+ EventLogger event_logger(db_options_.info_log.get());
+ SnapshotChecker* snapshot_checker = nullptr; // not relevant
+ FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
+ db_options_, *cfd->GetLatestMutableCFOptions(),
+ nullptr /* memtable_id */, env_options_, versions_.get(),
+ &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber,
+ snapshot_checker, &job_context, nullptr, nullptr, nullptr,
+ kNoCompression, db_options_.statistics.get(),
+ &event_logger, true, true /* sync_output_directory */,
+ true /* write_manifest */, Env::Priority::USER);
+ mutex_.Lock();
+ flush_job.PickMemTable();
+ ASSERT_OK(flush_job.Run());
+ mutex_.Unlock();
+ mock_table_factory_->AssertSingleFile(inserted_keys);
+ HistogramData hist;
+ db_options_.statistics->histogramData(FLUSH_TIME, &hist);
+ ASSERT_GT(hist.average, 0.0);
+ job_context.Clean();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/flush_scheduler.cc b/src/rocksdb/db/flush_scheduler.cc
new file mode 100644
index 000000000..6f4d3e1a5
--- /dev/null
+++ b/src/rocksdb/db/flush_scheduler.cc
@@ -0,0 +1,86 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/flush_scheduler.h"
+
+#include <cassert>
+
+#include "db/column_family.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void FlushScheduler::ScheduleWork(ColumnFamilyData* cfd) {
+#ifndef NDEBUG
+ {
+ std::lock_guard<std::mutex> lock(checking_mutex_);
+ assert(checking_set_.count(cfd) == 0);
+ checking_set_.insert(cfd);
+ }
+#endif // NDEBUG
+ cfd->Ref();
+// Suppress false positive clang analyzer warnings.
+#ifndef __clang_analyzer__
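+ // Treiber-style lock-free push onto an intrusive singly-linked list: the
+ // new node first points at the current head, then the CAS below swings
+ // head_ to the new node, retrying if another thread won the race.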
+ Node* node = new Node{cfd, head_.load(std::memory_order_relaxed)};
+ while (!head_.compare_exchange_strong(
+ node->next, node, std::memory_order_relaxed, std::memory_order_relaxed)) {
+ // failing CAS updates the first param, so we are already set for
+ // retry. TakeNextColumnFamily won't happen until after another
+ // inter-thread synchronization, so we don't even need release
+ // semantics for this CAS
+ }
+#endif // __clang_analyzer__
+}
+
+ColumnFamilyData* FlushScheduler::TakeNextColumnFamily() {
+ while (true) {
+ if (head_.load(std::memory_order_relaxed) == nullptr) {
+ return nullptr;
+ }
+
+ // dequeue the head
+ Node* node = head_.load(std::memory_order_relaxed);
+ head_.store(node->next, std::memory_order_relaxed);
+ ColumnFamilyData* cfd = node->column_family;
+ delete node;
+
+#ifndef NDEBUG
+ {
+ std::lock_guard<std::mutex> lock(checking_mutex_);
+ auto iter = checking_set_.find(cfd);
+ assert(iter != checking_set_.end());
+ checking_set_.erase(iter);
+ }
+#endif // NDEBUG
+
+ if (!cfd->IsDropped()) {
+ // success
+ return cfd;
+ }
+
+ // no longer relevant, retry
+ cfd->UnrefAndTryDelete();
+ }
+}
+
+bool FlushScheduler::Empty() {
+ auto rv = head_.load(std::memory_order_relaxed) == nullptr;
+#ifndef NDEBUG
+ std::lock_guard<std::mutex> lock(checking_mutex_);
+ // Empty() is allowed to be called concurrently with ScheduleWork(). It
+ // would only miss the most recent schedules.
+ assert((rv == checking_set_.empty()) || rv);
+#endif // NDEBUG
+ return rv;
+}
+
+void FlushScheduler::Clear() {
+ ColumnFamilyData* cfd;
+ while ((cfd = TakeNextColumnFamily()) != nullptr) {
+ cfd->UnrefAndTryDelete();
+ }
+ assert(head_.load(std::memory_order_relaxed) == nullptr);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/flush_scheduler.h b/src/rocksdb/db/flush_scheduler.h
new file mode 100644
index 000000000..cbe17994f
--- /dev/null
+++ b/src/rocksdb/db/flush_scheduler.h
@@ -0,0 +1,54 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+#include <atomic>
+#include <mutex>
+#include <set>
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyData;
+
+// FlushScheduler keeps track of all column families whose memtable may
+// be full and require flushing. Unless otherwise noted, all methods on
+// FlushScheduler should be called only with the DB mutex held or from
+// a single-threaded recovery context.
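+//
+// A minimal usage sketch (illustrative only):
+//   flush_scheduler.ScheduleWork(cfd);  // memtable may be full
+//   while (ColumnFamilyData* picked = flush_scheduler.TakeNextColumnFamily()) {
+//     // ... schedule a flush for `picked` ...
+//     picked->UnrefAndTryDelete();  // caller must release the reference
+//   }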
+class FlushScheduler {
+ public:
+ FlushScheduler() : head_(nullptr) {}
+
+ // May be called from multiple threads at once, but not concurrently with
+ // any other method calls on this instance.
+ void ScheduleWork(ColumnFamilyData* cfd);
+
+ // Removes and returns Ref()-ed column family. Client needs to Unref().
+ // Filters column families that have been dropped.
+ ColumnFamilyData* TakeNextColumnFamily();
+
+ // This can be called concurrently with ScheduleWork but it would miss all
+ // the scheduled flushes after the last synchronization. This would result
+ // in less precise enforcement of memtable sizes but should not matter much.
+ bool Empty();
+
+ void Clear();
+
+ private:
+ struct Node {
+ ColumnFamilyData* column_family;
+ Node* next;
+ };
+
+ std::atomic<Node*> head_;
+#ifndef NDEBUG
+ std::mutex checking_mutex_;
+ std::set<ColumnFamilyData*> checking_set_;
+#endif // NDEBUG
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/forward_iterator.cc b/src/rocksdb/db/forward_iterator.cc
new file mode 100644
index 000000000..f2b882549
--- /dev/null
+++ b/src/rocksdb/db/forward_iterator.cc
@@ -0,0 +1,975 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include "db/forward_iterator.h"
+
+#include <limits>
+#include <string>
+#include <utility>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/job_context.h"
+#include "db/range_del_aggregator.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "table/merging_iterator.h"
+#include "test_util/sync_point.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Usage:
+// ForwardLevelIterator iter;
+// iter.SetFileIndex(file_index);
+// iter.Seek(target); // or iter.SeekToFirst();
+// iter.Next()
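+//
+// Note: files containing range tombstones are not supported; Reset() sets a
+// Status::NotSupported error in that case (unless
+// read_options.ignore_range_deletions is set).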
+class ForwardLevelIterator : public InternalIterator {
+ public:
+ ForwardLevelIterator(const ColumnFamilyData* const cfd,
+ const ReadOptions& read_options,
+ const std::vector<FileMetaData*>& files,
+ const SliceTransform* prefix_extractor)
+ : cfd_(cfd),
+ read_options_(read_options),
+ files_(files),
+ valid_(false),
+ file_index_(std::numeric_limits<uint32_t>::max()),
+ file_iter_(nullptr),
+ pinned_iters_mgr_(nullptr),
+ prefix_extractor_(prefix_extractor) {}
+
+ ~ForwardLevelIterator() override {
+ // Reset current pointer
+ if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+ pinned_iters_mgr_->PinIterator(file_iter_);
+ } else {
+ delete file_iter_;
+ }
+ }
+
+ void SetFileIndex(uint32_t file_index) {
+ assert(file_index < files_.size());
+ status_ = Status::OK();
+ if (file_index != file_index_) {
+ file_index_ = file_index;
+ Reset();
+ }
+ }
+ void Reset() {
+ assert(file_index_ < files_.size());
+
+ // Reset current pointer
+ if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+ pinned_iters_mgr_->PinIterator(file_iter_);
+ } else {
+ delete file_iter_;
+ }
+
+ ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(),
+ kMaxSequenceNumber /* upper_bound */);
+ file_iter_ = cfd_->table_cache()->NewIterator(
+ read_options_, *(cfd_->soptions()), cfd_->internal_comparator(),
+ *files_[file_index_],
+ read_options_.ignore_range_deletions ? nullptr : &range_del_agg,
+ prefix_extractor_, /*table_reader_ptr=*/nullptr,
+ /*file_read_hist=*/nullptr, TableReaderCaller::kUserIterator,
+ /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1,
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr);
+ file_iter_->SetPinnedItersMgr(pinned_iters_mgr_);
+ valid_ = false;
+ if (!range_del_agg.IsEmpty()) {
+ status_ = Status::NotSupported(
+ "Range tombstones unsupported with ForwardIterator");
+ }
+ }
+ void SeekToLast() override {
+ status_ = Status::NotSupported("ForwardLevelIterator::SeekToLast()");
+ valid_ = false;
+ }
+ void Prev() override {
+ status_ = Status::NotSupported("ForwardLevelIterator::Prev()");
+ valid_ = false;
+ }
+ bool Valid() const override {
+ return valid_;
+ }
+ void SeekToFirst() override {
+ assert(file_iter_ != nullptr);
+ if (!status_.ok()) {
+ assert(!valid_);
+ return;
+ }
+ file_iter_->SeekToFirst();
+ valid_ = file_iter_->Valid();
+ }
+ void Seek(const Slice& internal_key) override {
+ assert(file_iter_ != nullptr);
+
+ // This deviates from the usual convention for InternalIterator::Seek() in
+ // that it doesn't discard pre-existing error status. That's because this
+ // Seek() is only supposed to be called immediately after SetFileIndex()
+ // (which discards pre-existing error status), and SetFileIndex() may set
+ // an error status, which we shouldn't discard.
+ if (!status_.ok()) {
+ assert(!valid_);
+ return;
+ }
+
+ file_iter_->Seek(internal_key);
+ valid_ = file_iter_->Valid();
+ }
+ void SeekForPrev(const Slice& /*internal_key*/) override {
+ status_ = Status::NotSupported("ForwardLevelIterator::SeekForPrev()");
+ valid_ = false;
+ }
+ void Next() override {
+ assert(valid_);
+ file_iter_->Next();
+ for (;;) {
+ valid_ = file_iter_->Valid();
+ if (!file_iter_->status().ok()) {
+ assert(!valid_);
+ return;
+ }
+ if (valid_) {
+ return;
+ }
+ if (file_index_ + 1 >= files_.size()) {
+ valid_ = false;
+ return;
+ }
+ SetFileIndex(file_index_ + 1);
+ if (!status_.ok()) {
+ assert(!valid_);
+ return;
+ }
+ file_iter_->SeekToFirst();
+ }
+ }
+ Slice key() const override {
+ assert(valid_);
+ return file_iter_->key();
+ }
+ Slice value() const override {
+ assert(valid_);
+ return file_iter_->value();
+ }
+ Status status() const override {
+ if (!status_.ok()) {
+ return status_;
+ } else if (file_iter_) {
+ return file_iter_->status();
+ }
+ return Status::OK();
+ }
+ bool IsKeyPinned() const override {
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+ file_iter_->IsKeyPinned();
+ }
+ bool IsValuePinned() const override {
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+ file_iter_->IsValuePinned();
+ }
+ void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+ pinned_iters_mgr_ = pinned_iters_mgr;
+ if (file_iter_) {
+ file_iter_->SetPinnedItersMgr(pinned_iters_mgr_);
+ }
+ }
+
+ private:
+ const ColumnFamilyData* const cfd_;
+ const ReadOptions& read_options_;
+ const std::vector<FileMetaData*>& files_;
+
+ bool valid_;
+ uint32_t file_index_;
+ Status status_;
+ InternalIterator* file_iter_;
+ PinnedIteratorsManager* pinned_iters_mgr_;
+ const SliceTransform* prefix_extractor_;
+};
+
+ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options,
+ ColumnFamilyData* cfd,
+ SuperVersion* current_sv)
+ : db_(db),
+ read_options_(read_options),
+ cfd_(cfd),
+ prefix_extractor_(current_sv->mutable_cf_options.prefix_extractor.get()),
+ user_comparator_(cfd->user_comparator()),
+ immutable_min_heap_(MinIterComparator(&cfd_->internal_comparator())),
+ sv_(current_sv),
+ mutable_iter_(nullptr),
+ current_(nullptr),
+ valid_(false),
+ status_(Status::OK()),
+ immutable_status_(Status::OK()),
+ has_iter_trimmed_for_upper_bound_(false),
+ current_over_upper_bound_(false),
+ is_prev_set_(false),
+ is_prev_inclusive_(false),
+ pinned_iters_mgr_(nullptr) {
+ if (sv_) {
+ RebuildIterators(false);
+ }
+}
+
+ForwardIterator::~ForwardIterator() {
+ Cleanup(true);
+}
+
+void ForwardIterator::SVCleanup(DBImpl* db, SuperVersion* sv,
+ bool background_purge_on_iterator_cleanup) {
+ if (sv->Unref()) {
+ // Job id == 0 means that this is not our background process, but rather
+ // a user thread.
+ JobContext job_context(0);
+ db->mutex_.Lock();
+ sv->Cleanup();
+ db->FindObsoleteFiles(&job_context, false, true);
+ if (background_purge_on_iterator_cleanup) {
+ db->ScheduleBgLogWriterClose(&job_context);
+ db->AddSuperVersionsToFreeQueue(sv);
+ db->SchedulePurge();
+ }
+ db->mutex_.Unlock();
+ if (!background_purge_on_iterator_cleanup) {
+ delete sv;
+ }
+ if (job_context.HaveSomethingToDelete()) {
+ db->PurgeObsoleteFiles(job_context, background_purge_on_iterator_cleanup);
+ }
+ job_context.Clean();
+ }
+}
+
+namespace {
+struct SVCleanupParams {
+ DBImpl* db;
+ SuperVersion* sv;
+ bool background_purge_on_iterator_cleanup;
+};
+}  // namespace
+
+// Used in PinnedIteratorsManager to release pinned SuperVersion
+void ForwardIterator::DeferredSVCleanup(void* arg) {
+ auto d = reinterpret_cast<SVCleanupParams*>(arg);
+ ForwardIterator::SVCleanup(
+ d->db, d->sv, d->background_purge_on_iterator_cleanup);
+ delete d;
+}
+
+void ForwardIterator::SVCleanup() {
+ if (sv_ == nullptr) {
+ return;
+ }
+ bool background_purge =
+ read_options_.background_purge_on_iterator_cleanup ||
+ db_->immutable_db_options().avoid_unnecessary_blocking_io;
+ if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+ // pinned_iters_mgr_ tells us to make sure that all visited key-value slices
+ // are alive until pinned_iters_mgr_->ReleasePinnedData() is called.
+ // The slices may point into some memtables owned by sv_, so we need to keep
+ // sv_ referenced until pinned_iters_mgr_ unpins everything.
+ auto p = new SVCleanupParams{db_, sv_, background_purge};
+ pinned_iters_mgr_->PinPtr(p, &ForwardIterator::DeferredSVCleanup);
+ } else {
+ SVCleanup(db_, sv_, background_purge);
+ }
+}
+
+void ForwardIterator::Cleanup(bool release_sv) {
+ if (mutable_iter_ != nullptr) {
+ DeleteIterator(mutable_iter_, true /* is_arena */);
+ }
+
+ for (auto* m : imm_iters_) {
+ DeleteIterator(m, true /* is_arena */);
+ }
+ imm_iters_.clear();
+
+ for (auto* f : l0_iters_) {
+ DeleteIterator(f);
+ }
+ l0_iters_.clear();
+
+ for (auto* l : level_iters_) {
+ DeleteIterator(l);
+ }
+ level_iters_.clear();
+
+ if (release_sv) {
+ SVCleanup();
+ }
+}
+
+bool ForwardIterator::Valid() const {
+ // See UpdateCurrent().
+ return valid_ ? !current_over_upper_bound_ : false;
+}
+
+void ForwardIterator::SeekToFirst() {
+ if (sv_ == nullptr) {
+ RebuildIterators(true);
+ } else if (sv_->version_number != cfd_->GetSuperVersionNumber()) {
+ RenewIterators();
+ } else if (immutable_status_.IsIncomplete()) {
+ ResetIncompleteIterators();
+ }
+ SeekInternal(Slice(), true);
+}
+
+bool ForwardIterator::IsOverUpperBound(const Slice& internal_key) const {
+ return !(read_options_.iterate_upper_bound == nullptr ||
+ cfd_->internal_comparator().user_comparator()->Compare(
+ ExtractUserKey(internal_key),
+ *read_options_.iterate_upper_bound) < 0);
+}
+
+void ForwardIterator::Seek(const Slice& internal_key) {
+ if (sv_ == nullptr) {
+ RebuildIterators(true);
+ } else if (sv_->version_number != cfd_->GetSuperVersionNumber()) {
+ RenewIterators();
+ } else if (immutable_status_.IsIncomplete()) {
+ ResetIncompleteIterators();
+ }
+ SeekInternal(internal_key, false);
+}
+
+void ForwardIterator::SeekInternal(const Slice& internal_key,
+ bool seek_to_first) {
+ assert(mutable_iter_);
+ // mutable
+ seek_to_first ? mutable_iter_->SeekToFirst() :
+ mutable_iter_->Seek(internal_key);
+
+ // immutable
+  // TODO(ljin): NeedToSeekImmutable has a negative impact on performance
+  // if it turns out that the immutable iterators need to be re-seeked often.
+  // We probably want to have an option to turn it off.
+ if (seek_to_first || NeedToSeekImmutable(internal_key)) {
+ immutable_status_ = Status::OK();
+ if (has_iter_trimmed_for_upper_bound_ &&
+ (
+ // prev_ is not set yet
+ is_prev_set_ == false ||
+ // We are doing SeekToFirst() and internal_key.size() = 0
+ seek_to_first ||
+ // prev_key_ > internal_key
+ cfd_->internal_comparator().InternalKeyComparator::Compare(
+ prev_key_.GetInternalKey(), internal_key) > 0)) {
+ // Some iterators are trimmed. Need to rebuild.
+ RebuildIterators(true);
+ // Already seeked mutable iter, so seek again
+ seek_to_first ? mutable_iter_->SeekToFirst()
+ : mutable_iter_->Seek(internal_key);
+ }
+ {
+ auto tmp = MinIterHeap(MinIterComparator(&cfd_->internal_comparator()));
+ immutable_min_heap_.swap(tmp);
+ }
+ for (size_t i = 0; i < imm_iters_.size(); i++) {
+ auto* m = imm_iters_[i];
+ seek_to_first ? m->SeekToFirst() : m->Seek(internal_key);
+ if (!m->status().ok()) {
+ immutable_status_ = m->status();
+ } else if (m->Valid()) {
+ immutable_min_heap_.push(m);
+ }
+ }
+
+ Slice target_user_key;
+ if (!seek_to_first) {
+ target_user_key = ExtractUserKey(internal_key);
+ }
+ const VersionStorageInfo* vstorage = sv_->current->storage_info();
+ const std::vector<FileMetaData*>& l0 = vstorage->LevelFiles(0);
+ for (size_t i = 0; i < l0.size(); ++i) {
+ if (!l0_iters_[i]) {
+ continue;
+ }
+ if (seek_to_first) {
+ l0_iters_[i]->SeekToFirst();
+ } else {
+        // If the target key is past the largest key of this file, we are
+        // sure Next() won't go over this file.
+ if (user_comparator_->Compare(target_user_key,
+ l0[i]->largest.user_key()) > 0) {
+ if (read_options_.iterate_upper_bound != nullptr) {
+ has_iter_trimmed_for_upper_bound_ = true;
+ DeleteIterator(l0_iters_[i]);
+ l0_iters_[i] = nullptr;
+ }
+ continue;
+ }
+ l0_iters_[i]->Seek(internal_key);
+ }
+
+ if (!l0_iters_[i]->status().ok()) {
+ immutable_status_ = l0_iters_[i]->status();
+ } else if (l0_iters_[i]->Valid() &&
+ !IsOverUpperBound(l0_iters_[i]->key())) {
+ immutable_min_heap_.push(l0_iters_[i]);
+ } else {
+ has_iter_trimmed_for_upper_bound_ = true;
+ DeleteIterator(l0_iters_[i]);
+ l0_iters_[i] = nullptr;
+ }
+ }
+
+ for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+ const std::vector<FileMetaData*>& level_files =
+ vstorage->LevelFiles(level);
+ if (level_files.empty()) {
+ continue;
+ }
+ if (level_iters_[level - 1] == nullptr) {
+ continue;
+ }
+ uint32_t f_idx = 0;
+ if (!seek_to_first) {
+ f_idx = FindFileInRange(level_files, internal_key, 0,
+ static_cast<uint32_t>(level_files.size()));
+ }
+
+ // Seek
+ if (f_idx < level_files.size()) {
+ level_iters_[level - 1]->SetFileIndex(f_idx);
+ seek_to_first ? level_iters_[level - 1]->SeekToFirst() :
+ level_iters_[level - 1]->Seek(internal_key);
+
+ if (!level_iters_[level - 1]->status().ok()) {
+ immutable_status_ = level_iters_[level - 1]->status();
+ } else if (level_iters_[level - 1]->Valid() &&
+ !IsOverUpperBound(level_iters_[level - 1]->key())) {
+ immutable_min_heap_.push(level_iters_[level - 1]);
+ } else {
+ // Nothing in this level is interesting. Remove.
+ has_iter_trimmed_for_upper_bound_ = true;
+ DeleteIterator(level_iters_[level - 1]);
+ level_iters_[level - 1] = nullptr;
+ }
+ }
+ }
+
+ if (seek_to_first) {
+ is_prev_set_ = false;
+ } else {
+ prev_key_.SetInternalKey(internal_key);
+ is_prev_set_ = true;
+ is_prev_inclusive_ = true;
+ }
+
+ TEST_SYNC_POINT_CALLBACK("ForwardIterator::SeekInternal:Immutable", this);
+ } else if (current_ && current_ != mutable_iter_) {
+ // current_ is one of immutable iterators, push it back to the heap
+ immutable_min_heap_.push(current_);
+ }
+
+ UpdateCurrent();
+ TEST_SYNC_POINT_CALLBACK("ForwardIterator::SeekInternal:Return", this);
+}
+
+void ForwardIterator::Next() {
+ assert(valid_);
+ bool update_prev_key = false;
+
+ if (sv_ == nullptr ||
+ sv_->version_number != cfd_->GetSuperVersionNumber()) {
+ std::string current_key = key().ToString();
+ Slice old_key(current_key.data(), current_key.size());
+
+ if (sv_ == nullptr) {
+ RebuildIterators(true);
+ } else {
+ RenewIterators();
+ }
+ SeekInternal(old_key, false);
+ if (!valid_ || key().compare(old_key) != 0) {
+ return;
+ }
+ } else if (current_ != mutable_iter_) {
+ // It is going to advance immutable iterator
+
+ if (is_prev_set_ && prefix_extractor_) {
+ // advance prev_key_ to current_ only if they share the same prefix
+ update_prev_key =
+ prefix_extractor_->Transform(prev_key_.GetUserKey())
+ .compare(prefix_extractor_->Transform(current_->key())) == 0;
+ } else {
+ update_prev_key = true;
+ }
+
+
+ if (update_prev_key) {
+ prev_key_.SetInternalKey(current_->key());
+ is_prev_set_ = true;
+ is_prev_inclusive_ = false;
+ }
+ }
+
+ current_->Next();
+ if (current_ != mutable_iter_) {
+ if (!current_->status().ok()) {
+ immutable_status_ = current_->status();
+ } else if ((current_->Valid()) && (!IsOverUpperBound(current_->key()))) {
+ immutable_min_heap_.push(current_);
+ } else {
+ if ((current_->Valid()) && (IsOverUpperBound(current_->key()))) {
+ // remove the current iterator
+ DeleteCurrentIter();
+ current_ = nullptr;
+ }
+ if (update_prev_key) {
+ mutable_iter_->Seek(prev_key_.GetInternalKey());
+ }
+ }
+ }
+ UpdateCurrent();
+ TEST_SYNC_POINT_CALLBACK("ForwardIterator::Next:Return", this);
+}
+
+Slice ForwardIterator::key() const {
+ assert(valid_);
+ return current_->key();
+}
+
+Slice ForwardIterator::value() const {
+ assert(valid_);
+ return current_->value();
+}
+
+Status ForwardIterator::status() const {
+ if (!status_.ok()) {
+ return status_;
+ } else if (!mutable_iter_->status().ok()) {
+ return mutable_iter_->status();
+ }
+
+ return immutable_status_;
+}
+
+Status ForwardIterator::GetProperty(std::string prop_name, std::string* prop) {
+ assert(prop != nullptr);
+ if (prop_name == "rocksdb.iterator.super-version-number") {
+ *prop = ToString(sv_->version_number);
+ return Status::OK();
+ }
+ return Status::InvalidArgument();
+}
+
+void ForwardIterator::SetPinnedItersMgr(
+ PinnedIteratorsManager* pinned_iters_mgr) {
+ pinned_iters_mgr_ = pinned_iters_mgr;
+ UpdateChildrenPinnedItersMgr();
+}
+
+void ForwardIterator::UpdateChildrenPinnedItersMgr() {
+ // Set PinnedIteratorsManager for mutable memtable iterator.
+ if (mutable_iter_) {
+ mutable_iter_->SetPinnedItersMgr(pinned_iters_mgr_);
+ }
+
+ // Set PinnedIteratorsManager for immutable memtable iterators.
+ for (InternalIterator* child_iter : imm_iters_) {
+ if (child_iter) {
+ child_iter->SetPinnedItersMgr(pinned_iters_mgr_);
+ }
+ }
+
+ // Set PinnedIteratorsManager for L0 files iterators.
+ for (InternalIterator* child_iter : l0_iters_) {
+ if (child_iter) {
+ child_iter->SetPinnedItersMgr(pinned_iters_mgr_);
+ }
+ }
+
+ // Set PinnedIteratorsManager for L1+ levels iterators.
+ for (ForwardLevelIterator* child_iter : level_iters_) {
+ if (child_iter) {
+ child_iter->SetPinnedItersMgr(pinned_iters_mgr_);
+ }
+ }
+}
+
+bool ForwardIterator::IsKeyPinned() const {
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+ current_->IsKeyPinned();
+}
+
+bool ForwardIterator::IsValuePinned() const {
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+ current_->IsValuePinned();
+}
+
+void ForwardIterator::RebuildIterators(bool refresh_sv) {
+ // Clean up
+ Cleanup(refresh_sv);
+ if (refresh_sv) {
+ // New
+ sv_ = cfd_->GetReferencedSuperVersion(db_);
+ }
+ ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(),
+ kMaxSequenceNumber /* upper_bound */);
+ mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_);
+ sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_);
+ if (!read_options_.ignore_range_deletions) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ sv_->mem->NewRangeTombstoneIterator(
+ read_options_, sv_->current->version_set()->LastSequence()));
+ range_del_agg.AddTombstones(std::move(range_del_iter));
+ sv_->imm->AddRangeTombstoneIterators(read_options_, &arena_,
+ &range_del_agg);
+ }
+ has_iter_trimmed_for_upper_bound_ = false;
+
+ const auto* vstorage = sv_->current->storage_info();
+ const auto& l0_files = vstorage->LevelFiles(0);
+ l0_iters_.reserve(l0_files.size());
+ for (const auto* l0 : l0_files) {
+ if ((read_options_.iterate_upper_bound != nullptr) &&
+ cfd_->internal_comparator().user_comparator()->Compare(
+ l0->smallest.user_key(), *read_options_.iterate_upper_bound) > 0) {
+ // No need to set has_iter_trimmed_for_upper_bound_: this ForwardIterator
+ // will never be interested in files with smallest key above
+ // iterate_upper_bound, since iterate_upper_bound can't be changed.
+ l0_iters_.push_back(nullptr);
+ continue;
+ }
+ l0_iters_.push_back(cfd_->table_cache()->NewIterator(
+ read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0,
+ read_options_.ignore_range_deletions ? nullptr : &range_del_agg,
+ sv_->mutable_cf_options.prefix_extractor.get(),
+ /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr,
+ TableReaderCaller::kUserIterator, /*arena=*/nullptr,
+ /*skip_filters=*/false, /*level=*/-1,
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr));
+ }
+ BuildLevelIterators(vstorage);
+ current_ = nullptr;
+ is_prev_set_ = false;
+
+ UpdateChildrenPinnedItersMgr();
+ if (!range_del_agg.IsEmpty()) {
+ status_ = Status::NotSupported(
+ "Range tombstones unsupported with ForwardIterator");
+ valid_ = false;
+ }
+}
+
+void ForwardIterator::RenewIterators() {
+ SuperVersion* svnew;
+ assert(sv_);
+ svnew = cfd_->GetReferencedSuperVersion(db_);
+
+ if (mutable_iter_ != nullptr) {
+ DeleteIterator(mutable_iter_, true /* is_arena */);
+ }
+ for (auto* m : imm_iters_) {
+ DeleteIterator(m, true /* is_arena */);
+ }
+ imm_iters_.clear();
+
+ mutable_iter_ = svnew->mem->NewIterator(read_options_, &arena_);
+ svnew->imm->AddIterators(read_options_, &imm_iters_, &arena_);
+ ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(),
+ kMaxSequenceNumber /* upper_bound */);
+ if (!read_options_.ignore_range_deletions) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ svnew->mem->NewRangeTombstoneIterator(
+ read_options_, sv_->current->version_set()->LastSequence()));
+ range_del_agg.AddTombstones(std::move(range_del_iter));
+ svnew->imm->AddRangeTombstoneIterators(read_options_, &arena_,
+ &range_del_agg);
+ }
+
+ const auto* vstorage = sv_->current->storage_info();
+ const auto& l0_files = vstorage->LevelFiles(0);
+ const auto* vstorage_new = svnew->current->storage_info();
+ const auto& l0_files_new = vstorage_new->LevelFiles(0);
+ size_t iold, inew;
+ bool found;
+ std::vector<InternalIterator*> l0_iters_new;
+ l0_iters_new.reserve(l0_files_new.size());
+
+ for (inew = 0; inew < l0_files_new.size(); inew++) {
+ found = false;
+ for (iold = 0; iold < l0_files.size(); iold++) {
+ if (l0_files[iold] == l0_files_new[inew]) {
+ found = true;
+ break;
+ }
+ }
+ if (found) {
+ if (l0_iters_[iold] == nullptr) {
+ l0_iters_new.push_back(nullptr);
+ TEST_SYNC_POINT_CALLBACK("ForwardIterator::RenewIterators:Null", this);
+ } else {
+ l0_iters_new.push_back(l0_iters_[iold]);
+ l0_iters_[iold] = nullptr;
+ TEST_SYNC_POINT_CALLBACK("ForwardIterator::RenewIterators:Copy", this);
+ }
+ continue;
+ }
+ l0_iters_new.push_back(cfd_->table_cache()->NewIterator(
+ read_options_, *cfd_->soptions(), cfd_->internal_comparator(),
+ *l0_files_new[inew],
+ read_options_.ignore_range_deletions ? nullptr : &range_del_agg,
+ svnew->mutable_cf_options.prefix_extractor.get(),
+ /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr,
+ TableReaderCaller::kUserIterator, /*arena=*/nullptr,
+ /*skip_filters=*/false, /*level=*/-1,
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr));
+ }
+
+ for (auto* f : l0_iters_) {
+ DeleteIterator(f);
+ }
+ l0_iters_.clear();
+ l0_iters_ = l0_iters_new;
+
+ for (auto* l : level_iters_) {
+ DeleteIterator(l);
+ }
+ level_iters_.clear();
+ BuildLevelIterators(vstorage_new);
+ current_ = nullptr;
+ is_prev_set_ = false;
+ SVCleanup();
+ sv_ = svnew;
+
+ UpdateChildrenPinnedItersMgr();
+ if (!range_del_agg.IsEmpty()) {
+ status_ = Status::NotSupported(
+ "Range tombstones unsupported with ForwardIterator");
+ valid_ = false;
+ }
+}
+
+void ForwardIterator::BuildLevelIterators(const VersionStorageInfo* vstorage) {
+ level_iters_.reserve(vstorage->num_levels() - 1);
+ for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+ const auto& level_files = vstorage->LevelFiles(level);
+ if ((level_files.empty()) ||
+ ((read_options_.iterate_upper_bound != nullptr) &&
+ (user_comparator_->Compare(*read_options_.iterate_upper_bound,
+ level_files[0]->smallest.user_key()) <
+ 0))) {
+ level_iters_.push_back(nullptr);
+ if (!level_files.empty()) {
+ has_iter_trimmed_for_upper_bound_ = true;
+ }
+ } else {
+ level_iters_.push_back(new ForwardLevelIterator(
+ cfd_, read_options_, level_files,
+ sv_->mutable_cf_options.prefix_extractor.get()));
+ }
+ }
+}
+
+void ForwardIterator::ResetIncompleteIterators() {
+ const auto& l0_files = sv_->current->storage_info()->LevelFiles(0);
+ for (size_t i = 0; i < l0_iters_.size(); ++i) {
+ assert(i < l0_files.size());
+ if (!l0_iters_[i] || !l0_iters_[i]->status().IsIncomplete()) {
+ continue;
+ }
+ DeleteIterator(l0_iters_[i]);
+ l0_iters_[i] = cfd_->table_cache()->NewIterator(
+ read_options_, *cfd_->soptions(), cfd_->internal_comparator(),
+ *l0_files[i], /*range_del_agg=*/nullptr,
+ sv_->mutable_cf_options.prefix_extractor.get(),
+ /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr,
+ TableReaderCaller::kUserIterator, /*arena=*/nullptr,
+ /*skip_filters=*/false, /*level=*/-1,
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr);
+ l0_iters_[i]->SetPinnedItersMgr(pinned_iters_mgr_);
+ }
+
+ for (auto* level_iter : level_iters_) {
+ if (level_iter && level_iter->status().IsIncomplete()) {
+ level_iter->Reset();
+ }
+ }
+
+ current_ = nullptr;
+ is_prev_set_ = false;
+}
+
+void ForwardIterator::UpdateCurrent() {
+ if (immutable_min_heap_.empty() && !mutable_iter_->Valid()) {
+ current_ = nullptr;
+ } else if (immutable_min_heap_.empty()) {
+ current_ = mutable_iter_;
+ } else if (!mutable_iter_->Valid()) {
+ current_ = immutable_min_heap_.top();
+ immutable_min_heap_.pop();
+ } else {
+ current_ = immutable_min_heap_.top();
+ assert(current_ != nullptr);
+ assert(current_->Valid());
+ int cmp = cfd_->internal_comparator().InternalKeyComparator::Compare(
+ mutable_iter_->key(), current_->key());
+ assert(cmp != 0);
+ if (cmp > 0) {
+ immutable_min_heap_.pop();
+ } else {
+ current_ = mutable_iter_;
+ }
+ }
+ valid_ = current_ != nullptr && immutable_status_.ok();
+ if (!status_.ok()) {
+ status_ = Status::OK();
+ }
+
+ // Upper bound doesn't apply to the memtable iterator. We want Valid() to
+ // return false when all iterators are over iterate_upper_bound, but can't
+ // just set valid_ to false, as that would effectively disable the tailing
+ // optimization (Seek() would be called on all immutable iterators regardless
+ // of whether the target key is greater than prev_key_).
+ current_over_upper_bound_ = valid_ && IsOverUpperBound(current_->key());
+}
+
+bool ForwardIterator::NeedToSeekImmutable(const Slice& target) {
+ // We maintain the interval (prev_key_, immutable_min_heap_.top()->key())
+ // such that there are no records with keys within that range in
+ // immutable_min_heap_. Since immutable structures (SST files and immutable
+ // memtables) can't change in this version, we don't need to do a seek if
+ // 'target' belongs to that interval (immutable_min_heap_.top() is already
+ // at the correct position).
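+  //
+  // Illustrative example (hypothetical keys): after Seek("k10"), prev_key_ is
+  // "k10" and the immutable iterators might all be positioned at or beyond
+  // "k17". A later Seek("k13") falls within that range, so the immutable
+  // iterators are already positioned correctly and only the mutable memtable
+  // iterator needs to be re-seeked.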
+
+ if (!valid_ || !current_ || !is_prev_set_ || !immutable_status_.ok()) {
+ return true;
+ }
+ Slice prev_key = prev_key_.GetInternalKey();
+ if (prefix_extractor_ && prefix_extractor_->Transform(target).compare(
+ prefix_extractor_->Transform(prev_key)) != 0) {
+ return true;
+ }
+ if (cfd_->internal_comparator().InternalKeyComparator::Compare(
+ prev_key, target) >= (is_prev_inclusive_ ? 1 : 0)) {
+ return true;
+ }
+
+ if (immutable_min_heap_.empty() && current_ == mutable_iter_) {
+ // Nothing to seek on.
+ return false;
+ }
+ if (cfd_->internal_comparator().InternalKeyComparator::Compare(
+ target, current_ == mutable_iter_ ? immutable_min_heap_.top()->key()
+ : current_->key()) > 0) {
+ return true;
+ }
+ return false;
+}
+
+void ForwardIterator::DeleteCurrentIter() {
+ const VersionStorageInfo* vstorage = sv_->current->storage_info();
+ const std::vector<FileMetaData*>& l0 = vstorage->LevelFiles(0);
+ for (size_t i = 0; i < l0.size(); ++i) {
+ if (!l0_iters_[i]) {
+ continue;
+ }
+ if (l0_iters_[i] == current_) {
+ has_iter_trimmed_for_upper_bound_ = true;
+ DeleteIterator(l0_iters_[i]);
+ l0_iters_[i] = nullptr;
+ return;
+ }
+ }
+
+ for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+ if (level_iters_[level - 1] == nullptr) {
+ continue;
+ }
+ if (level_iters_[level - 1] == current_) {
+ has_iter_trimmed_for_upper_bound_ = true;
+ DeleteIterator(level_iters_[level - 1]);
+ level_iters_[level - 1] = nullptr;
+ }
+ }
+}
+
+bool ForwardIterator::TEST_CheckDeletedIters(int* pdeleted_iters,
+ int* pnum_iters) {
+ bool retval = false;
+ int deleted_iters = 0;
+ int num_iters = 0;
+
+ const VersionStorageInfo* vstorage = sv_->current->storage_info();
+ const std::vector<FileMetaData*>& l0 = vstorage->LevelFiles(0);
+ for (size_t i = 0; i < l0.size(); ++i) {
+ if (!l0_iters_[i]) {
+ retval = true;
+ deleted_iters++;
+ } else {
+ num_iters++;
+ }
+ }
+
+ for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+ if ((level_iters_[level - 1] == nullptr) &&
+ (!vstorage->LevelFiles(level).empty())) {
+ retval = true;
+ deleted_iters++;
+ } else if (!vstorage->LevelFiles(level).empty()) {
+ num_iters++;
+ }
+ }
+ if ((!retval) && num_iters <= 1) {
+ retval = true;
+ }
+ if (pdeleted_iters) {
+ *pdeleted_iters = deleted_iters;
+ }
+ if (pnum_iters) {
+ *pnum_iters = num_iters;
+ }
+ return retval;
+}
+
+uint32_t ForwardIterator::FindFileInRange(
+ const std::vector<FileMetaData*>& files, const Slice& internal_key,
+ uint32_t left, uint32_t right) {
+ auto cmp = [&](const FileMetaData* f, const Slice& key) -> bool {
+ return cfd_->internal_comparator().InternalKeyComparator::Compare(
+ f->largest.Encode(), key) < 0;
+ };
+ const auto &b = files.begin();
+ return static_cast<uint32_t>(std::lower_bound(b + left,
+ b + right, internal_key, cmp) - b);
+}
+
+void ForwardIterator::DeleteIterator(InternalIterator* iter, bool is_arena) {
+ if (iter == nullptr) {
+ return;
+ }
+
+ if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+ pinned_iters_mgr_->PinIterator(iter, is_arena);
+ } else {
+ if (is_arena) {
+ iter->~InternalIterator();
+ } else {
+ delete iter;
+ }
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/forward_iterator.h b/src/rocksdb/db/forward_iterator.h
new file mode 100644
index 000000000..8c671c75f
--- /dev/null
+++ b/src/rocksdb/db/forward_iterator.h
@@ -0,0 +1,160 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+#include <queue>
+
+#include "db/dbformat.h"
+#include "memory/arena.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+class Env;
+struct SuperVersion;
+class ColumnFamilyData;
+class ForwardLevelIterator;
+class VersionStorageInfo;
+struct FileMetaData;
+
+class MinIterComparator {
+ public:
+ explicit MinIterComparator(const Comparator* comparator) :
+ comparator_(comparator) {}
+
+ bool operator()(InternalIterator* a, InternalIterator* b) {
+ return comparator_->Compare(a->key(), b->key()) > 0;
+ }
+ private:
+ const Comparator* comparator_;
+};
+
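+// Note: std::priority_queue is a max-heap with respect to its comparator, so
+// MinIterComparator deliberately orders by "greater key"; the resulting
+// MinIterHeap pops the iterator positioned at the smallest key first.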
+typedef std::priority_queue<InternalIterator*, std::vector<InternalIterator*>,
+ MinIterComparator> MinIterHeap;
+
+/**
+ * ForwardIterator is a special type of iterator that only supports Seek()
+ * and Next(). It is expected to perform better than TailingIterator by
+ * removing the encapsulation and making all information accessible within
+ * the iterator. In the current implementation, a snapshot is taken at the
+ * time Seek() is called. Subsequent Next() calls do not see values written
+ * after that point.
+ */
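+//
+// Minimal usage sketch through the public API (illustrative only; "db" and
+// "start" are placeholders -- see forward_iterator_bench.cc for a complete
+// example). Tailing iterators are the main consumer of this class:
+//
+//   ROCKSDB_NAMESPACE::ReadOptions ro;
+//   ro.tailing = true;
+//   std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it(db->NewIterator(ro));
+//   for (it->Seek(start); it->Valid(); it->Next()) {
+//     // consume it->key() / it->value(); only Seek()/Next() are supported
+//   }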
+class ForwardIterator : public InternalIterator {
+ public:
+ ForwardIterator(DBImpl* db, const ReadOptions& read_options,
+ ColumnFamilyData* cfd, SuperVersion* current_sv = nullptr);
+ virtual ~ForwardIterator();
+
+ void SeekForPrev(const Slice& /*target*/) override {
+ status_ = Status::NotSupported("ForwardIterator::SeekForPrev()");
+ valid_ = false;
+ }
+ void SeekToLast() override {
+ status_ = Status::NotSupported("ForwardIterator::SeekToLast()");
+ valid_ = false;
+ }
+ void Prev() override {
+ status_ = Status::NotSupported("ForwardIterator::Prev");
+ valid_ = false;
+ }
+
+ virtual bool Valid() const override;
+ void SeekToFirst() override;
+ virtual void Seek(const Slice& target) override;
+ virtual void Next() override;
+ virtual Slice key() const override;
+ virtual Slice value() const override;
+ virtual Status status() const override;
+ virtual Status GetProperty(std::string prop_name, std::string* prop) override;
+ virtual void SetPinnedItersMgr(
+ PinnedIteratorsManager* pinned_iters_mgr) override;
+ virtual bool IsKeyPinned() const override;
+ virtual bool IsValuePinned() const override;
+
+ bool TEST_CheckDeletedIters(int* deleted_iters, int* num_iters);
+
+ private:
+ void Cleanup(bool release_sv);
+ // Unreference and, if needed, clean up the current SuperVersion. This is
+ // either done immediately or deferred until this iterator is unpinned by
+ // PinnedIteratorsManager.
+ void SVCleanup();
+ static void SVCleanup(
+ DBImpl* db, SuperVersion* sv, bool background_purge_on_iterator_cleanup);
+ static void DeferredSVCleanup(void* arg);
+
+ void RebuildIterators(bool refresh_sv);
+ void RenewIterators();
+ void BuildLevelIterators(const VersionStorageInfo* vstorage);
+ void ResetIncompleteIterators();
+ void SeekInternal(const Slice& internal_key, bool seek_to_first);
+ void UpdateCurrent();
+ bool NeedToSeekImmutable(const Slice& internal_key);
+ void DeleteCurrentIter();
+ uint32_t FindFileInRange(
+ const std::vector<FileMetaData*>& files, const Slice& internal_key,
+ uint32_t left, uint32_t right);
+
+ bool IsOverUpperBound(const Slice& internal_key) const;
+
+  // Set PinnedIteratorsManager for all child iterators. This function should
+  // be called whenever we update the child iterators or pinned_iters_mgr_.
+ void UpdateChildrenPinnedItersMgr();
+
+ // A helper function that will release iter in the proper manner, or pass it
+ // to pinned_iters_mgr_ to release it later if pinning is enabled.
+ void DeleteIterator(InternalIterator* iter, bool is_arena = false);
+
+ DBImpl* const db_;
+ const ReadOptions read_options_;
+ ColumnFamilyData* const cfd_;
+ const SliceTransform* const prefix_extractor_;
+ const Comparator* user_comparator_;
+ MinIterHeap immutable_min_heap_;
+
+ SuperVersion* sv_;
+ InternalIterator* mutable_iter_;
+ std::vector<InternalIterator*> imm_iters_;
+ std::vector<InternalIterator*> l0_iters_;
+ std::vector<ForwardLevelIterator*> level_iters_;
+ InternalIterator* current_;
+ bool valid_;
+
+ // Internal iterator status; set only by one of the unsupported methods.
+ Status status_;
+ // Status of immutable iterators, maintained here to avoid iterating over
+ // all of them in status().
+ Status immutable_status_;
+ // Indicates that at least one of the immutable iterators pointed to a key
+ // larger than iterate_upper_bound and was therefore destroyed. Seek() may
+ // need to rebuild such iterators.
+ bool has_iter_trimmed_for_upper_bound_;
+ // Is current key larger than iterate_upper_bound? If so, makes Valid()
+ // return false.
+ bool current_over_upper_bound_;
+
+ // Left endpoint of the range of keys that immutable iterators currently
+ // cover. When Seek() is called with a key that's within that range, immutable
+ // iterators don't need to be moved; see NeedToSeekImmutable(). This key is
+ // included in the range after a Seek(), but excluded when advancing the
+ // iterator using Next().
+ IterKey prev_key_;
+ bool is_prev_set_;
+ bool is_prev_inclusive_;
+
+ PinnedIteratorsManager* pinned_iters_mgr_;
+ Arena arena_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/forward_iterator_bench.cc b/src/rocksdb/db/forward_iterator_bench.cc
new file mode 100644
index 000000000..6f1223537
--- /dev/null
+++ b/src/rocksdb/db/forward_iterator_bench.cc
@@ -0,0 +1,377 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#if !defined(GFLAGS) || defined(ROCKSDB_LITE)
+#include <cstdio>
+int main() {
+ fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+ return 1;
+}
+#elif defined(OS_MACOSX) || defined(OS_WIN)
+// Block forward_iterator_bench under macOS and Windows
+int main() { return 0; }
+#else
+#include <semaphore.h>
+#include <atomic>
+#include <bitset>
+#include <chrono>
+#include <climits>
+#include <condition_variable>
+#include <limits>
+#include <mutex>
+#include <queue>
+#include <random>
+#include <thread>
+
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "test_util/testharness.h"
+#include "util/gflags_compat.h"
+
+const int MAX_SHARDS = 100000;
+
+DEFINE_int32(writers, 8, "");
+DEFINE_int32(readers, 8, "");
+DEFINE_int64(rate, 100000, "");
+DEFINE_int64(value_size, 300, "");
+DEFINE_int64(shards, 1000, "");
+DEFINE_int64(memtable_size, 500000000, "");
+DEFINE_int64(block_cache_size, 300000000, "");
+DEFINE_int64(block_size, 65536, "");
+DEFINE_double(runtime, 300.0, "");
+DEFINE_bool(cache_only_first, true, "");
+DEFINE_bool(iterate_upper_bound, true, "");
+
+struct Stats {
+ char pad1[128] __attribute__((__unused__));
+ std::atomic<uint64_t> written{0};
+ char pad2[128] __attribute__((__unused__));
+ std::atomic<uint64_t> read{0};
+ std::atomic<uint64_t> cache_misses{0};
+ char pad3[128] __attribute__((__unused__));
+} stats;
+
+struct Key {
+ Key() {}
+ Key(uint64_t shard_in, uint64_t seqno_in)
+ : shard_be(htobe64(shard_in)), seqno_be(htobe64(seqno_in)) {}
+
+ uint64_t shard() const { return be64toh(shard_be); }
+ uint64_t seqno() const { return be64toh(seqno_be); }
+
+ private:
+ uint64_t shard_be;
+ uint64_t seqno_be;
+} __attribute__((__packed__));
+
+struct Reader;
+struct Writer;
+
+struct ShardState {
+ char pad1[128] __attribute__((__unused__));
+ std::atomic<uint64_t> last_written{0};
+ Writer* writer;
+ Reader* reader;
+ char pad2[128] __attribute__((__unused__));
+ std::atomic<uint64_t> last_read{0};
+ std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it;
+ std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it_cacheonly;
+ Key upper_bound;
+ ROCKSDB_NAMESPACE::Slice upper_bound_slice;
+ char pad3[128] __attribute__((__unused__));
+};
+
+struct Reader {
+ public:
+ explicit Reader(std::vector<ShardState>* shard_states,
+ ROCKSDB_NAMESPACE::DB* db)
+ : shard_states_(shard_states), db_(db) {
+ sem_init(&sem_, 0, 0);
+ thread_ = port::Thread(&Reader::run, this);
+ }
+
+ void run() {
+ while (1) {
+ sem_wait(&sem_);
+ if (done_.load()) {
+ break;
+ }
+
+ uint64_t shard;
+ {
+ std::lock_guard<std::mutex> guard(queue_mutex_);
+ assert(!shards_pending_queue_.empty());
+ shard = shards_pending_queue_.front();
+ shards_pending_queue_.pop();
+ shards_pending_set_.reset(shard);
+ }
+ readOnceFromShard(shard);
+ }
+ }
+
+ void readOnceFromShard(uint64_t shard) {
+ ShardState& state = (*shard_states_)[shard];
+ if (!state.it) {
+ // Initialize iterators
+ ROCKSDB_NAMESPACE::ReadOptions options;
+ options.tailing = true;
+ if (FLAGS_iterate_upper_bound) {
+ state.upper_bound = Key(shard, std::numeric_limits<uint64_t>::max());
+ state.upper_bound_slice = ROCKSDB_NAMESPACE::Slice(
+ (const char*)&state.upper_bound, sizeof(state.upper_bound));
+ options.iterate_upper_bound = &state.upper_bound_slice;
+ }
+
+ state.it.reset(db_->NewIterator(options));
+
+ if (FLAGS_cache_only_first) {
+ options.read_tier = ROCKSDB_NAMESPACE::ReadTier::kBlockCacheTier;
+ state.it_cacheonly.reset(db_->NewIterator(options));
+ }
+ }
+
+ const uint64_t upto = state.last_written.load();
+ for (ROCKSDB_NAMESPACE::Iterator* it :
+ {state.it_cacheonly.get(), state.it.get()}) {
+ if (it == nullptr) {
+ continue;
+ }
+ if (state.last_read.load() >= upto) {
+ break;
+ }
+ bool need_seek = true;
+ for (uint64_t seq = state.last_read.load() + 1; seq <= upto; ++seq) {
+ if (need_seek) {
+ Key from(shard, state.last_read.load() + 1);
+ it->Seek(ROCKSDB_NAMESPACE::Slice((const char*)&from, sizeof(from)));
+ need_seek = false;
+ } else {
+ it->Next();
+ }
+ if (it->status().IsIncomplete()) {
+ ++::stats.cache_misses;
+ break;
+ }
+ assert(it->Valid());
+ assert(it->key().size() == sizeof(Key));
+ Key key;
+ memcpy(&key, it->key().data(), it->key().size());
+ // fprintf(stderr, "Expecting (%ld, %ld) read (%ld, %ld)\n",
+ // shard, seq, key.shard(), key.seqno());
+ assert(key.shard() == shard);
+ assert(key.seqno() == seq);
+ state.last_read.store(seq);
+ ++::stats.read;
+ }
+ }
+ }
+
+ void onWrite(uint64_t shard) {
+ {
+ std::lock_guard<std::mutex> guard(queue_mutex_);
+ if (!shards_pending_set_.test(shard)) {
+ shards_pending_queue_.push(shard);
+ shards_pending_set_.set(shard);
+ sem_post(&sem_);
+ }
+ }
+ }
+
+ ~Reader() {
+ done_.store(true);
+ sem_post(&sem_);
+ thread_.join();
+ }
+
+ private:
+ char pad1[128] __attribute__((__unused__));
+ std::vector<ShardState>* shard_states_;
+ ROCKSDB_NAMESPACE::DB* db_;
+ ROCKSDB_NAMESPACE::port::Thread thread_;
+ sem_t sem_;
+ std::mutex queue_mutex_;
+ std::bitset<MAX_SHARDS + 1> shards_pending_set_;
+ std::queue<uint64_t> shards_pending_queue_;
+ std::atomic<bool> done_{false};
+ char pad2[128] __attribute__((__unused__));
+};
+
+struct Writer {
+ explicit Writer(std::vector<ShardState>* shard_states,
+ ROCKSDB_NAMESPACE::DB* db)
+ : shard_states_(shard_states), db_(db) {}
+
+ void start() { thread_ = port::Thread(&Writer::run, this); }
+
+ void run() {
+ std::queue<std::chrono::steady_clock::time_point> workq;
+ std::chrono::steady_clock::time_point deadline(
+ std::chrono::steady_clock::now() +
+ std::chrono::nanoseconds((uint64_t)(1000000000 * FLAGS_runtime)));
+ std::vector<uint64_t> my_shards;
+ for (int i = 1; i <= FLAGS_shards; ++i) {
+ if ((*shard_states_)[i].writer == this) {
+ my_shards.push_back(i);
+ }
+ }
+
+ std::mt19937 rng{std::random_device()()};
+ std::uniform_int_distribution<int> shard_dist(
+ 0, static_cast<int>(my_shards.size()) - 1);
+ std::string value(FLAGS_value_size, '*');
+
+ while (1) {
+ auto now = std::chrono::steady_clock::now();
+ if (FLAGS_runtime >= 0 && now >= deadline) {
+ break;
+ }
+ if (workq.empty()) {
+ for (int i = 0; i < FLAGS_rate; i += FLAGS_writers) {
+ std::chrono::nanoseconds offset(1000000000LL * i / FLAGS_rate);
+ workq.push(now + offset);
+ }
+ }
+ while (!workq.empty() && workq.front() < now) {
+ workq.pop();
+ uint64_t shard = my_shards[shard_dist(rng)];
+ ShardState& state = (*shard_states_)[shard];
+ uint64_t seqno = state.last_written.load() + 1;
+ Key key(shard, seqno);
+ // fprintf(stderr, "Writing (%ld, %ld)\n", shard, seqno);
+ ROCKSDB_NAMESPACE::Status status =
+ db_->Put(ROCKSDB_NAMESPACE::WriteOptions(),
+ ROCKSDB_NAMESPACE::Slice((const char*)&key, sizeof(key)),
+ ROCKSDB_NAMESPACE::Slice(value));
+ assert(status.ok());
+ state.last_written.store(seqno);
+ state.reader->onWrite(shard);
+ ++::stats.written;
+ }
+ std::this_thread::sleep_for(std::chrono::milliseconds(1));
+ }
+ // fprintf(stderr, "Writer done\n");
+ }
+
+ ~Writer() { thread_.join(); }
+
+ private:
+ char pad1[128] __attribute__((__unused__));
+ std::vector<ShardState>* shard_states_;
+ ROCKSDB_NAMESPACE::DB* db_;
+ ROCKSDB_NAMESPACE::port::Thread thread_;
+ char pad2[128] __attribute__((__unused__));
+};
+
+struct StatsThread {
+ explicit StatsThread(ROCKSDB_NAMESPACE::DB* db)
+ : db_(db), thread_(&StatsThread::run, this) {}
+
+ void run() {
+ // using namespace std::chrono;
+ auto tstart = std::chrono::steady_clock::now(), tlast = tstart;
+ uint64_t wlast = 0, rlast = 0;
+ while (!done_.load()) {
+ {
+ std::unique_lock<std::mutex> lock(cvm_);
+ cv_.wait_for(lock, std::chrono::seconds(1));
+ }
+ auto now = std::chrono::steady_clock::now();
+ double elapsed =
+ std::chrono::duration_cast<std::chrono::duration<double> >(
+ now - tlast).count();
+ uint64_t w = ::stats.written.load();
+ uint64_t r = ::stats.read.load();
+ fprintf(stderr,
+ "%s elapsed %4lds | written %10ld | w/s %10.0f | read %10ld | "
+ "r/s %10.0f | cache misses %10ld\n",
+ db_->GetEnv()->TimeToString(time(nullptr)).c_str(),
+ std::chrono::duration_cast<std::chrono::seconds>(now - tstart)
+ .count(),
+ w, (w - wlast) / elapsed, r, (r - rlast) / elapsed,
+ ::stats.cache_misses.load());
+ wlast = w;
+ rlast = r;
+ tlast = now;
+ }
+ }
+
+ ~StatsThread() {
+ {
+ std::lock_guard<std::mutex> guard(cvm_);
+ done_.store(true);
+ }
+ cv_.notify_all();
+ thread_.join();
+ }
+
+ private:
+ ROCKSDB_NAMESPACE::DB* db_;
+ std::mutex cvm_;
+ std::condition_variable cv_;
+ ROCKSDB_NAMESPACE::port::Thread thread_;
+ std::atomic<bool> done_{false};
+};
+
+int main(int argc, char** argv) {
+ GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
+
+ std::mt19937 rng{std::random_device()()};
+ ROCKSDB_NAMESPACE::Status status;
+ std::string path =
+ ROCKSDB_NAMESPACE::test::PerThreadDBPath("forward_iterator_test");
+ fprintf(stderr, "db path is %s\n", path.c_str());
+ ROCKSDB_NAMESPACE::Options options;
+ options.create_if_missing = true;
+ options.compression = ROCKSDB_NAMESPACE::CompressionType::kNoCompression;
+ options.compaction_style =
+ ROCKSDB_NAMESPACE::CompactionStyle::kCompactionStyleNone;
+ options.level0_slowdown_writes_trigger = 99999;
+ options.level0_stop_writes_trigger = 99999;
+ options.use_direct_io_for_flush_and_compaction = true;
+ options.write_buffer_size = FLAGS_memtable_size;
+ ROCKSDB_NAMESPACE::BlockBasedTableOptions table_options;
+ table_options.block_cache =
+ ROCKSDB_NAMESPACE::NewLRUCache(FLAGS_block_cache_size);
+ table_options.block_size = FLAGS_block_size;
+ options.table_factory.reset(
+ ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(table_options));
+
+ status = ROCKSDB_NAMESPACE::DestroyDB(path, options);
+ assert(status.ok());
+ ROCKSDB_NAMESPACE::DB* db_raw;
+ status = ROCKSDB_NAMESPACE::DB::Open(options, path, &db_raw);
+ assert(status.ok());
+ std::unique_ptr<ROCKSDB_NAMESPACE::DB> db(db_raw);
+
+ std::vector<ShardState> shard_states(FLAGS_shards + 1);
+ std::deque<Reader> readers;
+ while (static_cast<int>(readers.size()) < FLAGS_readers) {
+ readers.emplace_back(&shard_states, db_raw);
+ }
+ std::deque<Writer> writers;
+ while (static_cast<int>(writers.size()) < FLAGS_writers) {
+ writers.emplace_back(&shard_states, db_raw);
+ }
+
+ // Each shard gets a random reader and random writer assigned to it
+ for (int i = 1; i <= FLAGS_shards; ++i) {
+ std::uniform_int_distribution<int> reader_dist(0, FLAGS_readers - 1);
+ std::uniform_int_distribution<int> writer_dist(0, FLAGS_writers - 1);
+ shard_states[i].reader = &readers[reader_dist(rng)];
+ shard_states[i].writer = &writers[writer_dist(rng)];
+ }
+
+ StatsThread stats_thread(db_raw);
+ for (Writer& w : writers) {
+ w.start();
+ }
+
+ writers.clear();
+ readers.clear();
+}
+#endif // !defined(GFLAGS) || defined(ROCKSDB_LITE)
diff --git a/src/rocksdb/db/import_column_family_job.cc b/src/rocksdb/db/import_column_family_job.cc
new file mode 100644
index 000000000..15af1cf80
--- /dev/null
+++ b/src/rocksdb/db/import_column_family_job.cc
@@ -0,0 +1,276 @@
+#ifndef ROCKSDB_LITE
+
+#include "db/import_column_family_job.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <string>
+#include <vector>
+
+#include "db/version_edit.h"
+#include "file/file_util.h"
+#include "file/random_access_file_reader.h"
+#include "table/merging_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/sst_file_writer_collectors.h"
+#include "table/table_builder.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status ImportColumnFamilyJob::Prepare(uint64_t next_file_number,
+ SuperVersion* sv) {
+ Status status;
+
+ // Read the information of files we are importing
+ for (const auto& file_metadata : metadata_) {
+ const auto file_path = file_metadata.db_path + "/" + file_metadata.name;
+ IngestedFileInfo file_to_import;
+ status = GetIngestedFileInfo(file_path, &file_to_import, sv);
+ if (!status.ok()) {
+ return status;
+ }
+ files_to_import_.push_back(file_to_import);
+ }
+
+ const auto ucmp = cfd_->internal_comparator().user_comparator();
+ auto num_files = files_to_import_.size();
+ if (num_files == 0) {
+ return Status::InvalidArgument("The list of files is empty");
+ } else if (num_files > 1) {
+    // Verify that the passed files don't have overlapping ranges in any
+    // particular level.
+ int min_level = 1; // Check for overlaps in Level 1 and above.
+ int max_level = -1;
+ for (const auto& file_metadata : metadata_) {
+ if (file_metadata.level > max_level) {
+ max_level = file_metadata.level;
+ }
+ }
+ for (int level = min_level; level <= max_level; ++level) {
+ autovector<const IngestedFileInfo*> sorted_files;
+ for (size_t i = 0; i < num_files; i++) {
+ if (metadata_[i].level == level) {
+ sorted_files.push_back(&files_to_import_[i]);
+ }
+ }
+
+ std::sort(sorted_files.begin(), sorted_files.end(),
+ [&ucmp](const IngestedFileInfo* info1,
+ const IngestedFileInfo* info2) {
+ return sstableKeyCompare(ucmp, info1->smallest_internal_key,
+ info2->smallest_internal_key) < 0;
+ });
+
+ for (size_t i = 0; i < sorted_files.size() - 1; i++) {
+ if (sstableKeyCompare(ucmp, sorted_files[i]->largest_internal_key,
+ sorted_files[i + 1]->smallest_internal_key) >=
+ 0) {
+ return Status::InvalidArgument("Files have overlapping ranges");
+ }
+ }
+ }
+ }
+
+ for (const auto& f : files_to_import_) {
+ if (f.num_entries == 0) {
+ return Status::InvalidArgument("File contain no entries");
+ }
+
+ if (!f.smallest_internal_key.Valid() || !f.largest_internal_key.Valid()) {
+ return Status::Corruption("File has corrupted keys");
+ }
+ }
+
+ // Copy/Move external files into DB
+ auto hardlink_files = import_options_.move_files;
+ for (auto& f : files_to_import_) {
+ f.fd = FileDescriptor(next_file_number++, 0, f.file_size);
+
+ const auto path_outside_db = f.external_file_path;
+ const auto path_inside_db = TableFileName(
+ cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId());
+
+ if (hardlink_files) {
+ status =
+ fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr);
+ if (status.IsNotSupported()) {
+ // Original file is on a different FS, use copy instead of hard linking
+ hardlink_files = false;
+ }
+ }
+ if (!hardlink_files) {
+ status = CopyFile(fs_, path_outside_db, path_inside_db, 0,
+ db_options_.use_fsync);
+ }
+ if (!status.ok()) {
+ break;
+ }
+ f.copy_file = !hardlink_files;
+ f.internal_file_path = path_inside_db;
+ }
+
+ if (!status.ok()) {
+    // We failed; remove all the files that we copied into the db.
+ for (const auto& f : files_to_import_) {
+ if (f.internal_file_path.empty()) {
+ break;
+ }
+ const auto s =
+ fs_->DeleteFile(f.internal_file_path, IOOptions(), nullptr);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "AddFile() clean up for file %s failed : %s",
+ f.internal_file_path.c_str(), s.ToString().c_str());
+ }
+ }
+ }
+
+ return status;
+}
+
+// REQUIRES: we have become the only writer by entering both write_thread_ and
+// nonmem_write_thread_
+Status ImportColumnFamilyJob::Run() {
+ Status status;
+ edit_.SetColumnFamily(cfd_->GetID());
+
+  // We use the import time as the ancestor time. This is the time the data
+  // is written to the database.
+ int64_t temp_current_time = 0;
+ uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
+ uint64_t current_time = kUnknownOldestAncesterTime;
+ if (env_->GetCurrentTime(&temp_current_time).ok()) {
+ current_time = oldest_ancester_time =
+ static_cast<uint64_t>(temp_current_time);
+ }
+
+ for (size_t i = 0; i < files_to_import_.size(); ++i) {
+ const auto& f = files_to_import_[i];
+ const auto& file_metadata = metadata_[i];
+
+ edit_.AddFile(file_metadata.level, f.fd.GetNumber(), f.fd.GetPathId(),
+ f.fd.GetFileSize(), f.smallest_internal_key,
+ f.largest_internal_key, file_metadata.smallest_seqno,
+ file_metadata.largest_seqno, false, kInvalidBlobFileNumber,
+ oldest_ancester_time, current_time, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+
+ // If incoming sequence number is higher, update local sequence number.
+ if (file_metadata.largest_seqno > versions_->LastSequence()) {
+ versions_->SetLastAllocatedSequence(file_metadata.largest_seqno);
+ versions_->SetLastPublishedSequence(file_metadata.largest_seqno);
+ versions_->SetLastSequence(file_metadata.largest_seqno);
+ }
+ }
+
+ return status;
+}
+
+void ImportColumnFamilyJob::Cleanup(const Status& status) {
+ if (!status.ok()) {
+    // We failed to add files to the database; remove all the files we copied.
+ for (const auto& f : files_to_import_) {
+ const auto s =
+ fs_->DeleteFile(f.internal_file_path, IOOptions(), nullptr);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "AddFile() clean up for file %s failed : %s",
+ f.internal_file_path.c_str(), s.ToString().c_str());
+ }
+ }
+ } else if (status.ok() && import_options_.move_files) {
+ // The files were moved and added successfully, remove original file links
+ for (IngestedFileInfo& f : files_to_import_) {
+ const auto s =
+ fs_->DeleteFile(f.external_file_path, IOOptions(), nullptr);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "%s was added to DB successfully but failed to remove original "
+ "file link : %s",
+ f.external_file_path.c_str(), s.ToString().c_str());
+ }
+ }
+ }
+}
+
+Status ImportColumnFamilyJob::GetIngestedFileInfo(
+ const std::string& external_file, IngestedFileInfo* file_to_import,
+ SuperVersion* sv) {
+ file_to_import->external_file_path = external_file;
+
+ // Get external file size
+ Status status = fs_->GetFileSize(external_file, IOOptions(),
+ &file_to_import->file_size, nullptr);
+ if (!status.ok()) {
+ return status;
+ }
+
+ // Create TableReader for external file
+ std::unique_ptr<TableReader> table_reader;
+ std::unique_ptr<FSRandomAccessFile> sst_file;
+ std::unique_ptr<RandomAccessFileReader> sst_file_reader;
+
+ status = fs_->NewRandomAccessFile(external_file, env_options_,
+ &sst_file, nullptr);
+ if (!status.ok()) {
+ return status;
+ }
+ sst_file_reader.reset(
+ new RandomAccessFileReader(std::move(sst_file), external_file));
+
+ status = cfd_->ioptions()->table_factory->NewTableReader(
+ TableReaderOptions(*cfd_->ioptions(),
+ sv->mutable_cf_options.prefix_extractor.get(),
+ env_options_, cfd_->internal_comparator()),
+ std::move(sst_file_reader), file_to_import->file_size, &table_reader);
+ if (!status.ok()) {
+ return status;
+ }
+
+ // Get the external file properties
+ auto props = table_reader->GetTableProperties();
+
+ // Set original_seqno to 0.
+ file_to_import->original_seqno = 0;
+
+ // Get number of entries in table
+ file_to_import->num_entries = props->num_entries;
+
+ ParsedInternalKey key;
+ ReadOptions ro;
+  // While reading the external file we could cache the blocks we read in the
+  // block cache. If we later change the global seqno of this file, the cache
+  // would contain blocks whose keys carry the wrong seqno. We therefore
+  // disable fill_cache so that we read from the file without updating the
+  // block cache.
+ ro.fill_cache = false;
+ std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
+ ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr,
+ /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));
+
+ // Get first (smallest) key from file
+ iter->SeekToFirst();
+ if (!ParseInternalKey(iter->key(), &key)) {
+ return Status::Corruption("external file have corrupted keys");
+ }
+ file_to_import->smallest_internal_key.SetFrom(key);
+
+ // Get last (largest) key from file
+ iter->SeekToLast();
+ if (!ParseInternalKey(iter->key(), &key)) {
+ return Status::Corruption("external file have corrupted keys");
+ }
+ file_to_import->largest_internal_key.SetFrom(key);
+
+ file_to_import->cf_id = static_cast<uint32_t>(props->column_family_id);
+
+ file_to_import->table_properties = *props;
+
+ return status;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/import_column_family_job.h b/src/rocksdb/db/import_column_family_job.h
new file mode 100644
index 000000000..160fd1247
--- /dev/null
+++ b/src/rocksdb/db/import_column_family_job.h
@@ -0,0 +1,72 @@
+#pragma once
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/dbformat.h"
+#include "db/external_sst_file_ingestion_job.h"
+#include "db/snapshot_impl.h"
+#include "options/db_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/metadata.h"
+#include "rocksdb/sst_file_writer.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Imports a set of sst files as is into a new column family. Logic is similar
+// to ExternalSstFileIngestionJob.
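+//
+// Typical call path (illustrative): DB::CreateColumnFamilyWithImport() builds
+// this job from an ExportImportFilesMetaData, then invokes Prepare(), Run()
+// (with the DB mutex held, as required below) and finally Cleanup(). See
+// import_column_family_test.cc for an end-to-end example.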
+class ImportColumnFamilyJob {
+ public:
+ ImportColumnFamilyJob(Env* env, VersionSet* versions, ColumnFamilyData* cfd,
+ const ImmutableDBOptions& db_options,
+ const EnvOptions& env_options,
+ const ImportColumnFamilyOptions& import_options,
+ const std::vector<LiveFileMetaData>& metadata)
+ : env_(env),
+ versions_(versions),
+ cfd_(cfd),
+ db_options_(db_options),
+ fs_(db_options_.fs.get()),
+ env_options_(env_options),
+ import_options_(import_options),
+ metadata_(metadata) {}
+
+ // Prepare the job by copying external files into the DB.
+ Status Prepare(uint64_t next_file_number, SuperVersion* sv);
+
+ // Will execute the import job and prepare edit() to be applied.
+ // REQUIRES: Mutex held
+ Status Run();
+
+ // Cleanup after successful/failed job
+ void Cleanup(const Status& status);
+
+ VersionEdit* edit() { return &edit_; }
+
+ const autovector<IngestedFileInfo>& files_to_import() const {
+ return files_to_import_;
+ }
+
+ private:
+ // Open the external file and populate `file_to_import` with all the
+ // external information we need to import this file.
+ Status GetIngestedFileInfo(const std::string& external_file,
+ IngestedFileInfo* file_to_import,
+ SuperVersion* sv);
+
+ Env* env_;
+ VersionSet* versions_;
+ ColumnFamilyData* cfd_;
+ const ImmutableDBOptions& db_options_;
+ FileSystem* fs_;
+ const EnvOptions& env_options_;
+ autovector<IngestedFileInfo> files_to_import_;
+ VersionEdit edit_;
+ const ImportColumnFamilyOptions& import_options_;
+ std::vector<LiveFileMetaData> metadata_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/import_column_family_test.cc b/src/rocksdb/db/import_column_family_test.cc
new file mode 100644
index 000000000..a25560b7c
--- /dev/null
+++ b/src/rocksdb/db/import_column_family_test.cc
@@ -0,0 +1,567 @@
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/sst_file_writer.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ImportColumnFamilyTest : public DBTestBase {
+ public:
+ ImportColumnFamilyTest() : DBTestBase("/import_column_family_test") {
+ sst_files_dir_ = dbname_ + "/sst_files/";
+ DestroyAndRecreateExternalSSTFilesDir();
+ export_files_dir_ = test::TmpDir(env_) + "/export";
+ import_cfh_ = nullptr;
+ import_cfh2_ = nullptr;
+ metadata_ptr_ = nullptr;
+ }
+
+ ~ImportColumnFamilyTest() {
+ if (import_cfh_) {
+ db_->DropColumnFamily(import_cfh_);
+ db_->DestroyColumnFamilyHandle(import_cfh_);
+ import_cfh_ = nullptr;
+ }
+ if (import_cfh2_) {
+ db_->DropColumnFamily(import_cfh2_);
+ db_->DestroyColumnFamilyHandle(import_cfh2_);
+ import_cfh2_ = nullptr;
+ }
+ if (metadata_ptr_) {
+ delete metadata_ptr_;
+ metadata_ptr_ = nullptr;
+ }
+ test::DestroyDir(env_, sst_files_dir_);
+ test::DestroyDir(env_, export_files_dir_);
+ }
+
+ void DestroyAndRecreateExternalSSTFilesDir() {
+ test::DestroyDir(env_, sst_files_dir_);
+ env_->CreateDir(sst_files_dir_);
+ test::DestroyDir(env_, export_files_dir_);
+ }
+
+ LiveFileMetaData LiveFileMetaDataInit(std::string name, std::string path,
+ int level,
+ SequenceNumber smallest_seqno,
+ SequenceNumber largest_seqno) {
+ LiveFileMetaData metadata;
+ metadata.name = name;
+ metadata.db_path = path;
+ metadata.smallest_seqno = smallest_seqno;
+ metadata.largest_seqno = largest_seqno;
+ metadata.level = level;
+ return metadata;
+ }
+
+ protected:
+ std::string sst_files_dir_;
+ std::string export_files_dir_;
+ ColumnFamilyHandle* import_cfh_;
+ ColumnFamilyHandle* import_cfh2_;
+ ExportImportFilesMetaData* metadata_ptr_;
+};
+
+TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFiles) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"koko"}, options);
+
+ SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]);
+ SstFileWriter sfw_unknown(EnvOptions(), options);
+
+ // cf1.sst
+ const std::string cf1_sst_name = "cf1.sst";
+ const std::string cf1_sst = sst_files_dir_ + cf1_sst_name;
+ ASSERT_OK(sfw_cf1.Open(cf1_sst));
+ ASSERT_OK(sfw_cf1.Put("K1", "V1"));
+ ASSERT_OK(sfw_cf1.Put("K2", "V2"));
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // cf_unknown.sst
+ const std::string unknown_sst_name = "cf_unknown.sst";
+ const std::string unknown_sst = sst_files_dir_ + unknown_sst_name;
+ ASSERT_OK(sfw_unknown.Open(unknown_sst));
+ ASSERT_OK(sfw_unknown.Put("K3", "V1"));
+ ASSERT_OK(sfw_unknown.Put("K4", "V2"));
+ ASSERT_OK(sfw_unknown.Finish());
+
+ {
+ // Import sst file corresponding to cf1 onto a new cf and verify
+ ExportImportFilesMetaData metadata;
+ metadata.files.push_back(
+ LiveFileMetaDataInit(cf1_sst_name, sst_files_dir_, 0, 10, 19));
+ metadata.db_comparator_name = options.comparator->Name();
+
+ ASSERT_OK(db_->CreateColumnFamilyWithImport(
+ options, "toto", ImportColumnFamilyOptions(), metadata, &import_cfh_));
+ ASSERT_NE(import_cfh_, nullptr);
+
+ std::string value;
+ db_->Get(ReadOptions(), import_cfh_, "K1", &value);
+ ASSERT_EQ(value, "V1");
+ db_->Get(ReadOptions(), import_cfh_, "K2", &value);
+ ASSERT_EQ(value, "V2");
+ ASSERT_OK(db_->DropColumnFamily(import_cfh_));
+ ASSERT_OK(db_->DestroyColumnFamilyHandle(import_cfh_));
+ import_cfh_ = nullptr;
+ }
+
+ {
+ // Import sst file corresponding to unknown cf onto a new cf and verify
+ ExportImportFilesMetaData metadata;
+ metadata.files.push_back(
+ LiveFileMetaDataInit(unknown_sst_name, sst_files_dir_, 0, 20, 29));
+ metadata.db_comparator_name = options.comparator->Name();
+
+ ASSERT_OK(db_->CreateColumnFamilyWithImport(
+ options, "yoyo", ImportColumnFamilyOptions(), metadata, &import_cfh_));
+ ASSERT_NE(import_cfh_, nullptr);
+
+ std::string value;
+ db_->Get(ReadOptions(), import_cfh_, "K3", &value);
+ ASSERT_EQ(value, "V1");
+ db_->Get(ReadOptions(), import_cfh_, "K4", &value);
+ ASSERT_EQ(value, "V2");
+ }
+}
+
+TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFilesWithOverlap) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"koko"}, options);
+
+ SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]);
+
+ // file3.sst
+ const std::string file3_sst_name = "file3.sst";
+ const std::string file3_sst = sst_files_dir_ + file3_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file3_sst));
+ for (int i = 0; i < 100; ++i) {
+ sfw_cf1.Put(Key(i), Key(i) + "_val");
+ }
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // file2.sst
+ const std::string file2_sst_name = "file2.sst";
+ const std::string file2_sst = sst_files_dir_ + file2_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file2_sst));
+ for (int i = 0; i < 100; i += 2) {
+ sfw_cf1.Put(Key(i), Key(i) + "_overwrite1");
+ }
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // file1a.sst
+ const std::string file1a_sst_name = "file1a.sst";
+ const std::string file1a_sst = sst_files_dir_ + file1a_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file1a_sst));
+ for (int i = 0; i < 52; i += 4) {
+ sfw_cf1.Put(Key(i), Key(i) + "_overwrite2");
+ }
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // file1b.sst
+ const std::string file1b_sst_name = "file1b.sst";
+ const std::string file1b_sst = sst_files_dir_ + file1b_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file1b_sst));
+ for (int i = 52; i < 100; i += 4) {
+ sfw_cf1.Put(Key(i), Key(i) + "_overwrite2");
+ }
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // file0a.sst
+ const std::string file0a_sst_name = "file0a.sst";
+ const std::string file0a_sst = sst_files_dir_ + file0a_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file0a_sst));
+ for (int i = 0; i < 100; i += 16) {
+ sfw_cf1.Put(Key(i), Key(i) + "_overwrite3");
+ }
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // file0b.sst
+ const std::string file0b_sst_name = "file0b.sst";
+ const std::string file0b_sst = sst_files_dir_ + file0b_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file0b_sst));
+ for (int i = 0; i < 100; i += 16) {
+ sfw_cf1.Put(Key(i), Key(i) + "_overwrite4");
+ }
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // Import sst files and verify
+ ExportImportFilesMetaData metadata;
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file3_sst_name, sst_files_dir_, 3, 10, 19));
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file2_sst_name, sst_files_dir_, 2, 20, 29));
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file1a_sst_name, sst_files_dir_, 1, 30, 34));
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file1b_sst_name, sst_files_dir_, 1, 35, 39));
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file0a_sst_name, sst_files_dir_, 0, 40, 49));
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file0b_sst_name, sst_files_dir_, 0, 50, 59));
+ metadata.db_comparator_name = options.comparator->Name();
+
+ ASSERT_OK(db_->CreateColumnFamilyWithImport(
+ options, "toto", ImportColumnFamilyOptions(), metadata, &import_cfh_));
+ ASSERT_NE(import_cfh_, nullptr);
+
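+ // Lookups should return the value from the lowest level that contains the
+ // key; within L0, the file with the higher sequence numbers (file0b) wins.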
+ for (int i = 0; i < 100; i++) {
+ std::string value;
+ db_->Get(ReadOptions(), import_cfh_, Key(i), &value);
+ if (i % 16 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite4");
+ } else if (i % 4 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite2");
+ } else if (i % 2 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite1");
+ } else {
+ ASSERT_EQ(value, Key(i) + "_val");
+ }
+ }
+
+ for (int i = 0; i < 100; i += 5) {
+ ASSERT_OK(
+ db_->Put(WriteOptions(), import_cfh_, Key(i), Key(i) + "_overwrite5"));
+ }
+
+ // Flush and check again
+ ASSERT_OK(db_->Flush(FlushOptions(), import_cfh_));
+ for (int i = 0; i < 100; i++) {
+ std::string value;
+ db_->Get(ReadOptions(), import_cfh_, Key(i), &value);
+ if (i % 5 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite5");
+ } else if (i % 16 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite4");
+ } else if (i % 4 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite2");
+ } else if (i % 2 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite1");
+ } else {
+ ASSERT_EQ(value, Key(i) + "_val");
+ }
+ }
+
+ // Compact and check again.
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), import_cfh_, nullptr, nullptr));
+ for (int i = 0; i < 100; i++) {
+ std::string value;
+ db_->Get(ReadOptions(), import_cfh_, Key(i), &value);
+ if (i % 5 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite5");
+ } else if (i % 16 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite4");
+ } else if (i % 4 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite2");
+ } else if (i % 2 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite1");
+ } else {
+ ASSERT_EQ(value, Key(i) + "_val");
+ }
+ }
+}
+
+TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherCF) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"koko"}, options);
+
+ for (int i = 0; i < 100; ++i) {
+ Put(1, Key(i), Key(i) + "_val");
+ }
+ ASSERT_OK(Flush(1));
+
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr));
+
+ // Overwrite the values of the same set of keys.
+ for (int i = 0; i < 100; ++i) {
+ Put(1, Key(i), Key(i) + "_overwrite");
+ }
+
+ // Flush to create L0 file.
+ ASSERT_OK(Flush(1));
+ for (int i = 0; i < 100; ++i) {
+ Put(1, Key(i), Key(i) + "_overwrite2");
+ }
+
+ // Flush again to create another L0 file. It should have higher sequence numbers.
+ ASSERT_OK(Flush(1));
+
+ Checkpoint* checkpoint;
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+ ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
+ &metadata_ptr_));
+ ASSERT_NE(metadata_ptr_, nullptr);
+ delete checkpoint;
+
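+ // Import the exported files twice: first with move_files = false so the
+ // exported copies stay in place, then with move_files = true to consume them.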
+ ImportColumnFamilyOptions import_options;
+ import_options.move_files = false;
+ ASSERT_OK(db_->CreateColumnFamilyWithImport(options, "toto", import_options,
+ *metadata_ptr_, &import_cfh_));
+ ASSERT_NE(import_cfh_, nullptr);
+
+ import_options.move_files = true;
+ ASSERT_OK(db_->CreateColumnFamilyWithImport(options, "yoyo", import_options,
+ *metadata_ptr_, &import_cfh2_));
+ ASSERT_NE(import_cfh2_, nullptr);
+ delete metadata_ptr_;
+ metadata_ptr_ = NULL;
+
+ std::string value1, value2;
+
+ for (int i = 0; i < 100; ++i) {
+ db_->Get(ReadOptions(), import_cfh_, Key(i), &value1);
+ ASSERT_EQ(Get(1, Key(i)), value1);
+ }
+
+ for (int i = 0; i < 100; ++i) {
+ db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2);
+ ASSERT_EQ(Get(1, Key(i)), value2);
+ }
+
+ // Modify keys in cf1 and verify.
+ for (int i = 0; i < 25; i++) {
+ ASSERT_OK(db_->Delete(WriteOptions(), import_cfh_, Key(i)));
+ }
+ for (int i = 25; i < 50; i++) {
+ ASSERT_OK(
+ db_->Put(WriteOptions(), import_cfh_, Key(i), Key(i) + "_overwrite3"));
+ }
+ for (int i = 0; i < 25; ++i) {
+ ASSERT_TRUE(
+ db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound());
+ }
+ for (int i = 25; i < 50; ++i) {
+ db_->Get(ReadOptions(), import_cfh_, Key(i), &value1);
+ ASSERT_EQ(Key(i) + "_overwrite3", value1);
+ }
+ for (int i = 50; i < 100; ++i) {
+ db_->Get(ReadOptions(), import_cfh_, Key(i), &value1);
+ ASSERT_EQ(Key(i) + "_overwrite2", value1);
+ }
+
+ for (int i = 0; i < 100; ++i) {
+ db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2);
+ ASSERT_EQ(Get(1, Key(i)), value2);
+ }
+
+ // Compact and check again.
+ ASSERT_OK(db_->Flush(FlushOptions(), import_cfh_));
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), import_cfh_, nullptr, nullptr));
+
+ for (int i = 0; i < 25; ++i) {
+ ASSERT_TRUE(
+ db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound());
+ }
+ for (int i = 25; i < 50; ++i) {
+ db_->Get(ReadOptions(), import_cfh_, Key(i), &value1);
+ ASSERT_EQ(Key(i) + "_overwrite3", value1);
+ }
+ for (int i = 50; i < 100; ++i) {
+ db_->Get(ReadOptions(), import_cfh_, Key(i), &value1);
+ ASSERT_EQ(Key(i) + "_overwrite2", value1);
+ }
+
+ for (int i = 0; i < 100; ++i) {
+ db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2);
+ ASSERT_EQ(Get(1, Key(i)), value2);
+ }
+}
+
+TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherDB) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"koko"}, options);
+
+ for (int i = 0; i < 100; ++i) {
+ Put(1, Key(i), Key(i) + "_val");
+ }
+ ASSERT_OK(Flush(1));
+
+ // Compact to create an L1 file.
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr));
+
+ // Overwrite the values of the same set of keys.
+ for (int i = 0; i < 50; ++i) {
+ Put(1, Key(i), Key(i) + "_overwrite");
+ }
+
+ // Flush to create L0 file.
+ ASSERT_OK(Flush(1));
+
+ for (int i = 0; i < 25; ++i) {
+ Put(1, Key(i), Key(i) + "_overwrite2");
+ }
+
+ // Flush again to create another L0 file. It should have higher sequence numbers.
+ ASSERT_OK(Flush(1));
+
+ Checkpoint* checkpoint;
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+ ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
+ &metadata_ptr_));
+ ASSERT_NE(metadata_ptr_, nullptr);
+ delete checkpoint;
+
+ // Create a new db and import the files.
+ DB* db_copy;
+ test::DestroyDir(env_, dbname_ + "/db_copy");
+ ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy));
+ ColumnFamilyHandle* cfh = nullptr;
+ ASSERT_OK(db_copy->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+ ImportColumnFamilyOptions(),
+ *metadata_ptr_, &cfh));
+ ASSERT_NE(cfh, nullptr);
+
+ for (int i = 0; i < 100; ++i) {
+ std::string value;
+ db_copy->Get(ReadOptions(), cfh, Key(i), &value);
+ ASSERT_EQ(Get(1, Key(i)), value);
+ }
+ db_copy->DropColumnFamily(cfh);
+ db_copy->DestroyColumnFamilyHandle(cfh);
+ delete db_copy;
+ test::DestroyDir(env_, dbname_ + "/db_copy");
+}
+
+TEST_F(ImportColumnFamilyTest, ImportColumnFamilyNegativeTest) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"koko"}, options);
+
+ {
+ // Create column family with existing cf name.
+ ExportImportFilesMetaData metadata;
+
+ ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "koko",
+ ImportColumnFamilyOptions(),
+ metadata, &import_cfh_),
+ Status::InvalidArgument("Column family already exists"));
+ ASSERT_EQ(import_cfh_, nullptr);
+ }
+
+ {
+ // Import with no files specified.
+ ExportImportFilesMetaData metadata;
+
+ ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+ ImportColumnFamilyOptions(),
+ metadata, &import_cfh_),
+ Status::InvalidArgument("The list of files is empty"));
+ ASSERT_EQ(import_cfh_, nullptr);
+ }
+
+ {
+ // Import with overlapping keys in sst files.
+ ExportImportFilesMetaData metadata;
+ SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]);
+ const std::string file1_sst_name = "file1.sst";
+ const std::string file1_sst = sst_files_dir_ + file1_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file1_sst));
+ ASSERT_OK(sfw_cf1.Put("K1", "V1"));
+ ASSERT_OK(sfw_cf1.Put("K2", "V2"));
+ ASSERT_OK(sfw_cf1.Finish());
+ const std::string file2_sst_name = "file2.sst";
+ const std::string file2_sst = sst_files_dir_ + file2_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file2_sst));
+ ASSERT_OK(sfw_cf1.Put("K2", "V2"));
+ ASSERT_OK(sfw_cf1.Put("K3", "V3"));
+ ASSERT_OK(sfw_cf1.Finish());
+
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19));
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file2_sst_name, sst_files_dir_, 1, 10, 19));
+ metadata.db_comparator_name = options.comparator->Name();
+
+ ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+ ImportColumnFamilyOptions(),
+ metadata, &import_cfh_),
+ Status::InvalidArgument("Files have overlapping ranges"));
+ ASSERT_EQ(import_cfh_, nullptr);
+ }
+
+ {
+ // Importing with a mismatching comparator should fail with an appropriate
+ // error.
+ ExportImportFilesMetaData metadata;
+ Options mismatch_options = CurrentOptions();
+ mismatch_options.comparator = ReverseBytewiseComparator();
+ SstFileWriter sfw_cf1(EnvOptions(), mismatch_options, handles_[1]);
+ const std::string file1_sst_name = "file1.sst";
+ const std::string file1_sst = sst_files_dir_ + file1_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file1_sst));
+ ASSERT_OK(sfw_cf1.Put("K2", "V2"));
+ ASSERT_OK(sfw_cf1.Put("K1", "V1"));
+ ASSERT_OK(sfw_cf1.Finish());
+
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19));
+ metadata.db_comparator_name = mismatch_options.comparator->Name();
+
+ ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "coco",
+ ImportColumnFamilyOptions(),
+ metadata, &import_cfh_),
+ Status::InvalidArgument("Comparator name mismatch"));
+ ASSERT_EQ(import_cfh_, nullptr);
+ }
+
+ {
+ // Importing a non-existent sst file should fail with an appropriate error.
+ ExportImportFilesMetaData metadata;
+ SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]);
+ const std::string file1_sst_name = "file1.sst";
+ const std::string file1_sst = sst_files_dir_ + file1_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file1_sst));
+ ASSERT_OK(sfw_cf1.Put("K1", "V1"));
+ ASSERT_OK(sfw_cf1.Put("K2", "V2"));
+ ASSERT_OK(sfw_cf1.Finish());
+ const std::string file3_sst_name = "file3.sst";
+
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19));
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file3_sst_name, sst_files_dir_, 1, 10, 19));
+ metadata.db_comparator_name = options.comparator->Name();
+
+ ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+ ImportColumnFamilyOptions(),
+ metadata, &import_cfh_),
+ Status::IOError("No such file or directory"));
+ ASSERT_EQ(import_cfh_, nullptr);
+
+ // Test a successful import after a failure with the same CF name. Ensures
+ // that a failed import leaves no side effects on the CF.
+ metadata.files.pop_back();
+ metadata.db_comparator_name = options.comparator->Name();
+
+ ASSERT_OK(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+ ImportColumnFamilyOptions(),
+ metadata, &import_cfh_));
+ ASSERT_NE(import_cfh_, nullptr);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as External SST File Writer and Import are not supported "
+ "in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/internal_stats.cc b/src/rocksdb/db/internal_stats.cc
new file mode 100644
index 000000000..f729ee7c7
--- /dev/null
+++ b/src/rocksdb/db/internal_stats.cc
@@ -0,0 +1,1424 @@
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/internal_stats.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <limits>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+
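+// Each LevelStat pairs the key used in the map-style output (property_name)
+// with the column header shown in the text table (header_name).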
+const std::map<LevelStatType, LevelStat> InternalStats::compaction_level_stats =
+ {
+ {LevelStatType::NUM_FILES, LevelStat{"NumFiles", "Files"}},
+ {LevelStatType::COMPACTED_FILES,
+ LevelStat{"CompactedFiles", "CompactedFiles"}},
+ {LevelStatType::SIZE_BYTES, LevelStat{"SizeBytes", "Size"}},
+ {LevelStatType::SCORE, LevelStat{"Score", "Score"}},
+ {LevelStatType::READ_GB, LevelStat{"ReadGB", "Read(GB)"}},
+ {LevelStatType::RN_GB, LevelStat{"RnGB", "Rn(GB)"}},
+ {LevelStatType::RNP1_GB, LevelStat{"Rnp1GB", "Rnp1(GB)"}},
+ {LevelStatType::WRITE_GB, LevelStat{"WriteGB", "Write(GB)"}},
+ {LevelStatType::W_NEW_GB, LevelStat{"WnewGB", "Wnew(GB)"}},
+ {LevelStatType::MOVED_GB, LevelStat{"MovedGB", "Moved(GB)"}},
+ {LevelStatType::WRITE_AMP, LevelStat{"WriteAmp", "W-Amp"}},
+ {LevelStatType::READ_MBPS, LevelStat{"ReadMBps", "Rd(MB/s)"}},
+ {LevelStatType::WRITE_MBPS, LevelStat{"WriteMBps", "Wr(MB/s)"}},
+ {LevelStatType::COMP_SEC, LevelStat{"CompSec", "Comp(sec)"}},
+ {LevelStatType::COMP_CPU_SEC,
+ LevelStat{"CompMergeCPU", "CompMergeCPU(sec)"}},
+ {LevelStatType::COMP_COUNT, LevelStat{"CompCount", "Comp(cnt)"}},
+ {LevelStatType::AVG_SEC, LevelStat{"AvgSec", "Avg(sec)"}},
+ {LevelStatType::KEY_IN, LevelStat{"KeyIn", "KeyIn"}},
+ {LevelStatType::KEY_DROP, LevelStat{"KeyDrop", "KeyDrop"}},
+};
+
+namespace {
+const double kMB = 1048576.0;
+const double kGB = kMB * 1024;
+const double kMicrosInSec = 1000000.0;
+
+void PrintLevelStatsHeader(char* buf, size_t len, const std::string& cf_name,
+ const std::string& group_by) {
+ int written_size =
+ snprintf(buf, len, "\n** Compaction Stats [%s] **\n", cf_name.c_str());
+ auto hdr = [](LevelStatType t) {
+ return InternalStats::compaction_level_stats.at(t).header_name.c_str();
+ };
+ int line_size = snprintf(
+ buf + written_size, len - written_size,
+ "%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n",
+ // Note that we skip COMPACTED_FILES and merge it into the Files column
+ group_by.c_str(), hdr(LevelStatType::NUM_FILES),
+ hdr(LevelStatType::SIZE_BYTES), hdr(LevelStatType::SCORE),
+ hdr(LevelStatType::READ_GB), hdr(LevelStatType::RN_GB),
+ hdr(LevelStatType::RNP1_GB), hdr(LevelStatType::WRITE_GB),
+ hdr(LevelStatType::W_NEW_GB), hdr(LevelStatType::MOVED_GB),
+ hdr(LevelStatType::WRITE_AMP), hdr(LevelStatType::READ_MBPS),
+ hdr(LevelStatType::WRITE_MBPS), hdr(LevelStatType::COMP_SEC),
+ hdr(LevelStatType::COMP_CPU_SEC), hdr(LevelStatType::COMP_COUNT),
+ hdr(LevelStatType::AVG_SEC), hdr(LevelStatType::KEY_IN),
+ hdr(LevelStatType::KEY_DROP));
+
+ written_size += line_size;
+ snprintf(buf + written_size, len - written_size, "%s\n",
+ std::string(line_size, '-').c_str());
+}
+
+void PrepareLevelStats(std::map<LevelStatType, double>* level_stats,
+ int num_files, int being_compacted,
+ double total_file_size, double score, double w_amp,
+ const InternalStats::CompactionStats& stats) {
+ uint64_t bytes_read =
+ stats.bytes_read_non_output_levels + stats.bytes_read_output_level;
+ int64_t bytes_new = stats.bytes_written - stats.bytes_read_output_level;
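+ // Add one microsecond so the rate computations below never divide by zero.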
+ double elapsed = (stats.micros + 1) / kMicrosInSec;
+
+ (*level_stats)[LevelStatType::NUM_FILES] = num_files;
+ (*level_stats)[LevelStatType::COMPACTED_FILES] = being_compacted;
+ (*level_stats)[LevelStatType::SIZE_BYTES] = total_file_size;
+ (*level_stats)[LevelStatType::SCORE] = score;
+ (*level_stats)[LevelStatType::READ_GB] = bytes_read / kGB;
+ (*level_stats)[LevelStatType::RN_GB] =
+ stats.bytes_read_non_output_levels / kGB;
+ (*level_stats)[LevelStatType::RNP1_GB] = stats.bytes_read_output_level / kGB;
+ (*level_stats)[LevelStatType::WRITE_GB] = stats.bytes_written / kGB;
+ (*level_stats)[LevelStatType::W_NEW_GB] = bytes_new / kGB;
+ (*level_stats)[LevelStatType::MOVED_GB] = stats.bytes_moved / kGB;
+ (*level_stats)[LevelStatType::WRITE_AMP] = w_amp;
+ (*level_stats)[LevelStatType::READ_MBPS] = bytes_read / kMB / elapsed;
+ (*level_stats)[LevelStatType::WRITE_MBPS] =
+ stats.bytes_written / kMB / elapsed;
+ (*level_stats)[LevelStatType::COMP_SEC] = stats.micros / kMicrosInSec;
+ (*level_stats)[LevelStatType::COMP_CPU_SEC] = stats.cpu_micros / kMicrosInSec;
+ (*level_stats)[LevelStatType::COMP_COUNT] = stats.count;
+ (*level_stats)[LevelStatType::AVG_SEC] =
+ stats.count == 0 ? 0 : stats.micros / kMicrosInSec / stats.count;
+ (*level_stats)[LevelStatType::KEY_IN] =
+ static_cast<double>(stats.num_input_records);
+ (*level_stats)[LevelStatType::KEY_DROP] =
+ static_cast<double>(stats.num_dropped_records);
+}
+
+void PrintLevelStats(char* buf, size_t len, const std::string& name,
+ const std::map<LevelStatType, double>& stat_value) {
+ snprintf(
+ buf, len,
+ "%4s " /* Level */
+ "%6d/%-3d " /* Files */
+ "%8s " /* Size */
+ "%5.1f " /* Score */
+ "%8.1f " /* Read(GB) */
+ "%7.1f " /* Rn(GB) */
+ "%8.1f " /* Rnp1(GB) */
+ "%9.1f " /* Write(GB) */
+ "%8.1f " /* Wnew(GB) */
+ "%9.1f " /* Moved(GB) */
+ "%5.1f " /* W-Amp */
+ "%8.1f " /* Rd(MB/s) */
+ "%8.1f " /* Wr(MB/s) */
+ "%9.2f " /* Comp(sec) */
+ "%17.2f " /* CompMergeCPU(sec) */
+ "%9d " /* Comp(cnt) */
+ "%8.3f " /* Avg(sec) */
+ "%7s " /* KeyIn */
+ "%6s\n", /* KeyDrop */
+ name.c_str(), static_cast<int>(stat_value.at(LevelStatType::NUM_FILES)),
+ static_cast<int>(stat_value.at(LevelStatType::COMPACTED_FILES)),
+ BytesToHumanString(
+ static_cast<uint64_t>(stat_value.at(LevelStatType::SIZE_BYTES)))
+ .c_str(),
+ stat_value.at(LevelStatType::SCORE),
+ stat_value.at(LevelStatType::READ_GB),
+ stat_value.at(LevelStatType::RN_GB),
+ stat_value.at(LevelStatType::RNP1_GB),
+ stat_value.at(LevelStatType::WRITE_GB),
+ stat_value.at(LevelStatType::W_NEW_GB),
+ stat_value.at(LevelStatType::MOVED_GB),
+ stat_value.at(LevelStatType::WRITE_AMP),
+ stat_value.at(LevelStatType::READ_MBPS),
+ stat_value.at(LevelStatType::WRITE_MBPS),
+ stat_value.at(LevelStatType::COMP_SEC),
+ stat_value.at(LevelStatType::COMP_CPU_SEC),
+ static_cast<int>(stat_value.at(LevelStatType::COMP_COUNT)),
+ stat_value.at(LevelStatType::AVG_SEC),
+ NumberToHumanString(
+ static_cast<std::int64_t>(stat_value.at(LevelStatType::KEY_IN)))
+ .c_str(),
+ NumberToHumanString(
+ static_cast<std::int64_t>(stat_value.at(LevelStatType::KEY_DROP)))
+ .c_str());
+}
+
+void PrintLevelStats(char* buf, size_t len, const std::string& name,
+ int num_files, int being_compacted, double total_file_size,
+ double score, double w_amp,
+ const InternalStats::CompactionStats& stats) {
+ std::map<LevelStatType, double> level_stats;
+ PrepareLevelStats(&level_stats, num_files, being_compacted, total_file_size,
+ score, w_amp, stats);
+ PrintLevelStats(buf, len, name, level_stats);
+}
+
+// Assumes that trailing numbers represent an optional argument. This requires
+// property names to not end with numbers.
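+// For example, "rocksdb.num-files-at-level2" is split into the name
+// "rocksdb.num-files-at-level" and the argument "2".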
+std::pair<Slice, Slice> GetPropertyNameAndArg(const Slice& property) {
+ Slice name = property, arg = property;
+ size_t sfx_len = 0;
+ while (sfx_len < property.size() &&
+ isdigit(property[property.size() - sfx_len - 1])) {
+ ++sfx_len;
+ }
+ name.remove_suffix(sfx_len);
+ arg.remove_prefix(property.size() - sfx_len);
+ return {name, arg};
+}
+} // anonymous namespace
+
+static const std::string rocksdb_prefix = "rocksdb.";
+
+static const std::string num_files_at_level_prefix = "num-files-at-level";
+static const std::string compression_ratio_at_level_prefix =
+ "compression-ratio-at-level";
+static const std::string allstats = "stats";
+static const std::string sstables = "sstables";
+static const std::string cfstats = "cfstats";
+static const std::string cfstats_no_file_histogram =
+ "cfstats-no-file-histogram";
+static const std::string cf_file_histogram = "cf-file-histogram";
+static const std::string dbstats = "dbstats";
+static const std::string levelstats = "levelstats";
+static const std::string num_immutable_mem_table = "num-immutable-mem-table";
+static const std::string num_immutable_mem_table_flushed =
+ "num-immutable-mem-table-flushed";
+static const std::string mem_table_flush_pending = "mem-table-flush-pending";
+static const std::string compaction_pending = "compaction-pending";
+static const std::string background_errors = "background-errors";
+static const std::string cur_size_active_mem_table =
+ "cur-size-active-mem-table";
+static const std::string cur_size_all_mem_tables = "cur-size-all-mem-tables";
+static const std::string size_all_mem_tables = "size-all-mem-tables";
+static const std::string num_entries_active_mem_table =
+ "num-entries-active-mem-table";
+static const std::string num_entries_imm_mem_tables =
+ "num-entries-imm-mem-tables";
+static const std::string num_deletes_active_mem_table =
+ "num-deletes-active-mem-table";
+static const std::string num_deletes_imm_mem_tables =
+ "num-deletes-imm-mem-tables";
+static const std::string estimate_num_keys = "estimate-num-keys";
+static const std::string estimate_table_readers_mem =
+ "estimate-table-readers-mem";
+static const std::string is_file_deletions_enabled =
+ "is-file-deletions-enabled";
+static const std::string num_snapshots = "num-snapshots";
+static const std::string oldest_snapshot_time = "oldest-snapshot-time";
+static const std::string oldest_snapshot_sequence = "oldest-snapshot-sequence";
+static const std::string num_live_versions = "num-live-versions";
+static const std::string current_version_number =
+ "current-super-version-number";
+static const std::string estimate_live_data_size = "estimate-live-data-size";
+static const std::string min_log_number_to_keep_str = "min-log-number-to-keep";
+static const std::string min_obsolete_sst_number_to_keep_str =
+ "min-obsolete-sst-number-to-keep";
+static const std::string base_level_str = "base-level";
+static const std::string total_sst_files_size = "total-sst-files-size";
+static const std::string live_sst_files_size = "live-sst-files-size";
+static const std::string estimate_pending_comp_bytes =
+ "estimate-pending-compaction-bytes";
+static const std::string aggregated_table_properties =
+ "aggregated-table-properties";
+static const std::string aggregated_table_properties_at_level =
+ aggregated_table_properties + "-at-level";
+static const std::string num_running_compactions = "num-running-compactions";
+static const std::string num_running_flushes = "num-running-flushes";
+static const std::string actual_delayed_write_rate =
+ "actual-delayed-write-rate";
+static const std::string is_write_stopped = "is-write-stopped";
+static const std::string estimate_oldest_key_time = "estimate-oldest-key-time";
+static const std::string block_cache_capacity = "block-cache-capacity";
+static const std::string block_cache_usage = "block-cache-usage";
+static const std::string block_cache_pinned_usage = "block-cache-pinned-usage";
+static const std::string options_statistics = "options-statistics";
+
+const std::string DB::Properties::kNumFilesAtLevelPrefix =
+ rocksdb_prefix + num_files_at_level_prefix;
+const std::string DB::Properties::kCompressionRatioAtLevelPrefix =
+ rocksdb_prefix + compression_ratio_at_level_prefix;
+const std::string DB::Properties::kStats = rocksdb_prefix + allstats;
+const std::string DB::Properties::kSSTables = rocksdb_prefix + sstables;
+const std::string DB::Properties::kCFStats = rocksdb_prefix + cfstats;
+const std::string DB::Properties::kCFStatsNoFileHistogram =
+ rocksdb_prefix + cfstats_no_file_histogram;
+const std::string DB::Properties::kCFFileHistogram =
+ rocksdb_prefix + cf_file_histogram;
+const std::string DB::Properties::kDBStats = rocksdb_prefix + dbstats;
+const std::string DB::Properties::kLevelStats = rocksdb_prefix + levelstats;
+const std::string DB::Properties::kNumImmutableMemTable =
+ rocksdb_prefix + num_immutable_mem_table;
+const std::string DB::Properties::kNumImmutableMemTableFlushed =
+ rocksdb_prefix + num_immutable_mem_table_flushed;
+const std::string DB::Properties::kMemTableFlushPending =
+ rocksdb_prefix + mem_table_flush_pending;
+const std::string DB::Properties::kCompactionPending =
+ rocksdb_prefix + compaction_pending;
+const std::string DB::Properties::kNumRunningCompactions =
+ rocksdb_prefix + num_running_compactions;
+const std::string DB::Properties::kNumRunningFlushes =
+ rocksdb_prefix + num_running_flushes;
+const std::string DB::Properties::kBackgroundErrors =
+ rocksdb_prefix + background_errors;
+const std::string DB::Properties::kCurSizeActiveMemTable =
+ rocksdb_prefix + cur_size_active_mem_table;
+const std::string DB::Properties::kCurSizeAllMemTables =
+ rocksdb_prefix + cur_size_all_mem_tables;
+const std::string DB::Properties::kSizeAllMemTables =
+ rocksdb_prefix + size_all_mem_tables;
+const std::string DB::Properties::kNumEntriesActiveMemTable =
+ rocksdb_prefix + num_entries_active_mem_table;
+const std::string DB::Properties::kNumEntriesImmMemTables =
+ rocksdb_prefix + num_entries_imm_mem_tables;
+const std::string DB::Properties::kNumDeletesActiveMemTable =
+ rocksdb_prefix + num_deletes_active_mem_table;
+const std::string DB::Properties::kNumDeletesImmMemTables =
+ rocksdb_prefix + num_deletes_imm_mem_tables;
+const std::string DB::Properties::kEstimateNumKeys =
+ rocksdb_prefix + estimate_num_keys;
+const std::string DB::Properties::kEstimateTableReadersMem =
+ rocksdb_prefix + estimate_table_readers_mem;
+const std::string DB::Properties::kIsFileDeletionsEnabled =
+ rocksdb_prefix + is_file_deletions_enabled;
+const std::string DB::Properties::kNumSnapshots =
+ rocksdb_prefix + num_snapshots;
+const std::string DB::Properties::kOldestSnapshotTime =
+ rocksdb_prefix + oldest_snapshot_time;
+const std::string DB::Properties::kOldestSnapshotSequence =
+ rocksdb_prefix + oldest_snapshot_sequence;
+const std::string DB::Properties::kNumLiveVersions =
+ rocksdb_prefix + num_live_versions;
+const std::string DB::Properties::kCurrentSuperVersionNumber =
+ rocksdb_prefix + current_version_number;
+const std::string DB::Properties::kEstimateLiveDataSize =
+ rocksdb_prefix + estimate_live_data_size;
+const std::string DB::Properties::kMinLogNumberToKeep =
+ rocksdb_prefix + min_log_number_to_keep_str;
+const std::string DB::Properties::kMinObsoleteSstNumberToKeep =
+ rocksdb_prefix + min_obsolete_sst_number_to_keep_str;
+const std::string DB::Properties::kTotalSstFilesSize =
+ rocksdb_prefix + total_sst_files_size;
+const std::string DB::Properties::kLiveSstFilesSize =
+ rocksdb_prefix + live_sst_files_size;
+const std::string DB::Properties::kBaseLevel = rocksdb_prefix + base_level_str;
+const std::string DB::Properties::kEstimatePendingCompactionBytes =
+ rocksdb_prefix + estimate_pending_comp_bytes;
+const std::string DB::Properties::kAggregatedTableProperties =
+ rocksdb_prefix + aggregated_table_properties;
+const std::string DB::Properties::kAggregatedTablePropertiesAtLevel =
+ rocksdb_prefix + aggregated_table_properties_at_level;
+const std::string DB::Properties::kActualDelayedWriteRate =
+ rocksdb_prefix + actual_delayed_write_rate;
+const std::string DB::Properties::kIsWriteStopped =
+ rocksdb_prefix + is_write_stopped;
+const std::string DB::Properties::kEstimateOldestKeyTime =
+ rocksdb_prefix + estimate_oldest_key_time;
+const std::string DB::Properties::kBlockCacheCapacity =
+ rocksdb_prefix + block_cache_capacity;
+const std::string DB::Properties::kBlockCacheUsage =
+ rocksdb_prefix + block_cache_usage;
+const std::string DB::Properties::kBlockCachePinnedUsage =
+ rocksdb_prefix + block_cache_pinned_usage;
+const std::string DB::Properties::kOptionsStatistics =
+ rocksdb_prefix + options_statistics;
+
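+// Each DBPropertyInfo initializer below lists, in order: need_out_of_mutex,
+// handle_string, handle_int, handle_map, and the DBImpl string handler (see
+// the struct definition in internal_stats.h).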
+const std::unordered_map<std::string, DBPropertyInfo>
+ InternalStats::ppt_name_to_info = {
+ {DB::Properties::kNumFilesAtLevelPrefix,
+ {false, &InternalStats::HandleNumFilesAtLevel, nullptr, nullptr,
+ nullptr}},
+ {DB::Properties::kCompressionRatioAtLevelPrefix,
+ {false, &InternalStats::HandleCompressionRatioAtLevelPrefix, nullptr,
+ nullptr, nullptr}},
+ {DB::Properties::kLevelStats,
+ {false, &InternalStats::HandleLevelStats, nullptr, nullptr, nullptr}},
+ {DB::Properties::kStats,
+ {false, &InternalStats::HandleStats, nullptr, nullptr, nullptr}},
+ {DB::Properties::kCFStats,
+ {false, &InternalStats::HandleCFStats, nullptr,
+ &InternalStats::HandleCFMapStats, nullptr}},
+ {DB::Properties::kCFStatsNoFileHistogram,
+ {false, &InternalStats::HandleCFStatsNoFileHistogram, nullptr, nullptr,
+ nullptr}},
+ {DB::Properties::kCFFileHistogram,
+ {false, &InternalStats::HandleCFFileHistogram, nullptr, nullptr,
+ nullptr}},
+ {DB::Properties::kDBStats,
+ {false, &InternalStats::HandleDBStats, nullptr, nullptr, nullptr}},
+ {DB::Properties::kSSTables,
+ {false, &InternalStats::HandleSsTables, nullptr, nullptr, nullptr}},
+ {DB::Properties::kAggregatedTableProperties,
+ {false, &InternalStats::HandleAggregatedTableProperties, nullptr,
+ nullptr, nullptr}},
+ {DB::Properties::kAggregatedTablePropertiesAtLevel,
+ {false, &InternalStats::HandleAggregatedTablePropertiesAtLevel,
+ nullptr, nullptr, nullptr}},
+ {DB::Properties::kNumImmutableMemTable,
+ {false, nullptr, &InternalStats::HandleNumImmutableMemTable, nullptr,
+ nullptr}},
+ {DB::Properties::kNumImmutableMemTableFlushed,
+ {false, nullptr, &InternalStats::HandleNumImmutableMemTableFlushed,
+ nullptr, nullptr}},
+ {DB::Properties::kMemTableFlushPending,
+ {false, nullptr, &InternalStats::HandleMemTableFlushPending, nullptr,
+ nullptr}},
+ {DB::Properties::kCompactionPending,
+ {false, nullptr, &InternalStats::HandleCompactionPending, nullptr,
+ nullptr}},
+ {DB::Properties::kBackgroundErrors,
+ {false, nullptr, &InternalStats::HandleBackgroundErrors, nullptr,
+ nullptr}},
+ {DB::Properties::kCurSizeActiveMemTable,
+ {false, nullptr, &InternalStats::HandleCurSizeActiveMemTable, nullptr,
+ nullptr}},
+ {DB::Properties::kCurSizeAllMemTables,
+ {false, nullptr, &InternalStats::HandleCurSizeAllMemTables, nullptr,
+ nullptr}},
+ {DB::Properties::kSizeAllMemTables,
+ {false, nullptr, &InternalStats::HandleSizeAllMemTables, nullptr,
+ nullptr}},
+ {DB::Properties::kNumEntriesActiveMemTable,
+ {false, nullptr, &InternalStats::HandleNumEntriesActiveMemTable,
+ nullptr, nullptr}},
+ {DB::Properties::kNumEntriesImmMemTables,
+ {false, nullptr, &InternalStats::HandleNumEntriesImmMemTables, nullptr,
+ nullptr}},
+ {DB::Properties::kNumDeletesActiveMemTable,
+ {false, nullptr, &InternalStats::HandleNumDeletesActiveMemTable,
+ nullptr, nullptr}},
+ {DB::Properties::kNumDeletesImmMemTables,
+ {false, nullptr, &InternalStats::HandleNumDeletesImmMemTables, nullptr,
+ nullptr}},
+ {DB::Properties::kEstimateNumKeys,
+ {false, nullptr, &InternalStats::HandleEstimateNumKeys, nullptr,
+ nullptr}},
+ {DB::Properties::kEstimateTableReadersMem,
+ {true, nullptr, &InternalStats::HandleEstimateTableReadersMem, nullptr,
+ nullptr}},
+ {DB::Properties::kIsFileDeletionsEnabled,
+ {false, nullptr, &InternalStats::HandleIsFileDeletionsEnabled, nullptr,
+ nullptr}},
+ {DB::Properties::kNumSnapshots,
+ {false, nullptr, &InternalStats::HandleNumSnapshots, nullptr,
+ nullptr}},
+ {DB::Properties::kOldestSnapshotTime,
+ {false, nullptr, &InternalStats::HandleOldestSnapshotTime, nullptr,
+ nullptr}},
+ {DB::Properties::kOldestSnapshotSequence,
+ {false, nullptr, &InternalStats::HandleOldestSnapshotSequence, nullptr,
+ nullptr}},
+ {DB::Properties::kNumLiveVersions,
+ {false, nullptr, &InternalStats::HandleNumLiveVersions, nullptr,
+ nullptr}},
+ {DB::Properties::kCurrentSuperVersionNumber,
+ {false, nullptr, &InternalStats::HandleCurrentSuperVersionNumber,
+ nullptr, nullptr}},
+ {DB::Properties::kEstimateLiveDataSize,
+ {true, nullptr, &InternalStats::HandleEstimateLiveDataSize, nullptr,
+ nullptr}},
+ {DB::Properties::kMinLogNumberToKeep,
+ {false, nullptr, &InternalStats::HandleMinLogNumberToKeep, nullptr,
+ nullptr}},
+ {DB::Properties::kMinObsoleteSstNumberToKeep,
+ {false, nullptr, &InternalStats::HandleMinObsoleteSstNumberToKeep,
+ nullptr, nullptr}},
+ {DB::Properties::kBaseLevel,
+ {false, nullptr, &InternalStats::HandleBaseLevel, nullptr, nullptr}},
+ {DB::Properties::kTotalSstFilesSize,
+ {false, nullptr, &InternalStats::HandleTotalSstFilesSize, nullptr,
+ nullptr}},
+ {DB::Properties::kLiveSstFilesSize,
+ {false, nullptr, &InternalStats::HandleLiveSstFilesSize, nullptr,
+ nullptr}},
+ {DB::Properties::kEstimatePendingCompactionBytes,
+ {false, nullptr, &InternalStats::HandleEstimatePendingCompactionBytes,
+ nullptr, nullptr}},
+ {DB::Properties::kNumRunningFlushes,
+ {false, nullptr, &InternalStats::HandleNumRunningFlushes, nullptr,
+ nullptr}},
+ {DB::Properties::kNumRunningCompactions,
+ {false, nullptr, &InternalStats::HandleNumRunningCompactions, nullptr,
+ nullptr}},
+ {DB::Properties::kActualDelayedWriteRate,
+ {false, nullptr, &InternalStats::HandleActualDelayedWriteRate, nullptr,
+ nullptr}},
+ {DB::Properties::kIsWriteStopped,
+ {false, nullptr, &InternalStats::HandleIsWriteStopped, nullptr,
+ nullptr}},
+ {DB::Properties::kEstimateOldestKeyTime,
+ {false, nullptr, &InternalStats::HandleEstimateOldestKeyTime, nullptr,
+ nullptr}},
+ {DB::Properties::kBlockCacheCapacity,
+ {false, nullptr, &InternalStats::HandleBlockCacheCapacity, nullptr,
+ nullptr}},
+ {DB::Properties::kBlockCacheUsage,
+ {false, nullptr, &InternalStats::HandleBlockCacheUsage, nullptr,
+ nullptr}},
+ {DB::Properties::kBlockCachePinnedUsage,
+ {false, nullptr, &InternalStats::HandleBlockCachePinnedUsage, nullptr,
+ nullptr}},
+ {DB::Properties::kOptionsStatistics,
+ {false, nullptr, nullptr, nullptr,
+ &DBImpl::GetPropertyHandleOptionsStatistics}},
+};
+
+const DBPropertyInfo* GetPropertyInfo(const Slice& property) {
+ std::string ppt_name = GetPropertyNameAndArg(property).first.ToString();
+ auto ppt_info_iter = InternalStats::ppt_name_to_info.find(ppt_name);
+ if (ppt_info_iter == InternalStats::ppt_name_to_info.end()) {
+ return nullptr;
+ }
+ return &ppt_info_iter->second;
+}
+
+bool InternalStats::GetStringProperty(const DBPropertyInfo& property_info,
+ const Slice& property,
+ std::string* value) {
+ assert(value != nullptr);
+ assert(property_info.handle_string != nullptr);
+ Slice arg = GetPropertyNameAndArg(property).second;
+ return (this->*(property_info.handle_string))(value, arg);
+}
+
+bool InternalStats::GetMapProperty(const DBPropertyInfo& property_info,
+ const Slice& /*property*/,
+ std::map<std::string, std::string>* value) {
+ assert(value != nullptr);
+ assert(property_info.handle_map != nullptr);
+ return (this->*(property_info.handle_map))(value);
+}
+
+bool InternalStats::GetIntProperty(const DBPropertyInfo& property_info,
+ uint64_t* value, DBImpl* db) {
+ assert(value != nullptr);
+ assert(property_info.handle_int != nullptr &&
+ !property_info.need_out_of_mutex);
+ db->mutex_.AssertHeld();
+ return (this->*(property_info.handle_int))(value, db, nullptr /* version */);
+}
+
+bool InternalStats::GetIntPropertyOutOfMutex(
+ const DBPropertyInfo& property_info, Version* version, uint64_t* value) {
+ assert(value != nullptr);
+ assert(property_info.handle_int != nullptr &&
+ property_info.need_out_of_mutex);
+ return (this->*(property_info.handle_int))(value, nullptr /* db */, version);
+}
+
+bool InternalStats::HandleNumFilesAtLevel(std::string* value, Slice suffix) {
+ uint64_t level;
+ const auto* vstorage = cfd_->current()->storage_info();
+ bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty();
+ if (!ok || static_cast<int>(level) >= number_levels_) {
+ return false;
+ } else {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%d",
+ vstorage->NumLevelFiles(static_cast<int>(level)));
+ *value = buf;
+ return true;
+ }
+}
+
+bool InternalStats::HandleCompressionRatioAtLevelPrefix(std::string* value,
+ Slice suffix) {
+ uint64_t level;
+ const auto* vstorage = cfd_->current()->storage_info();
+ bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty();
+ if (!ok || level >= static_cast<uint64_t>(number_levels_)) {
+ return false;
+ }
+ *value = ToString(
+ vstorage->GetEstimatedCompressionRatioAtLevel(static_cast<int>(level)));
+ return true;
+}
+
+bool InternalStats::HandleLevelStats(std::string* value, Slice /*suffix*/) {
+ char buf[1000];
+ const auto* vstorage = cfd_->current()->storage_info();
+ snprintf(buf, sizeof(buf),
+ "Level Files Size(MB)\n"
+ "--------------------\n");
+ value->append(buf);
+
+ for (int level = 0; level < number_levels_; level++) {
+ snprintf(buf, sizeof(buf), "%3d %8d %8.0f\n", level,
+ vstorage->NumLevelFiles(level),
+ vstorage->NumLevelBytes(level) / kMB);
+ value->append(buf);
+ }
+ return true;
+}
+
+bool InternalStats::HandleStats(std::string* value, Slice suffix) {
+ if (!HandleCFStats(value, suffix)) {
+ return false;
+ }
+ if (!HandleDBStats(value, suffix)) {
+ return false;
+ }
+ return true;
+}
+
+bool InternalStats::HandleCFMapStats(
+ std::map<std::string, std::string>* cf_stats) {
+ DumpCFMapStats(cf_stats);
+ return true;
+}
+
+bool InternalStats::HandleCFStats(std::string* value, Slice /*suffix*/) {
+ DumpCFStats(value);
+ return true;
+}
+
+bool InternalStats::HandleCFStatsNoFileHistogram(std::string* value,
+ Slice /*suffix*/) {
+ DumpCFStatsNoFileHistogram(value);
+ return true;
+}
+
+bool InternalStats::HandleCFFileHistogram(std::string* value,
+ Slice /*suffix*/) {
+ DumpCFFileHistogram(value);
+ return true;
+}
+
+bool InternalStats::HandleDBStats(std::string* value, Slice /*suffix*/) {
+ DumpDBStats(value);
+ return true;
+}
+
+bool InternalStats::HandleSsTables(std::string* value, Slice /*suffix*/) {
+ auto* current = cfd_->current();
+ *value = current->DebugString(true, true);
+ return true;
+}
+
+bool InternalStats::HandleAggregatedTableProperties(std::string* value,
+ Slice /*suffix*/) {
+ std::shared_ptr<const TableProperties> tp;
+ auto s = cfd_->current()->GetAggregatedTableProperties(&tp);
+ if (!s.ok()) {
+ return false;
+ }
+ *value = tp->ToString();
+ return true;
+}
+
+bool InternalStats::HandleAggregatedTablePropertiesAtLevel(std::string* value,
+ Slice suffix) {
+ uint64_t level;
+ bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty();
+ if (!ok || static_cast<int>(level) >= number_levels_) {
+ return false;
+ }
+ std::shared_ptr<const TableProperties> tp;
+ auto s = cfd_->current()->GetAggregatedTableProperties(
+ &tp, static_cast<int>(level));
+ if (!s.ok()) {
+ return false;
+ }
+ *value = tp->ToString();
+ return true;
+}
+
+bool InternalStats::HandleNumImmutableMemTable(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = cfd_->imm()->NumNotFlushed();
+ return true;
+}
+
+bool InternalStats::HandleNumImmutableMemTableFlushed(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = cfd_->imm()->NumFlushed();
+ return true;
+}
+
+bool InternalStats::HandleMemTableFlushPending(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = (cfd_->imm()->IsFlushPending() ? 1 : 0);
+ return true;
+}
+
+bool InternalStats::HandleNumRunningFlushes(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = db->num_running_flushes();
+ return true;
+}
+
+bool InternalStats::HandleCompactionPending(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ // 1 if the system has already determined that at least one compaction is
+ // needed; 0 otherwise.
+ const auto* vstorage = cfd_->current()->storage_info();
+ *value = (cfd_->compaction_picker()->NeedsCompaction(vstorage) ? 1 : 0);
+ return true;
+}
+
+bool InternalStats::HandleNumRunningCompactions(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = db->num_running_compactions_;
+ return true;
+}
+
+bool InternalStats::HandleBackgroundErrors(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ // Accumulated number of errors in background flushes or compactions.
+ *value = GetBackgroundErrorCount();
+ return true;
+}
+
+bool InternalStats::HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ // Current size of the active memtable
+ *value = cfd_->mem()->ApproximateMemoryUsage();
+ return true;
+}
+
+bool InternalStats::HandleCurSizeAllMemTables(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ // Current size of the active memtable + immutable memtables
+ *value = cfd_->mem()->ApproximateMemoryUsage() +
+ cfd_->imm()->ApproximateUnflushedMemTablesMemoryUsage();
+ return true;
+}
+
+bool InternalStats::HandleSizeAllMemTables(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = cfd_->mem()->ApproximateMemoryUsage() +
+ cfd_->imm()->ApproximateMemoryUsage();
+ return true;
+}
+
+bool InternalStats::HandleNumEntriesActiveMemTable(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+ // Current number of entries in the active memtable
+ *value = cfd_->mem()->num_entries();
+ return true;
+}
+
+bool InternalStats::HandleNumEntriesImmMemTables(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+ // Current number of entries in the immutable memtables
+ *value = cfd_->imm()->current()->GetTotalNumEntries();
+ return true;
+}
+
+bool InternalStats::HandleNumDeletesActiveMemTable(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+ // Current number of delete entries in the active memtable
+ *value = cfd_->mem()->num_deletes();
+ return true;
+}
+
+bool InternalStats::HandleNumDeletesImmMemTables(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+ // Current number of entries in the immutable memtables
+ *value = cfd_->imm()->current()->GetTotalNumDeletes();
+ return true;
+}
+
+bool InternalStats::HandleEstimateNumKeys(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ // Estimate number of entries in the column family:
+ // Use estimated entries in tables + total entries in memtables.
+ const auto* vstorage = cfd_->current()->storage_info();
+ uint64_t estimate_keys = cfd_->mem()->num_entries() +
+ cfd_->imm()->current()->GetTotalNumEntries() +
+ vstorage->GetEstimatedActiveKeys();
+ uint64_t estimate_deletes =
+ cfd_->mem()->num_deletes() + cfd_->imm()->current()->GetTotalNumDeletes();
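+ // Each delete is counted twice: once for its own tombstone entry and once
+ // for the existing key it is expected to remove, hence the 2x factor below.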
+ *value = estimate_keys > estimate_deletes * 2
+ ? estimate_keys - (estimate_deletes * 2)
+ : 0;
+ return true;
+}
+
+bool InternalStats::HandleNumSnapshots(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = db->snapshots().count();
+ return true;
+}
+
+bool InternalStats::HandleOldestSnapshotTime(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = static_cast<uint64_t>(db->snapshots().GetOldestSnapshotTime());
+ return true;
+}
+
+bool InternalStats::HandleOldestSnapshotSequence(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = static_cast<uint64_t>(db->snapshots().GetOldestSnapshotSequence());
+ return true;
+}
+
+bool InternalStats::HandleNumLiveVersions(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = cfd_->GetNumLiveVersions();
+ return true;
+}
+
+bool InternalStats::HandleCurrentSuperVersionNumber(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = cfd_->GetSuperVersionNumber();
+ return true;
+}
+
+bool InternalStats::HandleIsFileDeletionsEnabled(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = db->IsFileDeletionsEnabled();
+ return true;
+}
+
+bool InternalStats::HandleBaseLevel(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ const auto* vstorage = cfd_->current()->storage_info();
+ *value = vstorage->base_level();
+ return true;
+}
+
+bool InternalStats::HandleTotalSstFilesSize(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = cfd_->GetTotalSstFilesSize();
+ return true;
+}
+
+bool InternalStats::HandleLiveSstFilesSize(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = cfd_->GetLiveSstFilesSize();
+ return true;
+}
+
+bool InternalStats::HandleEstimatePendingCompactionBytes(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+ const auto* vstorage = cfd_->current()->storage_info();
+ *value = vstorage->estimated_compaction_needed_bytes();
+ return true;
+}
+
+bool InternalStats::HandleEstimateTableReadersMem(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* version) {
+ *value = (version == nullptr) ? 0 : version->GetMemoryUsageByTableReaders();
+ return true;
+}
+
+bool InternalStats::HandleEstimateLiveDataSize(uint64_t* value, DBImpl* /*db*/,
+ Version* version) {
+ const auto* vstorage = version->storage_info();
+ *value = vstorage->EstimateLiveDataSize();
+ return true;
+}
+
+bool InternalStats::HandleMinLogNumberToKeep(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = db->MinLogNumberToKeep();
+ return true;
+}
+
+bool InternalStats::HandleMinObsoleteSstNumberToKeep(uint64_t* value,
+ DBImpl* db,
+ Version* /*version*/) {
+ *value = db->MinObsoleteSstNumberToKeep();
+ return true;
+}
+
+bool InternalStats::HandleActualDelayedWriteRate(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ const WriteController& wc = db->write_controller();
+ if (!wc.NeedsDelay()) {
+ *value = 0;
+ } else {
+ *value = wc.delayed_write_rate();
+ }
+ return true;
+}
+
+bool InternalStats::HandleIsWriteStopped(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = db->write_controller().IsStopped() ? 1 : 0;
+ return true;
+}
+
+bool InternalStats::HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ // TODO(yiwu): The property is currently only available for FIFO compaction
+ // with allow_compaction = false. This is because we don't propagate
+ // oldest_key_time on compaction.
+ if (cfd_->ioptions()->compaction_style != kCompactionStyleFIFO ||
+ cfd_->GetCurrentMutableCFOptions()
+ ->compaction_options_fifo.allow_compaction) {
+ return false;
+ }
+
+ TablePropertiesCollection collection;
+ auto s = cfd_->current()->GetPropertiesOfAllTables(&collection);
+ if (!s.ok()) {
+ return false;
+ }
+ *value = std::numeric_limits<uint64_t>::max();
+ for (auto& p : collection) {
+ *value = std::min(*value, p.second->oldest_key_time);
+ if (*value == 0) {
+ break;
+ }
+ }
+ if (*value > 0) {
+ *value = std::min({cfd_->mem()->ApproximateOldestKeyTime(),
+ cfd_->imm()->ApproximateOldestKeyTime(), *value});
+ }
+ return *value > 0 && *value < std::numeric_limits<uint64_t>::max();
+}
+
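+// Helper for the block-cache properties below: retrieves the block cache from
+// the CF's BlockBasedTableFactory, or returns false if the CF does not use a
+// block-based table with a block cache.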
+bool InternalStats::HandleBlockCacheStat(Cache** block_cache) {
+ assert(block_cache != nullptr);
+ auto* table_factory = cfd_->ioptions()->table_factory;
+ assert(table_factory != nullptr);
+ if (BlockBasedTableFactory::kName != table_factory->Name()) {
+ return false;
+ }
+ auto* table_options =
+ reinterpret_cast<BlockBasedTableOptions*>(table_factory->GetOptions());
+ if (table_options == nullptr) {
+ return false;
+ }
+ *block_cache = table_options->block_cache.get();
+ if (table_options->no_block_cache || *block_cache == nullptr) {
+ return false;
+ }
+ return true;
+}
+
+bool InternalStats::HandleBlockCacheCapacity(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ Cache* block_cache;
+ bool ok = HandleBlockCacheStat(&block_cache);
+ if (!ok) {
+ return false;
+ }
+ *value = static_cast<uint64_t>(block_cache->GetCapacity());
+ return true;
+}
+
+bool InternalStats::HandleBlockCacheUsage(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ Cache* block_cache;
+ bool ok = HandleBlockCacheStat(&block_cache);
+ if (!ok) {
+ return false;
+ }
+ *value = static_cast<uint64_t>(block_cache->GetUsage());
+ return true;
+}
+
+bool InternalStats::HandleBlockCachePinnedUsage(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ Cache* block_cache;
+ bool ok = HandleBlockCacheStat(&block_cache);
+ if (!ok) {
+ return false;
+ }
+ *value = static_cast<uint64_t>(block_cache->GetPinnedUsage());
+ return true;
+}
+
+void InternalStats::DumpDBStats(std::string* value) {
+ char buf[1000];
+ // DB-level stats, only available from default column family
+ double seconds_up = (env_->NowMicros() - started_at_ + 1) / kMicrosInSec;
+ double interval_seconds_up = seconds_up - db_stats_snapshot_.seconds_up;
+ snprintf(buf, sizeof(buf),
+ "\n** DB Stats **\nUptime(secs): %.1f total, %.1f interval\n",
+ seconds_up, interval_seconds_up);
+ value->append(buf);
+ // Cumulative
+ uint64_t user_bytes_written =
+ GetDBStats(InternalStats::kIntStatsBytesWritten);
+ uint64_t num_keys_written =
+ GetDBStats(InternalStats::kIntStatsNumKeysWritten);
+ uint64_t write_other = GetDBStats(InternalStats::kIntStatsWriteDoneByOther);
+ uint64_t write_self = GetDBStats(InternalStats::kIntStatsWriteDoneBySelf);
+ uint64_t wal_bytes = GetDBStats(InternalStats::kIntStatsWalFileBytes);
+ uint64_t wal_synced = GetDBStats(InternalStats::kIntStatsWalFileSynced);
+ uint64_t write_with_wal = GetDBStats(InternalStats::kIntStatsWriteWithWal);
+ uint64_t write_stall_micros =
+ GetDBStats(InternalStats::kIntStatsWriteStallMicros);
+
+ const int kHumanMicrosLen = 32;
+ char human_micros[kHumanMicrosLen];
+
+ // Data
+ // writes: total number of write requests.
+ // keys: total number of key updates issued by all the write requests
+ // commit groups: number of group commits issued to the DB. Each group can
+ // contain one or more writes.
+ // So keys / writes is the average number of puts per write request (e.g. in
+ // a multi-put), and writes / commit groups is the average group commit size.
+ //
+ // The format is the same for interval stats.
+ snprintf(buf, sizeof(buf),
+ "Cumulative writes: %s writes, %s keys, %s commit groups, "
+ "%.1f writes per commit group, ingest: %.2f GB, %.2f MB/s\n",
+ NumberToHumanString(write_other + write_self).c_str(),
+ NumberToHumanString(num_keys_written).c_str(),
+ NumberToHumanString(write_self).c_str(),
+ (write_other + write_self) / static_cast<double>(write_self + 1),
+ user_bytes_written / kGB, user_bytes_written / kMB / seconds_up);
+ value->append(buf);
+ // WAL
+ snprintf(buf, sizeof(buf),
+ "Cumulative WAL: %s writes, %s syncs, "
+ "%.2f writes per sync, written: %.2f GB, %.2f MB/s\n",
+ NumberToHumanString(write_with_wal).c_str(),
+ NumberToHumanString(wal_synced).c_str(),
+ write_with_wal / static_cast<double>(wal_synced + 1),
+ wal_bytes / kGB, wal_bytes / kMB / seconds_up);
+ value->append(buf);
+ // Stall
+ AppendHumanMicros(write_stall_micros, human_micros, kHumanMicrosLen, true);
+ snprintf(buf, sizeof(buf), "Cumulative stall: %s, %.1f percent\n",
+ human_micros,
+ // 10000 = divide by 1M to get secs, then multiply by 100 for pct
+ write_stall_micros / 10000.0 / std::max(seconds_up, 0.001));
+ value->append(buf);
+
+ // Interval
+ uint64_t interval_write_other = write_other - db_stats_snapshot_.write_other;
+ uint64_t interval_write_self = write_self - db_stats_snapshot_.write_self;
+ uint64_t interval_num_keys_written =
+ num_keys_written - db_stats_snapshot_.num_keys_written;
+ snprintf(
+ buf, sizeof(buf),
+ "Interval writes: %s writes, %s keys, %s commit groups, "
+ "%.1f writes per commit group, ingest: %.2f MB, %.2f MB/s\n",
+ NumberToHumanString(interval_write_other + interval_write_self).c_str(),
+ NumberToHumanString(interval_num_keys_written).c_str(),
+ NumberToHumanString(interval_write_self).c_str(),
+ static_cast<double>(interval_write_other + interval_write_self) /
+ (interval_write_self + 1),
+ (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB,
+ (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB /
+ std::max(interval_seconds_up, 0.001));
+ value->append(buf);
+
+ uint64_t interval_write_with_wal =
+ write_with_wal - db_stats_snapshot_.write_with_wal;
+ uint64_t interval_wal_synced = wal_synced - db_stats_snapshot_.wal_synced;
+ uint64_t interval_wal_bytes = wal_bytes - db_stats_snapshot_.wal_bytes;
+
+ snprintf(
+ buf, sizeof(buf),
+ "Interval WAL: %s writes, %s syncs, "
+ "%.2f writes per sync, written: %.2f MB, %.2f MB/s\n",
+ NumberToHumanString(interval_write_with_wal).c_str(),
+ NumberToHumanString(interval_wal_synced).c_str(),
+ interval_write_with_wal / static_cast<double>(interval_wal_synced + 1),
+ interval_wal_bytes / kMB,
+ interval_wal_bytes / kMB / std::max(interval_seconds_up, 0.001));
+ value->append(buf);
+
+ // Stall
+ AppendHumanMicros(write_stall_micros - db_stats_snapshot_.write_stall_micros,
+ human_micros, kHumanMicrosLen, true);
+ snprintf(buf, sizeof(buf), "Interval stall: %s, %.1f percent\n", human_micros,
+ // 10000 = divide by 1M to get secs, then multiply by 100 for pct
+ (write_stall_micros - db_stats_snapshot_.write_stall_micros) /
+ 10000.0 / std::max(interval_seconds_up, 0.001));
+ value->append(buf);
+
+ db_stats_snapshot_.seconds_up = seconds_up;
+ db_stats_snapshot_.ingest_bytes = user_bytes_written;
+ db_stats_snapshot_.write_other = write_other;
+ db_stats_snapshot_.write_self = write_self;
+ db_stats_snapshot_.num_keys_written = num_keys_written;
+ db_stats_snapshot_.wal_bytes = wal_bytes;
+ db_stats_snapshot_.wal_synced = wal_synced;
+ db_stats_snapshot_.write_with_wal = write_with_wal;
+ db_stats_snapshot_.write_stall_micros = write_stall_micros;
+}
+
+/**
+ * Dump compaction level stats to a map from stat name (with a "compaction."
+ * prefix) to the stat value, formatted as a double encoded in a string. The
+ * level in a stat name is represented by the prefix "Lx", where "x" is the
+ * level number; the special level "Sum" holds the sum of a stat across all
+ * levels.
+ * The result also contains IO stall counters, whose keys start with
+ * "io_stalls." and whose values are uint64 counts encoded as strings.
+ */
+void InternalStats::DumpCFMapStats(
+ std::map<std::string, std::string>* cf_stats) {
+ CompactionStats compaction_stats_sum;
+ std::map<int, std::map<LevelStatType, double>> levels_stats;
+ DumpCFMapStats(&levels_stats, &compaction_stats_sum);
+ for (auto const& level_ent : levels_stats) {
+ auto level_str =
+ level_ent.first == -1 ? "Sum" : "L" + ToString(level_ent.first);
+ for (auto const& stat_ent : level_ent.second) {
+ auto stat_type = stat_ent.first;
+ auto key_str =
+ "compaction." + level_str + "." +
+ InternalStats::compaction_level_stats.at(stat_type).property_name;
+ (*cf_stats)[key_str] = std::to_string(stat_ent.second);
+ }
+ }
+
+ DumpCFMapStatsIOStalls(cf_stats);
+}
+
+void InternalStats::DumpCFMapStats(
+ std::map<int, std::map<LevelStatType, double>>* levels_stats,
+ CompactionStats* compaction_stats_sum) {
+ const VersionStorageInfo* vstorage = cfd_->current()->storage_info();
+
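+ // FIFO compaction keeps all files in a single level, so only L0 has a
+ // compaction score; otherwise every level except the bottommost can be a
+ // compaction source and has a score.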
+ int num_levels_to_check =
+ (cfd_->ioptions()->compaction_style != kCompactionStyleFIFO)
+ ? vstorage->num_levels() - 1
+ : 1;
+
+ // Compaction scores are sorted by value. Restore them to level order.
+ std::vector<double> compaction_score(number_levels_, 0);
+ for (int i = 0; i < num_levels_to_check; ++i) {
+ compaction_score[vstorage->CompactionScoreLevel(i)] =
+ vstorage->CompactionScore(i);
+ }
+ // Count # of files being compacted for each level
+ std::vector<int> files_being_compacted(number_levels_, 0);
+ for (int level = 0; level < number_levels_; ++level) {
+ for (auto* f : vstorage->LevelFiles(level)) {
+ if (f->being_compacted) {
+ ++files_being_compacted[level];
+ }
+ }
+ }
+
+ int total_files = 0;
+ int total_files_being_compacted = 0;
+ double total_file_size = 0;
+ uint64_t flush_ingest = cf_stats_value_[BYTES_FLUSHED];
+ uint64_t add_file_ingest = cf_stats_value_[BYTES_INGESTED_ADD_FILE];
+ uint64_t curr_ingest = flush_ingest + add_file_ingest;
+ for (int level = 0; level < number_levels_; level++) {
+ int files = vstorage->NumLevelFiles(level);
+ total_files += files;
+ total_files_being_compacted += files_being_compacted[level];
+ if (comp_stats_[level].micros > 0 || files > 0) {
+ compaction_stats_sum->Add(comp_stats_[level]);
+ total_file_size += vstorage->NumLevelBytes(level);
+ uint64_t input_bytes;
+ if (level == 0) {
+ input_bytes = curr_ingest;
+ } else {
+ input_bytes = comp_stats_[level].bytes_read_non_output_levels;
+ }
+ double w_amp =
+ (input_bytes == 0)
+ ? 0.0
+ : static_cast<double>(comp_stats_[level].bytes_written) /
+ input_bytes;
+ std::map<LevelStatType, double> level_stats;
+ PrepareLevelStats(&level_stats, files, files_being_compacted[level],
+ static_cast<double>(vstorage->NumLevelBytes(level)),
+ compaction_score[level], w_amp, comp_stats_[level]);
+ (*levels_stats)[level] = level_stats;
+ }
+ }
+ // Cumulative summary
+ double w_amp = compaction_stats_sum->bytes_written /
+ static_cast<double>(curr_ingest + 1);
+ // Stats summary across levels
+ std::map<LevelStatType, double> sum_stats;
+ PrepareLevelStats(&sum_stats, total_files, total_files_being_compacted,
+ total_file_size, 0, w_amp, *compaction_stats_sum);
+ (*levels_stats)[-1] = sum_stats; // -1 is for the Sum level
+}
+
+void InternalStats::DumpCFMapStatsByPriority(
+ std::map<int, std::map<LevelStatType, double>>* priorities_stats) {
+ for (size_t priority = 0; priority < comp_stats_by_pri_.size(); priority++) {
+ if (comp_stats_by_pri_[priority].micros > 0) {
+ std::map<LevelStatType, double> priority_stats;
+ PrepareLevelStats(&priority_stats, 0 /* num_files */,
+ 0 /* being_compacted */, 0 /* total_file_size */,
+ 0 /* compaction_score */, 0 /* w_amp */,
+ comp_stats_by_pri_[priority]);
+ (*priorities_stats)[static_cast<int>(priority)] = priority_stats;
+ }
+ }
+}
+
+void InternalStats::DumpCFMapStatsIOStalls(
+ std::map<std::string, std::string>* cf_stats) {
+ (*cf_stats)["io_stalls.level0_slowdown"] =
+ std::to_string(cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS]);
+ (*cf_stats)["io_stalls.level0_slowdown_with_compaction"] =
+ std::to_string(cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS]);
+ (*cf_stats)["io_stalls.level0_numfiles"] =
+ std::to_string(cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS]);
+ (*cf_stats)["io_stalls.level0_numfiles_with_compaction"] =
+ std::to_string(cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_STOPS]);
+ (*cf_stats)["io_stalls.stop_for_pending_compaction_bytes"] =
+ std::to_string(cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS]);
+ (*cf_stats)["io_stalls.slowdown_for_pending_compaction_bytes"] =
+ std::to_string(cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS]);
+ (*cf_stats)["io_stalls.memtable_compaction"] =
+ std::to_string(cf_stats_count_[MEMTABLE_LIMIT_STOPS]);
+ (*cf_stats)["io_stalls.memtable_slowdown"] =
+ std::to_string(cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS]);
+
+ uint64_t total_stop = cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS] +
+ cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS] +
+ cf_stats_count_[MEMTABLE_LIMIT_STOPS];
+
+ uint64_t total_slowdown =
+ cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS] +
+ cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS] +
+ cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS];
+
+ (*cf_stats)["io_stalls.total_stop"] = std::to_string(total_stop);
+ (*cf_stats)["io_stalls.total_slowdown"] = std::to_string(total_slowdown);
+}
+
+void InternalStats::DumpCFStats(std::string* value) {
+ DumpCFStatsNoFileHistogram(value);
+ DumpCFFileHistogram(value);
+}
+
+void InternalStats::DumpCFStatsNoFileHistogram(std::string* value) {
+ char buf[2000];
+ // Per-ColumnFamily stats
+ PrintLevelStatsHeader(buf, sizeof(buf), cfd_->GetName(), "Level");
+ value->append(buf);
+
+ // Print stats for each level
+ std::map<int, std::map<LevelStatType, double>> levels_stats;
+ CompactionStats compaction_stats_sum;
+ DumpCFMapStats(&levels_stats, &compaction_stats_sum);
+ for (int l = 0; l < number_levels_; ++l) {
+ if (levels_stats.find(l) != levels_stats.end()) {
+ PrintLevelStats(buf, sizeof(buf), "L" + ToString(l), levels_stats[l]);
+ value->append(buf);
+ }
+ }
+
+ // Print sum of level stats
+ PrintLevelStats(buf, sizeof(buf), "Sum", levels_stats[-1]);
+ value->append(buf);
+
+ uint64_t flush_ingest = cf_stats_value_[BYTES_FLUSHED];
+ uint64_t add_file_ingest = cf_stats_value_[BYTES_INGESTED_ADD_FILE];
+ uint64_t ingest_files_addfile = cf_stats_value_[INGESTED_NUM_FILES_TOTAL];
+ uint64_t ingest_l0_files_addfile =
+ cf_stats_value_[INGESTED_LEVEL0_NUM_FILES_TOTAL];
+ uint64_t ingest_keys_addfile = cf_stats_value_[INGESTED_NUM_KEYS_TOTAL];
+ // Cumulative summary
+ uint64_t total_stall_count =
+ cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS] +
+ cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS] +
+ cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS] +
+ cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS] +
+ cf_stats_count_[MEMTABLE_LIMIT_STOPS] +
+ cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS];
+ // Interval summary
+ uint64_t interval_flush_ingest =
+ flush_ingest - cf_stats_snapshot_.ingest_bytes_flush;
+  uint64_t interval_add_file_ingest =
+      add_file_ingest - cf_stats_snapshot_.ingest_bytes_addfile;
+ uint64_t interval_ingest =
+      interval_flush_ingest + interval_add_file_ingest + 1;
+ CompactionStats interval_stats(compaction_stats_sum);
+ interval_stats.Subtract(cf_stats_snapshot_.comp_stats);
+ double w_amp =
+ interval_stats.bytes_written / static_cast<double>(interval_ingest);
+ PrintLevelStats(buf, sizeof(buf), "Int", 0, 0, 0, 0, w_amp, interval_stats);
+ value->append(buf);
+
+ PrintLevelStatsHeader(buf, sizeof(buf), cfd_->GetName(), "Priority");
+ value->append(buf);
+ std::map<int, std::map<LevelStatType, double>> priorities_stats;
+ DumpCFMapStatsByPriority(&priorities_stats);
+ for (size_t priority = 0; priority < comp_stats_by_pri_.size(); ++priority) {
+ if (priorities_stats.find(static_cast<int>(priority)) !=
+ priorities_stats.end()) {
+ PrintLevelStats(
+ buf, sizeof(buf),
+ Env::PriorityToString(static_cast<Env::Priority>(priority)),
+ priorities_stats[static_cast<int>(priority)]);
+ value->append(buf);
+ }
+ }
+
+ double seconds_up = (env_->NowMicros() - started_at_ + 1) / kMicrosInSec;
+ double interval_seconds_up = seconds_up - cf_stats_snapshot_.seconds_up;
+ snprintf(buf, sizeof(buf), "Uptime(secs): %.1f total, %.1f interval\n",
+ seconds_up, interval_seconds_up);
+ value->append(buf);
+ snprintf(buf, sizeof(buf), "Flush(GB): cumulative %.3f, interval %.3f\n",
+ flush_ingest / kGB, interval_flush_ingest / kGB);
+ value->append(buf);
+ snprintf(buf, sizeof(buf), "AddFile(GB): cumulative %.3f, interval %.3f\n",
+           add_file_ingest / kGB, interval_add_file_ingest / kGB);
+ value->append(buf);
+
+ uint64_t interval_ingest_files_addfile =
+ ingest_files_addfile - cf_stats_snapshot_.ingest_files_addfile;
+ snprintf(buf, sizeof(buf),
+ "AddFile(Total Files): cumulative %" PRIu64 ", interval %" PRIu64
+ "\n",
+ ingest_files_addfile, interval_ingest_files_addfile);
+ value->append(buf);
+
+ uint64_t interval_ingest_l0_files_addfile =
+ ingest_l0_files_addfile - cf_stats_snapshot_.ingest_l0_files_addfile;
+ snprintf(buf, sizeof(buf),
+ "AddFile(L0 Files): cumulative %" PRIu64 ", interval %" PRIu64 "\n",
+ ingest_l0_files_addfile, interval_ingest_l0_files_addfile);
+ value->append(buf);
+
+ uint64_t interval_ingest_keys_addfile =
+ ingest_keys_addfile - cf_stats_snapshot_.ingest_keys_addfile;
+ snprintf(buf, sizeof(buf),
+ "AddFile(Keys): cumulative %" PRIu64 ", interval %" PRIu64 "\n",
+ ingest_keys_addfile, interval_ingest_keys_addfile);
+ value->append(buf);
+
+ // Compact
+ uint64_t compact_bytes_read = 0;
+ uint64_t compact_bytes_write = 0;
+ uint64_t compact_micros = 0;
+ for (int level = 0; level < number_levels_; level++) {
+ compact_bytes_read += comp_stats_[level].bytes_read_output_level +
+ comp_stats_[level].bytes_read_non_output_levels;
+ compact_bytes_write += comp_stats_[level].bytes_written;
+ compact_micros += comp_stats_[level].micros;
+ }
+
+ snprintf(buf, sizeof(buf),
+ "Cumulative compaction: %.2f GB write, %.2f MB/s write, "
+ "%.2f GB read, %.2f MB/s read, %.1f seconds\n",
+ compact_bytes_write / kGB, compact_bytes_write / kMB / seconds_up,
+ compact_bytes_read / kGB, compact_bytes_read / kMB / seconds_up,
+ compact_micros / kMicrosInSec);
+ value->append(buf);
+
+ // Compaction interval
+ uint64_t interval_compact_bytes_write =
+ compact_bytes_write - cf_stats_snapshot_.compact_bytes_write;
+ uint64_t interval_compact_bytes_read =
+ compact_bytes_read - cf_stats_snapshot_.compact_bytes_read;
+ uint64_t interval_compact_micros =
+ compact_micros - cf_stats_snapshot_.compact_micros;
+
+ snprintf(
+ buf, sizeof(buf),
+ "Interval compaction: %.2f GB write, %.2f MB/s write, "
+ "%.2f GB read, %.2f MB/s read, %.1f seconds\n",
+ interval_compact_bytes_write / kGB,
+ interval_compact_bytes_write / kMB / std::max(interval_seconds_up, 0.001),
+ interval_compact_bytes_read / kGB,
+ interval_compact_bytes_read / kMB / std::max(interval_seconds_up, 0.001),
+ interval_compact_micros / kMicrosInSec);
+ value->append(buf);
+ cf_stats_snapshot_.compact_bytes_write = compact_bytes_write;
+ cf_stats_snapshot_.compact_bytes_read = compact_bytes_read;
+ cf_stats_snapshot_.compact_micros = compact_micros;
+
+ snprintf(buf, sizeof(buf),
+ "Stalls(count): %" PRIu64
+ " level0_slowdown, "
+ "%" PRIu64
+ " level0_slowdown_with_compaction, "
+ "%" PRIu64
+ " level0_numfiles, "
+ "%" PRIu64
+ " level0_numfiles_with_compaction, "
+ "%" PRIu64
+ " stop for pending_compaction_bytes, "
+ "%" PRIu64
+ " slowdown for pending_compaction_bytes, "
+ "%" PRIu64
+ " memtable_compaction, "
+ "%" PRIu64
+ " memtable_slowdown, "
+ "interval %" PRIu64 " total count\n",
+ cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS],
+ cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS],
+ cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS],
+ cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_STOPS],
+ cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS],
+ cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS],
+ cf_stats_count_[MEMTABLE_LIMIT_STOPS],
+ cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS],
+ total_stall_count - cf_stats_snapshot_.stall_count);
+ value->append(buf);
+
+ cf_stats_snapshot_.seconds_up = seconds_up;
+ cf_stats_snapshot_.ingest_bytes_flush = flush_ingest;
+ cf_stats_snapshot_.ingest_bytes_addfile = add_file_ingest;
+ cf_stats_snapshot_.ingest_files_addfile = ingest_files_addfile;
+ cf_stats_snapshot_.ingest_l0_files_addfile = ingest_l0_files_addfile;
+ cf_stats_snapshot_.ingest_keys_addfile = ingest_keys_addfile;
+ cf_stats_snapshot_.comp_stats = compaction_stats_sum;
+ cf_stats_snapshot_.stall_count = total_stall_count;
+}
+
+void InternalStats::DumpCFFileHistogram(std::string* value) {
+ char buf[2000];
+ snprintf(buf, sizeof(buf),
+ "\n** File Read Latency Histogram By Level [%s] **\n",
+ cfd_->GetName().c_str());
+ value->append(buf);
+
+ for (int level = 0; level < number_levels_; level++) {
+ if (!file_read_latency_[level].Empty()) {
+ char buf2[5000];
+ snprintf(buf2, sizeof(buf2),
+ "** Level %d read latency histogram (micros):\n%s\n", level,
+ file_read_latency_[level].ToString().c_str());
+ value->append(buf2);
+ }
+ }
+}
+
+#else
+
+const DBPropertyInfo* GetPropertyInfo(const Slice& /*property*/) {
+ return nullptr;
+}
+
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/internal_stats.h b/src/rocksdb/db/internal_stats.h
new file mode 100644
index 000000000..ce83be244
--- /dev/null
+++ b/src/rocksdb/db/internal_stats.h
@@ -0,0 +1,697 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+
+#pragma once
+#include <map>
+#include <string>
+#include <vector>
+
+#include "db/version_set.h"
+
+class ColumnFamilyData;
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+class MemTableList;
+
+// Config for retrieving a property's value.
+struct DBPropertyInfo {
+ bool need_out_of_mutex;
+
+  // gcc had an internal error when initializing a union of pointer-to-member
+  // functions. The workaround is to populate exactly one of the following
+  // function pointers with a non-nullptr value.
+
+ // @param value Value-result argument for storing the property's string value
+ // @param suffix Argument portion of the property. For example, suffix would
+ // be "5" for the property "rocksdb.num-files-at-level5". So far, only
+ // certain string properties take an argument.
+ bool (InternalStats::*handle_string)(std::string* value, Slice suffix);
+
+ // @param value Value-result argument for storing the property's uint64 value
+ // @param db Many of the int properties rely on DBImpl methods.
+ // @param version Version is needed in case the property is retrieved without
+ // holding db mutex, which is only supported for int properties.
+ bool (InternalStats::*handle_int)(uint64_t* value, DBImpl* db,
+ Version* version);
+
+ // @param props Map of general properties to populate
+ bool (InternalStats::*handle_map)(std::map<std::string, std::string>* props);
+
+ // handle the string type properties rely on DBImpl methods
+ // @param value Value-result argument for storing the property's string value
+ bool (DBImpl::*handle_string_dbimpl)(std::string* value);
+};
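+// A purely illustrative sketch of how an entry in the property table might be
+// shaped (the actual table, ppt_name_to_info, lives in internal_stats.cc and
+// the initializer below is hypothetical):
+//
+//   {DB::Properties::kNumFilesAtLevelPrefix,
+//    {false /* need_out_of_mutex */, &InternalStats::HandleNumFilesAtLevel,
+//     nullptr, nullptr, nullptr}},
+//
+// i.e. exactly one of the handler pointers is non-null for each property.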
+
+extern const DBPropertyInfo* GetPropertyInfo(const Slice& property);
+
+#ifndef ROCKSDB_LITE
+#undef SCORE
+enum class LevelStatType {
+ INVALID = 0,
+ NUM_FILES,
+ COMPACTED_FILES,
+ SIZE_BYTES,
+ SCORE,
+ READ_GB,
+ RN_GB,
+ RNP1_GB,
+ WRITE_GB,
+ W_NEW_GB,
+ MOVED_GB,
+ WRITE_AMP,
+ READ_MBPS,
+ WRITE_MBPS,
+ COMP_SEC,
+ COMP_CPU_SEC,
+ COMP_COUNT,
+ AVG_SEC,
+ KEY_IN,
+ KEY_DROP,
+ TOTAL // total number of types
+};
+
+struct LevelStat {
+  // This is what will appear as "L<n>.<property_name>" in the flat map
+  // returned to the user.
+  std::string property_name;
+  // This is what will be printed in the header in the CLI output.
+  std::string header_name;
+};
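+// For illustration only (hypothetical values; the real entries live in
+// compaction_level_stats in internal_stats.cc), an entry might look like
+//   {LevelStatType::WRITE_GB, LevelStat{"WriteGB", "Write(GB)"}}
+// so the flat map key becomes "compaction.L1.WriteGB" while the CLI header
+// shows "Write(GB)".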
+
+class InternalStats {
+ public:
+ static const std::map<LevelStatType, LevelStat> compaction_level_stats;
+
+ enum InternalCFStatsType {
+ L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+ LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+ MEMTABLE_LIMIT_STOPS,
+ MEMTABLE_LIMIT_SLOWDOWNS,
+ L0_FILE_COUNT_LIMIT_STOPS,
+ LOCKED_L0_FILE_COUNT_LIMIT_STOPS,
+ PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS,
+ PENDING_COMPACTION_BYTES_LIMIT_STOPS,
+ WRITE_STALLS_ENUM_MAX,
+ BYTES_FLUSHED,
+ BYTES_INGESTED_ADD_FILE,
+ INGESTED_NUM_FILES_TOTAL,
+ INGESTED_LEVEL0_NUM_FILES_TOTAL,
+ INGESTED_NUM_KEYS_TOTAL,
+ INTERNAL_CF_STATS_ENUM_MAX,
+ };
+
+ enum InternalDBStatsType {
+ kIntStatsWalFileBytes,
+ kIntStatsWalFileSynced,
+ kIntStatsBytesWritten,
+ kIntStatsNumKeysWritten,
+ kIntStatsWriteDoneByOther,
+ kIntStatsWriteDoneBySelf,
+ kIntStatsWriteWithWal,
+ kIntStatsWriteStallMicros,
+ kIntStatsNumMax,
+ };
+
+ InternalStats(int num_levels, Env* env, ColumnFamilyData* cfd)
+ : db_stats_{},
+ cf_stats_value_{},
+ cf_stats_count_{},
+ comp_stats_(num_levels),
+ comp_stats_by_pri_(Env::Priority::TOTAL),
+ file_read_latency_(num_levels),
+ bg_error_count_(0),
+ number_levels_(num_levels),
+ env_(env),
+ cfd_(cfd),
+ started_at_(env->NowMicros()) {}
+
+ // Per level compaction stats. comp_stats_[level] stores the stats for
+ // compactions that produced data for the specified "level".
+ struct CompactionStats {
+ uint64_t micros;
+ uint64_t cpu_micros;
+
+ // The number of bytes read from all non-output levels
+ uint64_t bytes_read_non_output_levels;
+
+ // The number of bytes read from the compaction output level.
+ uint64_t bytes_read_output_level;
+
+ // Total number of bytes written during compaction
+ uint64_t bytes_written;
+
+ // Total number of bytes moved to the output level
+ uint64_t bytes_moved;
+
+ // The number of compaction input files in all non-output levels.
+ int num_input_files_in_non_output_levels;
+
+ // The number of compaction input files in the output level.
+ int num_input_files_in_output_level;
+
+ // The number of compaction output files.
+ int num_output_files;
+
+ // Total incoming entries during compaction between levels N and N+1
+ uint64_t num_input_records;
+
+    // Accumulated difference in the number of entries
+    // (num input entries - num output entries) for compaction between levels N and N+1
+ uint64_t num_dropped_records;
+
+ // Number of compactions done
+ int count;
+
+ // Number of compactions done per CompactionReason
+ int counts[static_cast<int>(CompactionReason::kNumOfReasons)];
+
+ explicit CompactionStats()
+ : micros(0),
+ cpu_micros(0),
+ bytes_read_non_output_levels(0),
+ bytes_read_output_level(0),
+ bytes_written(0),
+ bytes_moved(0),
+ num_input_files_in_non_output_levels(0),
+ num_input_files_in_output_level(0),
+ num_output_files(0),
+ num_input_records(0),
+ num_dropped_records(0),
+ count(0) {
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] = 0;
+ }
+ }
+
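+    // Records `c` compactions attributed to `reason`; a reason outside the
+    // known range leaves both the total and the per-reason counters at zero.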
+ explicit CompactionStats(CompactionReason reason, int c)
+ : micros(0),
+ cpu_micros(0),
+ bytes_read_non_output_levels(0),
+ bytes_read_output_level(0),
+ bytes_written(0),
+ bytes_moved(0),
+ num_input_files_in_non_output_levels(0),
+ num_input_files_in_output_level(0),
+ num_output_files(0),
+ num_input_records(0),
+ num_dropped_records(0),
+ count(c) {
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] = 0;
+ }
+ int r = static_cast<int>(reason);
+ if (r >= 0 && r < num_of_reasons) {
+ counts[r] = c;
+ } else {
+ count = 0;
+ }
+ }
+
+ explicit CompactionStats(const CompactionStats& c)
+ : micros(c.micros),
+ cpu_micros(c.cpu_micros),
+ bytes_read_non_output_levels(c.bytes_read_non_output_levels),
+ bytes_read_output_level(c.bytes_read_output_level),
+ bytes_written(c.bytes_written),
+ bytes_moved(c.bytes_moved),
+ num_input_files_in_non_output_levels(
+ c.num_input_files_in_non_output_levels),
+ num_input_files_in_output_level(c.num_input_files_in_output_level),
+ num_output_files(c.num_output_files),
+ num_input_records(c.num_input_records),
+ num_dropped_records(c.num_dropped_records),
+ count(c.count) {
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] = c.counts[i];
+ }
+ }
+
+ CompactionStats& operator=(const CompactionStats& c) {
+ micros = c.micros;
+ cpu_micros = c.cpu_micros;
+ bytes_read_non_output_levels = c.bytes_read_non_output_levels;
+ bytes_read_output_level = c.bytes_read_output_level;
+ bytes_written = c.bytes_written;
+ bytes_moved = c.bytes_moved;
+ num_input_files_in_non_output_levels =
+ c.num_input_files_in_non_output_levels;
+ num_input_files_in_output_level = c.num_input_files_in_output_level;
+ num_output_files = c.num_output_files;
+ num_input_records = c.num_input_records;
+ num_dropped_records = c.num_dropped_records;
+ count = c.count;
+
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] = c.counts[i];
+ }
+ return *this;
+ }
+
+ void Clear() {
+ this->micros = 0;
+ this->cpu_micros = 0;
+ this->bytes_read_non_output_levels = 0;
+ this->bytes_read_output_level = 0;
+ this->bytes_written = 0;
+ this->bytes_moved = 0;
+ this->num_input_files_in_non_output_levels = 0;
+ this->num_input_files_in_output_level = 0;
+ this->num_output_files = 0;
+ this->num_input_records = 0;
+ this->num_dropped_records = 0;
+ this->count = 0;
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] = 0;
+ }
+ }
+
+ void Add(const CompactionStats& c) {
+ this->micros += c.micros;
+ this->cpu_micros += c.cpu_micros;
+ this->bytes_read_non_output_levels += c.bytes_read_non_output_levels;
+ this->bytes_read_output_level += c.bytes_read_output_level;
+ this->bytes_written += c.bytes_written;
+ this->bytes_moved += c.bytes_moved;
+ this->num_input_files_in_non_output_levels +=
+ c.num_input_files_in_non_output_levels;
+ this->num_input_files_in_output_level +=
+ c.num_input_files_in_output_level;
+ this->num_output_files += c.num_output_files;
+ this->num_input_records += c.num_input_records;
+ this->num_dropped_records += c.num_dropped_records;
+ this->count += c.count;
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] += c.counts[i];
+ }
+ }
+
+ void Subtract(const CompactionStats& c) {
+ this->micros -= c.micros;
+ this->cpu_micros -= c.cpu_micros;
+ this->bytes_read_non_output_levels -= c.bytes_read_non_output_levels;
+ this->bytes_read_output_level -= c.bytes_read_output_level;
+ this->bytes_written -= c.bytes_written;
+ this->bytes_moved -= c.bytes_moved;
+ this->num_input_files_in_non_output_levels -=
+ c.num_input_files_in_non_output_levels;
+ this->num_input_files_in_output_level -=
+ c.num_input_files_in_output_level;
+ this->num_output_files -= c.num_output_files;
+ this->num_input_records -= c.num_input_records;
+ this->num_dropped_records -= c.num_dropped_records;
+ this->count -= c.count;
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] -= c.counts[i];
+ }
+ }
+ };
+
+ void Clear() {
+ for (int i = 0; i < kIntStatsNumMax; i++) {
+ db_stats_[i].store(0);
+ }
+ for (int i = 0; i < INTERNAL_CF_STATS_ENUM_MAX; i++) {
+ cf_stats_count_[i] = 0;
+ cf_stats_value_[i] = 0;
+ }
+ for (auto& comp_stat : comp_stats_) {
+ comp_stat.Clear();
+ }
+ for (auto& h : file_read_latency_) {
+ h.Clear();
+ }
+ cf_stats_snapshot_.Clear();
+ db_stats_snapshot_.Clear();
+ bg_error_count_ = 0;
+ started_at_ = env_->NowMicros();
+ }
+
+ void AddCompactionStats(int level, Env::Priority thread_pri,
+ const CompactionStats& stats) {
+ comp_stats_[level].Add(stats);
+ comp_stats_by_pri_[thread_pri].Add(stats);
+ }
+
+ void IncBytesMoved(int level, uint64_t amount) {
+ comp_stats_[level].bytes_moved += amount;
+ }
+
+ void AddCFStats(InternalCFStatsType type, uint64_t value) {
+ cf_stats_value_[type] += value;
+ ++cf_stats_count_[type];
+ }
+
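+  // When `concurrent` is true the update is a single atomic fetch_add;
+  // otherwise the stat is assumed to be updated while holding the DB mutex,
+  // so a cheaper relaxed load followed by a store is sufficient.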
+ void AddDBStats(InternalDBStatsType type, uint64_t value,
+ bool concurrent = false) {
+ auto& v = db_stats_[type];
+ if (concurrent) {
+ v.fetch_add(value, std::memory_order_relaxed);
+ } else {
+ v.store(v.load(std::memory_order_relaxed) + value,
+ std::memory_order_relaxed);
+ }
+ }
+
+ uint64_t GetDBStats(InternalDBStatsType type) {
+ return db_stats_[type].load(std::memory_order_relaxed);
+ }
+
+ HistogramImpl* GetFileReadHist(int level) {
+ return &file_read_latency_[level];
+ }
+
+ uint64_t GetBackgroundErrorCount() const { return bg_error_count_; }
+
+ uint64_t BumpAndGetBackgroundErrorCount() { return ++bg_error_count_; }
+
+ bool GetStringProperty(const DBPropertyInfo& property_info,
+ const Slice& property, std::string* value);
+
+ bool GetMapProperty(const DBPropertyInfo& property_info,
+ const Slice& property,
+ std::map<std::string, std::string>* value);
+
+ bool GetIntProperty(const DBPropertyInfo& property_info, uint64_t* value,
+ DBImpl* db);
+
+ bool GetIntPropertyOutOfMutex(const DBPropertyInfo& property_info,
+ Version* version, uint64_t* value);
+
+ const std::vector<CompactionStats>& TEST_GetCompactionStats() const {
+ return comp_stats_;
+ }
+
+ // Store a mapping from the user-facing DB::Properties string to our
+ // DBPropertyInfo struct used internally for retrieving properties.
+ static const std::unordered_map<std::string, DBPropertyInfo> ppt_name_to_info;
+
+ private:
+ void DumpDBStats(std::string* value);
+ void DumpCFMapStats(std::map<std::string, std::string>* cf_stats);
+ void DumpCFMapStats(
+ std::map<int, std::map<LevelStatType, double>>* level_stats,
+ CompactionStats* compaction_stats_sum);
+ void DumpCFMapStatsByPriority(
+ std::map<int, std::map<LevelStatType, double>>* priorities_stats);
+ void DumpCFMapStatsIOStalls(std::map<std::string, std::string>* cf_stats);
+ void DumpCFStats(std::string* value);
+ void DumpCFStatsNoFileHistogram(std::string* value);
+ void DumpCFFileHistogram(std::string* value);
+
+ bool HandleBlockCacheStat(Cache** block_cache);
+
+ // Per-DB stats
+ std::atomic<uint64_t> db_stats_[kIntStatsNumMax];
+ // Per-ColumnFamily stats
+ uint64_t cf_stats_value_[INTERNAL_CF_STATS_ENUM_MAX];
+ uint64_t cf_stats_count_[INTERNAL_CF_STATS_ENUM_MAX];
+ // Per-ColumnFamily/level compaction stats
+ std::vector<CompactionStats> comp_stats_;
+ std::vector<CompactionStats> comp_stats_by_pri_;
+ std::vector<HistogramImpl> file_read_latency_;
+
+ // Used to compute per-interval statistics
+ struct CFStatsSnapshot {
+ // ColumnFamily-level stats
+ CompactionStats comp_stats;
+ uint64_t ingest_bytes_flush; // Bytes written to L0 (Flush)
+ uint64_t stall_count; // Stall count
+ // Stats from compaction jobs - bytes written, bytes read, duration.
+ uint64_t compact_bytes_write;
+ uint64_t compact_bytes_read;
+ uint64_t compact_micros;
+ double seconds_up;
+
+ // AddFile specific stats
+ uint64_t ingest_bytes_addfile; // Total Bytes ingested
+ uint64_t ingest_files_addfile; // Total number of files ingested
+ uint64_t ingest_l0_files_addfile; // Total number of files ingested to L0
+ uint64_t ingest_keys_addfile; // Total number of keys ingested
+
+ CFStatsSnapshot()
+ : ingest_bytes_flush(0),
+ stall_count(0),
+ compact_bytes_write(0),
+ compact_bytes_read(0),
+ compact_micros(0),
+ seconds_up(0),
+ ingest_bytes_addfile(0),
+ ingest_files_addfile(0),
+ ingest_l0_files_addfile(0),
+ ingest_keys_addfile(0) {}
+
+ void Clear() {
+ comp_stats.Clear();
+ ingest_bytes_flush = 0;
+ stall_count = 0;
+ compact_bytes_write = 0;
+ compact_bytes_read = 0;
+ compact_micros = 0;
+ seconds_up = 0;
+ ingest_bytes_addfile = 0;
+ ingest_files_addfile = 0;
+ ingest_l0_files_addfile = 0;
+ ingest_keys_addfile = 0;
+ }
+ } cf_stats_snapshot_;
+
+ struct DBStatsSnapshot {
+ // DB-level stats
+ uint64_t ingest_bytes; // Bytes written by user
+ uint64_t wal_bytes; // Bytes written to WAL
+ uint64_t wal_synced; // Number of times WAL is synced
+ uint64_t write_with_wal; // Number of writes that request WAL
+    // These count the number of write requests done by another thread
+    // (write_other) and by the calling thread itself (write_self).
+    uint64_t write_other;
+    uint64_t write_self;
+    // Total number of keys written. write_self and write_other measure the
+    // number of write requests, and each write request can contain updates to
+    // multiple keys. num_keys_written is the total number of keys updated by
+    // all those writes.
+ uint64_t num_keys_written;
+ // Total time writes delayed by stalls.
+ uint64_t write_stall_micros;
+ double seconds_up;
+
+ DBStatsSnapshot()
+ : ingest_bytes(0),
+ wal_bytes(0),
+ wal_synced(0),
+ write_with_wal(0),
+ write_other(0),
+ write_self(0),
+ num_keys_written(0),
+ write_stall_micros(0),
+ seconds_up(0) {}
+
+ void Clear() {
+ ingest_bytes = 0;
+ wal_bytes = 0;
+ wal_synced = 0;
+ write_with_wal = 0;
+ write_other = 0;
+ write_self = 0;
+ num_keys_written = 0;
+ write_stall_micros = 0;
+ seconds_up = 0;
+ }
+ } db_stats_snapshot_;
+
+ // Handler functions for getting property values. They use "value" as a value-
+ // result argument, and return true upon successfully setting "value".
+ bool HandleNumFilesAtLevel(std::string* value, Slice suffix);
+ bool HandleCompressionRatioAtLevelPrefix(std::string* value, Slice suffix);
+ bool HandleLevelStats(std::string* value, Slice suffix);
+ bool HandleStats(std::string* value, Slice suffix);
+ bool HandleCFMapStats(std::map<std::string, std::string>* compaction_stats);
+ bool HandleCFStats(std::string* value, Slice suffix);
+ bool HandleCFStatsNoFileHistogram(std::string* value, Slice suffix);
+ bool HandleCFFileHistogram(std::string* value, Slice suffix);
+ bool HandleDBStats(std::string* value, Slice suffix);
+ bool HandleSsTables(std::string* value, Slice suffix);
+ bool HandleAggregatedTableProperties(std::string* value, Slice suffix);
+ bool HandleAggregatedTablePropertiesAtLevel(std::string* value, Slice suffix);
+ bool HandleNumImmutableMemTable(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleNumImmutableMemTableFlushed(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleMemTableFlushPending(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleNumRunningFlushes(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleCompactionPending(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleNumRunningCompactions(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleBackgroundErrors(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleCurSizeAllMemTables(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleSizeAllMemTables(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleNumEntriesActiveMemTable(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleNumEntriesImmMemTables(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleNumDeletesActiveMemTable(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleNumDeletesImmMemTables(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleEstimateNumKeys(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleNumSnapshots(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleOldestSnapshotTime(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleOldestSnapshotSequence(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleNumLiveVersions(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleCurrentSuperVersionNumber(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleIsFileDeletionsEnabled(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleBaseLevel(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleTotalSstFilesSize(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleLiveSstFilesSize(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleEstimatePendingCompactionBytes(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleEstimateTableReadersMem(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleEstimateLiveDataSize(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleMinLogNumberToKeep(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleMinObsoleteSstNumberToKeep(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleActualDelayedWriteRate(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleIsWriteStopped(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleBlockCacheCapacity(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleBlockCacheUsage(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleBlockCachePinnedUsage(uint64_t* value, DBImpl* db,
+ Version* version);
+ // Total number of background errors encountered. Every time a flush task
+ // or compaction task fails, this counter is incremented. The failure can
+ // be caused by any possible reason, including file system errors, out of
+ // resources, or input file corruption. Failing when retrying the same flush
+ // or compaction will cause the counter to increase too.
+ uint64_t bg_error_count_;
+
+ const int number_levels_;
+ Env* env_;
+ ColumnFamilyData* cfd_;
+ uint64_t started_at_;
+};
+
+#else
+
+class InternalStats {
+ public:
+ enum InternalCFStatsType {
+ L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+ LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+ MEMTABLE_LIMIT_STOPS,
+ MEMTABLE_LIMIT_SLOWDOWNS,
+ L0_FILE_COUNT_LIMIT_STOPS,
+ LOCKED_L0_FILE_COUNT_LIMIT_STOPS,
+ PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS,
+ PENDING_COMPACTION_BYTES_LIMIT_STOPS,
+ WRITE_STALLS_ENUM_MAX,
+ BYTES_FLUSHED,
+ BYTES_INGESTED_ADD_FILE,
+ INGESTED_NUM_FILES_TOTAL,
+ INGESTED_LEVEL0_NUM_FILES_TOTAL,
+ INGESTED_NUM_KEYS_TOTAL,
+ INTERNAL_CF_STATS_ENUM_MAX,
+ };
+
+ enum InternalDBStatsType {
+ kIntStatsWalFileBytes,
+ kIntStatsWalFileSynced,
+ kIntStatsBytesWritten,
+ kIntStatsNumKeysWritten,
+ kIntStatsWriteDoneByOther,
+ kIntStatsWriteDoneBySelf,
+ kIntStatsWriteWithWal,
+ kIntStatsWriteStallMicros,
+ kIntStatsNumMax,
+ };
+
+ InternalStats(int /*num_levels*/, Env* /*env*/, ColumnFamilyData* /*cfd*/) {}
+
+ struct CompactionStats {
+ uint64_t micros;
+ uint64_t cpu_micros;
+ uint64_t bytes_read_non_output_levels;
+ uint64_t bytes_read_output_level;
+ uint64_t bytes_written;
+ uint64_t bytes_moved;
+ int num_input_files_in_non_output_levels;
+ int num_input_files_in_output_level;
+ int num_output_files;
+ uint64_t num_input_records;
+ uint64_t num_dropped_records;
+ int count;
+
+ explicit CompactionStats() {}
+
+ explicit CompactionStats(CompactionReason /*reason*/, int /*c*/) {}
+
+ explicit CompactionStats(const CompactionStats& /*c*/) {}
+
+ void Add(const CompactionStats& /*c*/) {}
+
+ void Subtract(const CompactionStats& /*c*/) {}
+ };
+
+ void AddCompactionStats(int /*level*/, Env::Priority /*thread_pri*/,
+ const CompactionStats& /*stats*/) {}
+
+ void IncBytesMoved(int /*level*/, uint64_t /*amount*/) {}
+
+ void AddCFStats(InternalCFStatsType /*type*/, uint64_t /*value*/) {}
+
+ void AddDBStats(InternalDBStatsType /*type*/, uint64_t /*value*/,
+ bool /*concurrent */ = false) {}
+
+ HistogramImpl* GetFileReadHist(int /*level*/) { return nullptr; }
+
+ uint64_t GetBackgroundErrorCount() const { return 0; }
+
+ uint64_t BumpAndGetBackgroundErrorCount() { return 0; }
+
+ bool GetStringProperty(const DBPropertyInfo& /*property_info*/,
+ const Slice& /*property*/, std::string* /*value*/) {
+ return false;
+ }
+
+ bool GetMapProperty(const DBPropertyInfo& /*property_info*/,
+ const Slice& /*property*/,
+ std::map<std::string, std::string>* /*value*/) {
+ return false;
+ }
+
+ bool GetIntProperty(const DBPropertyInfo& /*property_info*/, uint64_t* /*value*/,
+ DBImpl* /*db*/) const {
+ return false;
+ }
+
+ bool GetIntPropertyOutOfMutex(const DBPropertyInfo& /*property_info*/,
+ Version* /*version*/, uint64_t* /*value*/) const {
+ return false;
+ }
+};
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/job_context.h b/src/rocksdb/db/job_context.h
new file mode 100644
index 000000000..31ff26c3a
--- /dev/null
+++ b/src/rocksdb/db/job_context.h
@@ -0,0 +1,219 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "db/log_writer.h"
+#include "db/column_family.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MemTable;
+struct SuperVersion;
+
+struct SuperVersionContext {
+ struct WriteStallNotification {
+ WriteStallInfo write_stall_info;
+ const ImmutableCFOptions* immutable_cf_options;
+ };
+
+ autovector<SuperVersion*> superversions_to_free;
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+ autovector<WriteStallNotification> write_stall_notifications;
+#endif
+ std::unique_ptr<SuperVersion>
+ new_superversion; // if nullptr no new superversion
+
+ explicit SuperVersionContext(bool create_superversion = false)
+ : new_superversion(create_superversion ? new SuperVersion() : nullptr) {}
+
+ explicit SuperVersionContext(SuperVersionContext&& other)
+ : superversions_to_free(std::move(other.superversions_to_free)),
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+ write_stall_notifications(std::move(other.write_stall_notifications)),
+#endif
+ new_superversion(std::move(other.new_superversion)) {
+ }
+
+ void NewSuperVersion() {
+ new_superversion = std::unique_ptr<SuperVersion>(new SuperVersion());
+ }
+
+ inline bool HaveSomethingToDelete() const {
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+ return !superversions_to_free.empty() ||
+ !write_stall_notifications.empty();
+#else
+ return !superversions_to_free.empty();
+#endif
+ }
+
+ void PushWriteStallNotification(
+ WriteStallCondition old_cond, WriteStallCondition new_cond,
+ const std::string& name, const ImmutableCFOptions* ioptions) {
+#if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+ WriteStallNotification notif;
+ notif.write_stall_info.cf_name = name;
+ notif.write_stall_info.condition.prev = old_cond;
+ notif.write_stall_info.condition.cur = new_cond;
+ notif.immutable_cf_options = ioptions;
+ write_stall_notifications.push_back(notif);
+#else
+ (void)old_cond;
+ (void)new_cond;
+ (void)name;
+ (void)ioptions;
+#endif // !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+ }
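+  // The notification is only buffered here; listeners are not invoked until
+  // Clean() runs, which callers are expected to do without holding the DB
+  // mutex (see the JobContext::Clean() comment below).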
+
+ void Clean() {
+#if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+ // notify listeners on changed write stall conditions
+ for (auto& notif : write_stall_notifications) {
+ for (auto& listener : notif.immutable_cf_options->listeners) {
+ listener->OnStallConditionsChanged(notif.write_stall_info);
+ }
+ }
+ write_stall_notifications.clear();
+#endif  // !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+ // free superversions
+ for (auto s : superversions_to_free) {
+ delete s;
+ }
+ superversions_to_free.clear();
+ }
+
+ ~SuperVersionContext() {
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+ assert(write_stall_notifications.empty());
+#endif
+ assert(superversions_to_free.empty());
+ }
+};
+
+struct JobContext {
+ inline bool HaveSomethingToDelete() const {
+ return full_scan_candidate_files.size() || sst_delete_files.size() ||
+ log_delete_files.size() || manifest_delete_files.size();
+ }
+
+ inline bool HaveSomethingToClean() const {
+ bool sv_have_sth = false;
+ for (const auto& sv_ctx : superversion_contexts) {
+ if (sv_ctx.HaveSomethingToDelete()) {
+ sv_have_sth = true;
+ break;
+ }
+ }
+ return memtables_to_free.size() > 0 || logs_to_free.size() > 0 ||
+ sv_have_sth;
+ }
+
+ // Structure to store information for candidate files to delete.
+ struct CandidateFileInfo {
+ std::string file_name;
+ std::string file_path;
+ CandidateFileInfo(std::string name, std::string path)
+ : file_name(std::move(name)), file_path(std::move(path)) {}
+ bool operator==(const CandidateFileInfo& other) const {
+ return file_name == other.file_name &&
+ file_path == other.file_path;
+ }
+ };
+
+ // Unique job id
+ int job_id;
+
+ // a list of all files that we'll consider deleting
+ // (every once in a while this is filled up with all files
+ // in the DB directory)
+ // (filled only if we're doing full scan)
+ std::vector<CandidateFileInfo> full_scan_candidate_files;
+
+ // the list of all live sst files that cannot be deleted
+ std::vector<FileDescriptor> sst_live;
+
+ // a list of sst files that we need to delete
+ std::vector<ObsoleteFileInfo> sst_delete_files;
+
+ // a list of log files that we need to delete
+ std::vector<uint64_t> log_delete_files;
+
+ // a list of log files that we need to preserve during full purge since they
+ // will be reused later
+ std::vector<uint64_t> log_recycle_files;
+
+ // a list of manifest files that we need to delete
+ std::vector<std::string> manifest_delete_files;
+
+  // a list of memtables to be freed
+ autovector<MemTable*> memtables_to_free;
+
+ // contexts for installing superversions for multiple column families
+ std::vector<SuperVersionContext> superversion_contexts;
+
+ autovector<log::Writer*> logs_to_free;
+
+  // the current manifest_file_number, log_number and prev_log_number
+  // that correspond to the set of files in 'live'.
+ uint64_t manifest_file_number;
+ uint64_t pending_manifest_file_number;
+ uint64_t log_number;
+ uint64_t prev_log_number;
+
+ uint64_t min_pending_output = 0;
+ uint64_t prev_total_log_size = 0;
+ size_t num_alive_log_files = 0;
+ uint64_t size_log_to_delete = 0;
+
+ // Snapshot taken before flush/compaction job.
+ std::unique_ptr<ManagedSnapshot> job_snapshot;
+
+ explicit JobContext(int _job_id, bool create_superversion = false) {
+ job_id = _job_id;
+ manifest_file_number = 0;
+ pending_manifest_file_number = 0;
+ log_number = 0;
+ prev_log_number = 0;
+ superversion_contexts.emplace_back(
+ SuperVersionContext(create_superversion));
+ }
+
+  // For a non-empty JobContext, Clean() has to be called at least once before
+  // destruction (see the asserts in ~JobContext()). It should be called with
+  // the DB mutex unlocked. The destructor doesn't call Clean() to avoid
+  // accidentally doing a potentially slow Clean() with the DB mutex locked.
+ void Clean() {
+ // free superversions
+ for (auto& sv_context : superversion_contexts) {
+ sv_context.Clean();
+ }
+ // free pending memtables
+ for (auto m : memtables_to_free) {
+ delete m;
+ }
+ for (auto l : logs_to_free) {
+ delete l;
+ }
+
+ memtables_to_free.clear();
+ logs_to_free.clear();
+ job_snapshot.reset();
+ }
+
+ ~JobContext() {
+ assert(memtables_to_free.size() == 0);
+ assert(logs_to_free.size() == 0);
+ }
+};
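+// A minimal usage sketch (assumed call pattern; names such as `mutex_` and
+// `next_job_id` are illustrative, not part of this header):
+//
+//   JobContext job_context(next_job_id, /*create_superversion=*/true);
+//   // ... fill the context while doing flush/compaction bookkeeping ...
+//   mutex_.Unlock();
+//   job_context.Clean();  // must run without the DB mutex held
+//   mutex_.Lock();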
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/listener_test.cc b/src/rocksdb/db/listener_test.cc
new file mode 100644
index 000000000..eb1a08a35
--- /dev/null
+++ b/src/rocksdb/db/listener_test.cc
@@ -0,0 +1,1042 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob_index.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "memtable/hash_linklist_rep.h"
+#include "monitoring/statistics.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/plain/plain_table_factory.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+#include "util/rate_limiter.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+#ifndef ROCKSDB_LITE
+
+namespace ROCKSDB_NAMESPACE {
+
+class EventListenerTest : public DBTestBase {
+ public:
+ EventListenerTest() : DBTestBase("/listener_test") {}
+
+ static std::string BlobStr(uint64_t blob_file_number, uint64_t offset,
+ uint64_t size) {
+ std::string blob_index;
+ BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size,
+ kNoCompression);
+ return blob_index;
+ }
+
+ const size_t k110KB = 110 << 10;
+};
+
+struct TestPropertiesCollector
+ : public ROCKSDB_NAMESPACE::TablePropertiesCollector {
+ ROCKSDB_NAMESPACE::Status AddUserKey(
+ const ROCKSDB_NAMESPACE::Slice& /*key*/,
+ const ROCKSDB_NAMESPACE::Slice& /*value*/,
+ ROCKSDB_NAMESPACE::EntryType /*type*/,
+ ROCKSDB_NAMESPACE::SequenceNumber /*seq*/,
+ uint64_t /*file_size*/) override {
+ return Status::OK();
+ }
+ ROCKSDB_NAMESPACE::Status Finish(
+ ROCKSDB_NAMESPACE::UserCollectedProperties* properties) override {
+ properties->insert({"0", "1"});
+ return Status::OK();
+ }
+
+ const char* Name() const override { return "TestTablePropertiesCollector"; }
+
+ ROCKSDB_NAMESPACE::UserCollectedProperties GetReadableProperties()
+ const override {
+ ROCKSDB_NAMESPACE::UserCollectedProperties ret;
+ ret["2"] = "3";
+ return ret;
+ }
+};
+
+class TestPropertiesCollectorFactory : public TablePropertiesCollectorFactory {
+ public:
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context /*context*/) override {
+ return new TestPropertiesCollector;
+ }
+ const char* Name() const override { return "TestTablePropertiesCollector"; }
+};
+
+class TestCompactionListener : public EventListener {
+ public:
+ explicit TestCompactionListener(EventListenerTest* test) : test_(test) {}
+
+ void OnCompactionCompleted(DB *db, const CompactionJobInfo& ci) override {
+ std::lock_guard<std::mutex> lock(mutex_);
+ compacted_dbs_.push_back(db);
+ ASSERT_GT(ci.input_files.size(), 0U);
+ ASSERT_EQ(ci.input_files.size(), ci.input_file_infos.size());
+
+ for (size_t i = 0; i < ci.input_file_infos.size(); ++i) {
+ ASSERT_EQ(ci.input_file_infos[i].level, ci.base_input_level);
+ ASSERT_EQ(ci.input_file_infos[i].file_number,
+ TableFileNameToNumber(ci.input_files[i]));
+ }
+
+ ASSERT_GT(ci.output_files.size(), 0U);
+ ASSERT_EQ(ci.output_files.size(), ci.output_file_infos.size());
+
+ ASSERT_TRUE(test_);
+ ASSERT_EQ(test_->db_, db);
+
+ std::vector<std::vector<FileMetaData>> files_by_level;
+ test_->dbfull()->TEST_GetFilesMetaData(test_->handles_[ci.cf_id],
+ &files_by_level);
+ ASSERT_GT(files_by_level.size(), ci.output_level);
+
+ for (size_t i = 0; i < ci.output_file_infos.size(); ++i) {
+ ASSERT_EQ(ci.output_file_infos[i].level, ci.output_level);
+ ASSERT_EQ(ci.output_file_infos[i].file_number,
+ TableFileNameToNumber(ci.output_files[i]));
+
+ auto it = std::find_if(
+ files_by_level[ci.output_level].begin(),
+ files_by_level[ci.output_level].end(), [&](const FileMetaData& meta) {
+ return meta.fd.GetNumber() == ci.output_file_infos[i].file_number;
+ });
+ ASSERT_NE(it, files_by_level[ci.output_level].end());
+
+ ASSERT_EQ(ci.output_file_infos[i].oldest_blob_file_number,
+ it->oldest_blob_file_number);
+ }
+
+ ASSERT_EQ(db->GetEnv()->GetThreadID(), ci.thread_id);
+ ASSERT_GT(ci.thread_id, 0U);
+
+ for (auto fl : {ci.input_files, ci.output_files}) {
+ for (auto fn : fl) {
+ auto it = ci.table_properties.find(fn);
+ ASSERT_NE(it, ci.table_properties.end());
+ auto tp = it->second;
+ ASSERT_TRUE(tp != nullptr);
+ ASSERT_EQ(tp->user_collected_properties.find("0")->second, "1");
+ }
+ }
+ }
+
+ EventListenerTest* test_;
+ std::vector<DB*> compacted_dbs_;
+ std::mutex mutex_;
+};
+
+TEST_F(EventListenerTest, OnSingleDBCompactionTest) {
+ const int kTestKeySize = 16;
+ const int kTestValueSize = 984;
+ const int kEntrySize = kTestKeySize + kTestValueSize;
+ const int kEntriesPerBuffer = 100;
+ const int kNumL0Files = 4;
+
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+ options.compaction_style = kCompactionStyleLevel;
+ options.target_file_size_base = options.write_buffer_size;
+ options.max_bytes_for_level_base = options.target_file_size_base * 2;
+ options.max_bytes_for_level_multiplier = 2;
+ options.compression = kNoCompression;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ options.enable_thread_tracking = true;
+#endif // ROCKSDB_USING_THREAD_STATUS
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.table_properties_collector_factories.push_back(
+ std::make_shared<TestPropertiesCollectorFactory>());
+
+ TestCompactionListener* listener = new TestCompactionListener(this);
+ options.listeners.emplace_back(listener);
+ std::vector<std::string> cf_names = {
+ "pikachu", "ilya", "muromec", "dobrynia",
+ "nikitich", "alyosha", "popovich"};
+ CreateAndReopenWithCF(cf_names, options);
+ ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
+
+ WriteBatch batch;
+ ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 1, "ditto",
+ BlobStr(123, 0, 1 << 10)));
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+
+ ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
+ ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
+ ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
+ ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
+ ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
+ ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
+ for (int i = 1; i < 8; ++i) {
+ ASSERT_OK(Flush(i));
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[i],
+ nullptr, nullptr));
+ dbfull()->TEST_WaitForCompact();
+ }
+
+ ASSERT_EQ(listener->compacted_dbs_.size(), cf_names.size());
+ for (size_t i = 0; i < cf_names.size(); ++i) {
+ ASSERT_EQ(listener->compacted_dbs_[i], db_);
+ }
+}
+
+// This simple Listener can only handle one flush at a time.
+class TestFlushListener : public EventListener {
+ public:
+ TestFlushListener(Env* env, EventListenerTest* test)
+ : slowdown_count(0), stop_count(0), db_closed(), env_(env), test_(test) {
+ db_closed = false;
+ }
+ void OnTableFileCreated(
+ const TableFileCreationInfo& info) override {
+ // remember the info for later checking the FlushJobInfo.
+ prev_fc_info_ = info;
+ ASSERT_GT(info.db_name.size(), 0U);
+ ASSERT_GT(info.cf_name.size(), 0U);
+ ASSERT_GT(info.file_path.size(), 0U);
+ ASSERT_GT(info.job_id, 0);
+ ASSERT_GT(info.table_properties.data_size, 0U);
+ ASSERT_GT(info.table_properties.raw_key_size, 0U);
+ ASSERT_GT(info.table_properties.raw_value_size, 0U);
+ ASSERT_GT(info.table_properties.num_data_blocks, 0U);
+ ASSERT_GT(info.table_properties.num_entries, 0U);
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ // Verify the id of the current thread that created this table
+ // file matches the id of any active flush or compaction thread.
+ uint64_t thread_id = env_->GetThreadID();
+ std::vector<ThreadStatus> thread_list;
+ ASSERT_OK(env_->GetThreadList(&thread_list));
+ bool found_match = false;
+ for (auto thread_status : thread_list) {
+ if (thread_status.operation_type == ThreadStatus::OP_FLUSH ||
+ thread_status.operation_type == ThreadStatus::OP_COMPACTION) {
+ if (thread_id == thread_status.thread_id) {
+ found_match = true;
+ break;
+ }
+ }
+ }
+ ASSERT_TRUE(found_match);
+#endif // ROCKSDB_USING_THREAD_STATUS
+ }
+
+ void OnFlushCompleted(
+ DB* db, const FlushJobInfo& info) override {
+ flushed_dbs_.push_back(db);
+ flushed_column_family_names_.push_back(info.cf_name);
+ if (info.triggered_writes_slowdown) {
+ slowdown_count++;
+ }
+ if (info.triggered_writes_stop) {
+ stop_count++;
+ }
+ // verify whether the previously created file matches the flushed file.
+ ASSERT_EQ(prev_fc_info_.db_name, db->GetName());
+ ASSERT_EQ(prev_fc_info_.cf_name, info.cf_name);
+ ASSERT_EQ(prev_fc_info_.job_id, info.job_id);
+ ASSERT_EQ(prev_fc_info_.file_path, info.file_path);
+ ASSERT_EQ(TableFileNameToNumber(info.file_path), info.file_number);
+
+ // Note: the following chunk relies on the notification pertaining to the
+ // database pointed to by DBTestBase::db_, and is thus bypassed when
+ // that assumption does not hold (see the test case MultiDBMultiListeners
+ // below).
+ ASSERT_TRUE(test_);
+ if (db == test_->db_) {
+ std::vector<std::vector<FileMetaData>> files_by_level;
+ test_->dbfull()->TEST_GetFilesMetaData(test_->handles_[info.cf_id],
+ &files_by_level);
+
+ ASSERT_FALSE(files_by_level.empty());
+ auto it = std::find_if(files_by_level[0].begin(), files_by_level[0].end(),
+ [&](const FileMetaData& meta) {
+ return meta.fd.GetNumber() == info.file_number;
+ });
+ ASSERT_NE(it, files_by_level[0].end());
+ ASSERT_EQ(info.oldest_blob_file_number, it->oldest_blob_file_number);
+ }
+
+ ASSERT_EQ(db->GetEnv()->GetThreadID(), info.thread_id);
+ ASSERT_GT(info.thread_id, 0U);
+ ASSERT_EQ(info.table_properties.user_collected_properties.find("0")->second,
+ "1");
+ }
+
+ std::vector<std::string> flushed_column_family_names_;
+ std::vector<DB*> flushed_dbs_;
+ int slowdown_count;
+ int stop_count;
+ bool db_closing;
+ std::atomic_bool db_closed;
+ TableFileCreationInfo prev_fc_info_;
+
+ protected:
+ Env* env_;
+ EventListenerTest* test_;
+};
+
+TEST_F(EventListenerTest, OnSingleDBFlushTest) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.write_buffer_size = k110KB;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ options.enable_thread_tracking = true;
+#endif // ROCKSDB_USING_THREAD_STATUS
+ TestFlushListener* listener = new TestFlushListener(options.env, this);
+ options.listeners.emplace_back(listener);
+ std::vector<std::string> cf_names = {
+ "pikachu", "ilya", "muromec", "dobrynia",
+ "nikitich", "alyosha", "popovich"};
+ options.table_properties_collector_factories.push_back(
+ std::make_shared<TestPropertiesCollectorFactory>());
+ CreateAndReopenWithCF(cf_names, options);
+
+ ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
+
+ WriteBatch batch;
+ ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 1, "ditto",
+ BlobStr(456, 0, 1 << 10)));
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+
+ ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
+ ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
+ ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
+ ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
+ ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
+ ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
+ for (int i = 1; i < 8; ++i) {
+ ASSERT_OK(Flush(i));
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(listener->flushed_dbs_.size(), i);
+ ASSERT_EQ(listener->flushed_column_family_names_.size(), i);
+ }
+
+ // make sure callback functions are called in the right order
+ for (size_t i = 0; i < cf_names.size(); ++i) {
+ ASSERT_EQ(listener->flushed_dbs_[i], db_);
+ ASSERT_EQ(listener->flushed_column_family_names_[i], cf_names[i]);
+ }
+}
+
+TEST_F(EventListenerTest, MultiCF) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.write_buffer_size = k110KB;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ options.enable_thread_tracking = true;
+#endif // ROCKSDB_USING_THREAD_STATUS
+ TestFlushListener* listener = new TestFlushListener(options.env, this);
+ options.listeners.emplace_back(listener);
+ options.table_properties_collector_factories.push_back(
+ std::make_shared<TestPropertiesCollectorFactory>());
+ std::vector<std::string> cf_names = {
+ "pikachu", "ilya", "muromec", "dobrynia",
+ "nikitich", "alyosha", "popovich"};
+ CreateAndReopenWithCF(cf_names, options);
+
+ ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
+ ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
+ ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
+ ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
+ ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
+ ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
+ ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
+ for (int i = 1; i < 8; ++i) {
+ ASSERT_OK(Flush(i));
+ ASSERT_EQ(listener->flushed_dbs_.size(), i);
+ ASSERT_EQ(listener->flushed_column_family_names_.size(), i);
+ }
+
+ // make sure callback functions are called in the right order
+ for (size_t i = 0; i < cf_names.size(); i++) {
+ ASSERT_EQ(listener->flushed_dbs_[i], db_);
+ ASSERT_EQ(listener->flushed_column_family_names_[i], cf_names[i]);
+ }
+}
+
+TEST_F(EventListenerTest, MultiDBMultiListeners) {
+ Options options;
+ options.env = CurrentOptions().env;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ options.enable_thread_tracking = true;
+#endif // ROCKSDB_USING_THREAD_STATUS
+ options.table_properties_collector_factories.push_back(
+ std::make_shared<TestPropertiesCollectorFactory>());
+ std::vector<TestFlushListener*> listeners;
+ const int kNumDBs = 5;
+ const int kNumListeners = 10;
+ for (int i = 0; i < kNumListeners; ++i) {
+ listeners.emplace_back(new TestFlushListener(options.env, this));
+ }
+
+ std::vector<std::string> cf_names = {
+ "pikachu", "ilya", "muromec", "dobrynia",
+ "nikitich", "alyosha", "popovich"};
+
+ options.create_if_missing = true;
+ for (int i = 0; i < kNumListeners; ++i) {
+ options.listeners.emplace_back(listeners[i]);
+ }
+ DBOptions db_opts(options);
+ ColumnFamilyOptions cf_opts(options);
+
+ std::vector<DB*> dbs;
+ std::vector<std::vector<ColumnFamilyHandle *>> vec_handles;
+
+ for (int d = 0; d < kNumDBs; ++d) {
+ ASSERT_OK(DestroyDB(dbname_ + ToString(d), options));
+ DB* db;
+ std::vector<ColumnFamilyHandle*> handles;
+ ASSERT_OK(DB::Open(options, dbname_ + ToString(d), &db));
+ for (size_t c = 0; c < cf_names.size(); ++c) {
+ ColumnFamilyHandle* handle;
+ db->CreateColumnFamily(cf_opts, cf_names[c], &handle);
+ handles.push_back(handle);
+ }
+
+ vec_handles.push_back(std::move(handles));
+ dbs.push_back(db);
+ }
+
+ for (int d = 0; d < kNumDBs; ++d) {
+ for (size_t c = 0; c < cf_names.size(); ++c) {
+ ASSERT_OK(dbs[d]->Put(WriteOptions(), vec_handles[d][c],
+ cf_names[c], cf_names[c]));
+ }
+ }
+
+ for (size_t c = 0; c < cf_names.size(); ++c) {
+ for (int d = 0; d < kNumDBs; ++d) {
+ ASSERT_OK(dbs[d]->Flush(FlushOptions(), vec_handles[d][c]));
+ reinterpret_cast<DBImpl*>(dbs[d])->TEST_WaitForFlushMemTable();
+ }
+ }
+
+ for (auto* listener : listeners) {
+ int pos = 0;
+ for (size_t c = 0; c < cf_names.size(); ++c) {
+ for (int d = 0; d < kNumDBs; ++d) {
+ ASSERT_EQ(listener->flushed_dbs_[pos], dbs[d]);
+ ASSERT_EQ(listener->flushed_column_family_names_[pos], cf_names[c]);
+ pos++;
+ }
+ }
+ }
+
+
+ for (auto handles : vec_handles) {
+ for (auto h : handles) {
+ delete h;
+ }
+ handles.clear();
+ }
+ vec_handles.clear();
+
+ for (auto db : dbs) {
+ delete db;
+ }
+}
+
+TEST_F(EventListenerTest, DisableBGCompaction) {
+ Options options;
+ options.env = CurrentOptions().env;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ options.enable_thread_tracking = true;
+#endif // ROCKSDB_USING_THREAD_STATUS
+ TestFlushListener* listener = new TestFlushListener(options.env, this);
+ const int kCompactionTrigger = 1;
+ const int kSlowdownTrigger = 5;
+ const int kStopTrigger = 100;
+ options.level0_file_num_compaction_trigger = kCompactionTrigger;
+ options.level0_slowdown_writes_trigger = kSlowdownTrigger;
+ options.level0_stop_writes_trigger = kStopTrigger;
+ options.max_write_buffer_number = 10;
+ options.listeners.emplace_back(listener);
+  // BG compaction is disabled. The number of L0 files will simply keep
+  // increasing in this test.
+ options.compaction_style = kCompactionStyleNone;
+ options.compression = kNoCompression;
+ options.write_buffer_size = 100000; // Small write buffer
+ options.table_properties_collector_factories.push_back(
+ std::make_shared<TestPropertiesCollectorFactory>());
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ColumnFamilyMetaData cf_meta;
+ db_->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+
+ // keep writing until writes are forced to stop.
+ for (int i = 0; static_cast<int>(cf_meta.file_count) < kSlowdownTrigger * 10;
+ ++i) {
+ Put(1, ToString(i), std::string(10000, 'x'), WriteOptions());
+ FlushOptions fo;
+ fo.allow_write_stall = true;
+ db_->Flush(fo, handles_[1]);
+ db_->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+ }
+ ASSERT_GE(listener->slowdown_count, kSlowdownTrigger * 9);
+}
+
+class TestCompactionReasonListener : public EventListener {
+ public:
+ void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
+ std::lock_guard<std::mutex> lock(mutex_);
+ compaction_reasons_.push_back(ci.compaction_reason);
+ }
+
+ std::vector<CompactionReason> compaction_reasons_;
+ std::mutex mutex_;
+};
+
+TEST_F(EventListenerTest, CompactionReasonLevel) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile));
+
+ TestCompactionReasonListener* listener = new TestCompactionReasonListener();
+ options.listeners.emplace_back(listener);
+
+ options.level0_file_num_compaction_trigger = 4;
+ options.compaction_style = kCompactionStyleLevel;
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ // Write 4 files in L0
+ for (int i = 0; i < 4; i++) {
+ GenerateNewRandomFile(&rnd);
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ(listener->compaction_reasons_.size(), 1);
+ ASSERT_EQ(listener->compaction_reasons_[0],
+ CompactionReason::kLevelL0FilesNum);
+
+ DestroyAndReopen(options);
+
+ // Write 3 non-overlapping files in L0
+ for (int k = 1; k <= 30; k++) {
+ ASSERT_OK(Put(Key(k), Key(k)));
+ if (k % 10 == 0) {
+ Flush();
+ }
+ }
+
+ // Do a trivial move from L0 -> L1
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ options.max_bytes_for_level_base = 1;
+ Close();
+ listener->compaction_reasons_.clear();
+ Reopen(options);
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_GT(listener->compaction_reasons_.size(), 1);
+
+ for (auto compaction_reason : listener->compaction_reasons_) {
+ ASSERT_EQ(compaction_reason, CompactionReason::kLevelMaxLevelSize);
+ }
+
+ options.disable_auto_compactions = true;
+ Close();
+ listener->compaction_reasons_.clear();
+ Reopen(options);
+
+ Put("key", "value");
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_GT(listener->compaction_reasons_.size(), 0);
+ for (auto compaction_reason : listener->compaction_reasons_) {
+ ASSERT_EQ(compaction_reason, CompactionReason::kManualCompaction);
+ }
+}
+
+TEST_F(EventListenerTest, CompactionReasonUniversal) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile));
+
+ TestCompactionReasonListener* listener = new TestCompactionReasonListener();
+ options.listeners.emplace_back(listener);
+
+ options.compaction_style = kCompactionStyleUniversal;
+
+ Random rnd(301);
+
+ options.level0_file_num_compaction_trigger = 8;
+ options.compaction_options_universal.max_size_amplification_percent = 100000;
+ options.compaction_options_universal.size_ratio = 100000;
+ DestroyAndReopen(options);
+ listener->compaction_reasons_.clear();
+
+ // Write 8 files in L0
+ for (int i = 0; i < 8; i++) {
+ GenerateNewRandomFile(&rnd);
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_GT(listener->compaction_reasons_.size(), 0);
+ for (auto compaction_reason : listener->compaction_reasons_) {
+ ASSERT_EQ(compaction_reason, CompactionReason::kUniversalSizeRatio);
+ }
+
+ options.level0_file_num_compaction_trigger = 8;
+ options.compaction_options_universal.max_size_amplification_percent = 1;
+ options.compaction_options_universal.size_ratio = 100000;
+
+ DestroyAndReopen(options);
+ listener->compaction_reasons_.clear();
+
+ // Write 8 files in L0
+ for (int i = 0; i < 8; i++) {
+ GenerateNewRandomFile(&rnd);
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_GT(listener->compaction_reasons_.size(), 0);
+ for (auto compaction_reason : listener->compaction_reasons_) {
+ ASSERT_EQ(compaction_reason, CompactionReason::kUniversalSizeAmplification);
+ }
+
+ options.disable_auto_compactions = true;
+ Close();
+ listener->compaction_reasons_.clear();
+ Reopen(options);
+
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ ASSERT_GT(listener->compaction_reasons_.size(), 0);
+ for (auto compaction_reason : listener->compaction_reasons_) {
+ ASSERT_EQ(compaction_reason, CompactionReason::kManualCompaction);
+ }
+}
+
+TEST_F(EventListenerTest, CompactionReasonFIFO) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile));
+
+ TestCompactionReasonListener* listener = new TestCompactionReasonListener();
+ options.listeners.emplace_back(listener);
+
+ options.level0_file_num_compaction_trigger = 4;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.compaction_options_fifo.max_table_files_size = 1;
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ // Write 4 files in L0
+ for (int i = 0; i < 4; i++) {
+ GenerateNewRandomFile(&rnd);
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_GT(listener->compaction_reasons_.size(), 0);
+ for (auto compaction_reason : listener->compaction_reasons_) {
+ ASSERT_EQ(compaction_reason, CompactionReason::kFIFOMaxSize);
+ }
+}
+
+class TableFileCreationListener : public EventListener {
+ public:
+ class TestEnv : public EnvWrapper {
+ public:
+ TestEnv() : EnvWrapper(Env::Default()) {}
+
+ void SetStatus(Status s) { status_ = s; }
+
+ Status NewWritableFile(const std::string& fname,
+ std::unique_ptr<WritableFile>* result,
+ const EnvOptions& options) override {
+ if (fname.size() > 4 && fname.substr(fname.size() - 4) == ".sst") {
+ if (!status_.ok()) {
+ return status_;
+ }
+ }
+ return Env::Default()->NewWritableFile(fname, result, options);
+ }
+
+ private:
+ Status status_;
+ };
+
+ TableFileCreationListener() {
+ for (int i = 0; i < 2; i++) {
+ started_[i] = finished_[i] = failure_[i] = 0;
+ }
+ }
+
+ int Index(TableFileCreationReason reason) {
+ int idx;
+ switch (reason) {
+ case TableFileCreationReason::kFlush:
+ idx = 0;
+ break;
+ case TableFileCreationReason::kCompaction:
+ idx = 1;
+ break;
+ default:
+ idx = -1;
+ }
+ return idx;
+ }
+
+ void CheckAndResetCounters(int flush_started, int flush_finished,
+ int flush_failure, int compaction_started,
+ int compaction_finished, int compaction_failure) {
+ ASSERT_EQ(started_[0], flush_started);
+ ASSERT_EQ(finished_[0], flush_finished);
+ ASSERT_EQ(failure_[0], flush_failure);
+ ASSERT_EQ(started_[1], compaction_started);
+ ASSERT_EQ(finished_[1], compaction_finished);
+ ASSERT_EQ(failure_[1], compaction_failure);
+ for (int i = 0; i < 2; i++) {
+ started_[i] = finished_[i] = failure_[i] = 0;
+ }
+ }
+
+ void OnTableFileCreationStarted(
+ const TableFileCreationBriefInfo& info) override {
+ int idx = Index(info.reason);
+ if (idx >= 0) {
+ started_[idx]++;
+ }
+ ASSERT_GT(info.db_name.size(), 0U);
+ ASSERT_GT(info.cf_name.size(), 0U);
+ ASSERT_GT(info.file_path.size(), 0U);
+ ASSERT_GT(info.job_id, 0);
+ }
+
+ void OnTableFileCreated(const TableFileCreationInfo& info) override {
+ int idx = Index(info.reason);
+ if (idx >= 0) {
+ finished_[idx]++;
+ }
+ ASSERT_GT(info.db_name.size(), 0U);
+ ASSERT_GT(info.cf_name.size(), 0U);
+ ASSERT_GT(info.file_path.size(), 0U);
+ ASSERT_GT(info.job_id, 0);
+ if (info.status.ok()) {
+ ASSERT_GT(info.table_properties.data_size, 0U);
+ ASSERT_GT(info.table_properties.raw_key_size, 0U);
+ ASSERT_GT(info.table_properties.raw_value_size, 0U);
+ ASSERT_GT(info.table_properties.num_data_blocks, 0U);
+ ASSERT_GT(info.table_properties.num_entries, 0U);
+ } else {
+ if (idx >= 0) {
+ failure_[idx]++;
+ }
+ }
+ }
+
+ TestEnv test_env;
+ int started_[2];
+ int finished_[2];
+ int failure_[2];
+};
+
+TEST_F(EventListenerTest, TableFileCreationListenersTest) {
+ auto listener = std::make_shared<TableFileCreationListener>();
+ Options options;
+ options.create_if_missing = true;
+ options.listeners.push_back(listener);
+ options.env = &listener->test_env;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo", "aaa"));
+ ASSERT_OK(Put("bar", "bbb"));
+ ASSERT_OK(Flush());
+ dbfull()->TEST_WaitForFlushMemTable();
+ listener->CheckAndResetCounters(1, 1, 0, 0, 0, 0);
+
+ ASSERT_OK(Put("foo", "aaa1"));
+ ASSERT_OK(Put("bar", "bbb1"));
+ listener->test_env.SetStatus(Status::NotSupported("not supported"));
+ ASSERT_NOK(Flush());
+ listener->CheckAndResetCounters(1, 1, 1, 0, 0, 0);
+ listener->test_env.SetStatus(Status::OK());
+
+ Reopen(options);
+ ASSERT_OK(Put("foo", "aaa2"));
+ ASSERT_OK(Put("bar", "bbb2"));
+ ASSERT_OK(Flush());
+ dbfull()->TEST_WaitForFlushMemTable();
+ listener->CheckAndResetCounters(1, 1, 0, 0, 0, 0);
+
+ const Slice kRangeStart = "a";
+ const Slice kRangeEnd = "z";
+ dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd);
+ dbfull()->TEST_WaitForCompact();
+ listener->CheckAndResetCounters(0, 0, 0, 1, 1, 0);
+
+ ASSERT_OK(Put("foo", "aaa3"));
+ ASSERT_OK(Put("bar", "bbb3"));
+ ASSERT_OK(Flush());
+ listener->test_env.SetStatus(Status::NotSupported("not supported"));
+ dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd);
+ dbfull()->TEST_WaitForCompact();
+ listener->CheckAndResetCounters(1, 1, 0, 1, 1, 1);
+}
+
+class MemTableSealedListener : public EventListener {
+ private:
+  SequenceNumber latest_seq_number_;
+
+ public:
+  MemTableSealedListener() {}
+  void OnMemTableSealed(const MemTableInfo& info) override {
+    latest_seq_number_ = info.first_seqno;
+  }
+
+  void OnFlushCompleted(DB* /*db*/,
+                        const FlushJobInfo& flush_job_info) override {
+    ASSERT_LE(flush_job_info.smallest_seqno, latest_seq_number_);
+  }
+};
+
+TEST_F(EventListenerTest, MemTableSealedListenerTest) {
+ auto listener = std::make_shared<MemTableSealedListener>();
+ Options options;
+ options.create_if_missing = true;
+ options.listeners.push_back(listener);
+ DestroyAndReopen(options);
+
+ for (unsigned int i = 0; i < 10; i++) {
+ std::string tag = std::to_string(i);
+ ASSERT_OK(Put("foo"+tag, "aaa"));
+ ASSERT_OK(Put("bar"+tag, "bbb"));
+
+ ASSERT_OK(Flush());
+ }
+}
+
+class ColumnFamilyHandleDeletionStartedListener : public EventListener {
+ private:
+ std::vector<std::string> cfs_;
+ int counter;
+
+ public:
+ explicit ColumnFamilyHandleDeletionStartedListener(
+ const std::vector<std::string>& cfs)
+ : cfs_(cfs), counter(0) {
+ cfs_.insert(cfs_.begin(), kDefaultColumnFamilyName);
+ }
+ void OnColumnFamilyHandleDeletionStarted(
+ ColumnFamilyHandle* handle) override {
+ ASSERT_EQ(cfs_[handle->GetID()], handle->GetName());
+ counter++;
+ }
+ int getCounter() { return counter; }
+};
+
+TEST_F(EventListenerTest, ColumnFamilyHandleDeletionStartedListenerTest) {
+ std::vector<std::string> cfs{"pikachu", "eevee", "Mewtwo"};
+ auto listener =
+ std::make_shared<ColumnFamilyHandleDeletionStartedListener>(cfs);
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ options.listeners.push_back(listener);
+ CreateAndReopenWithCF(cfs, options);
+ ASSERT_EQ(handles_.size(), 4);
+ delete handles_[3];
+ delete handles_[2];
+ delete handles_[1];
+ handles_.resize(1);
+ ASSERT_EQ(listener->getCounter(), 3);
+}
+
+class BackgroundErrorListener : public EventListener {
+ private:
+ SpecialEnv* env_;
+ int counter_;
+
+ public:
+ BackgroundErrorListener(SpecialEnv* env) : env_(env), counter_(0) {}
+
+ void OnBackgroundError(BackgroundErrorReason /*reason*/,
+ Status* bg_error) override {
+ if (counter_ == 0) {
+ // suppress the first error and disable write-dropping such that a retry
+ // can succeed.
+ *bg_error = Status::OK();
+ env_->drop_writes_.store(false, std::memory_order_release);
+ env_->no_slowdown_ = false;
+ }
+ ++counter_;
+ }
+
+ int counter() { return counter_; }
+};
+
+TEST_F(EventListenerTest, BackgroundErrorListenerFailedFlushTest) {
+ auto listener = std::make_shared<BackgroundErrorListener>(env_);
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_;
+ options.listeners.push_back(listener);
+ options.memtable_factory.reset(new SpecialSkipListFactory(1));
+ options.paranoid_checks = true;
+ DestroyAndReopen(options);
+
+ // the usual TEST_WaitForFlushMemTable() doesn't work for failed flushes, so
+ // forge a custom one for the failed flush case.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BGWorkFlush:done",
+ "EventListenerTest:BackgroundErrorListenerFailedFlushTest:1"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
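+  // With this dependency loaded, the TEST_SYNC_POINT(...":1") call below
+  // blocks until the background flush thread has passed
+  // "DBImpl::BGWorkFlush:done", i.e. until the (failed) flush has finished.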
+
+ env_->drop_writes_.store(true, std::memory_order_release);
+ env_->no_slowdown_ = true;
+
+ ASSERT_OK(Put("key0", "val"));
+ ASSERT_OK(Put("key1", "val"));
+ TEST_SYNC_POINT("EventListenerTest:BackgroundErrorListenerFailedFlushTest:1");
+ ASSERT_EQ(1, listener->counter());
+ ASSERT_OK(Put("key2", "val"));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+}
+
+TEST_F(EventListenerTest, BackgroundErrorListenerFailedCompactionTest) {
+ auto listener = std::make_shared<BackgroundErrorListener>(env_);
+ Options options;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.push_back(listener);
+ options.memtable_factory.reset(new SpecialSkipListFactory(2));
+ options.paranoid_checks = true;
+ DestroyAndReopen(options);
+
+ // third iteration triggers the second memtable's flush
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put("key0", "val"));
+ if (i > 0) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ ASSERT_OK(Put("key1", "val"));
+ }
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+
+ env_->drop_writes_.store(true, std::memory_order_release);
+ env_->no_slowdown_ = true;
+ ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(1, listener->counter());
+
+ // trigger flush so compaction is triggered again; this time it succeeds
+ // The previous failed compaction may get retried automatically, so we may
+ // be left with 0 or 1 files in level 1, depending on when the retry gets
+ // scheduled
+ ASSERT_OK(Put("key0", "val"));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_LE(1, NumTableFilesAtLevel(0));
+}
+
+class TestFileOperationListener : public EventListener {
+ public:
+ TestFileOperationListener() {
+ file_reads_.store(0);
+ file_reads_success_.store(0);
+ file_writes_.store(0);
+ file_writes_success_.store(0);
+ }
+
+ void OnFileReadFinish(const FileOperationInfo& info) override {
+ ++file_reads_;
+ if (info.status.ok()) {
+ ++file_reads_success_;
+ }
+ ReportDuration(info);
+ }
+
+ void OnFileWriteFinish(const FileOperationInfo& info) override {
+ ++file_writes_;
+ if (info.status.ok()) {
+ ++file_writes_success_;
+ }
+ ReportDuration(info);
+ }
+
+ bool ShouldBeNotifiedOnFileIO() override { return true; }
+
+ std::atomic<size_t> file_reads_;
+ std::atomic<size_t> file_reads_success_;
+ std::atomic<size_t> file_writes_;
+ std::atomic<size_t> file_writes_success_;
+
+ private:
+ void ReportDuration(const FileOperationInfo& info) const {
+ auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(
+ info.finish_timestamp - info.start_timestamp);
+ ASSERT_GT(duration.count(), 0);
+ }
+};
+
+TEST_F(EventListenerTest, OnFileOperationTest) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+
+ TestFileOperationListener* listener = new TestFileOperationListener();
+ options.listeners.emplace_back(listener);
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "aaa"));
+ dbfull()->Flush(FlushOptions());
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_GE(listener->file_writes_.load(),
+ listener->file_writes_success_.load());
+ ASSERT_GT(listener->file_writes_.load(), 0);
+ Close();
+
+ Reopen(options);
+ ASSERT_GE(listener->file_reads_.load(), listener->file_reads_success_.load());
+ ASSERT_GT(listener->file_reads_.load(), 0);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/log_format.h b/src/rocksdb/db/log_format.h
new file mode 100644
index 000000000..c22e2b6bc
--- /dev/null
+++ b/src/rocksdb/db/log_format.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Log format information shared by reader and writer.
+// See ../doc/log_format.txt for more detail.
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace log {
+
+enum RecordType {
+ // Zero is reserved for preallocated files
+ kZeroType = 0,
+ kFullType = 1,
+
+ // For fragments
+ kFirstType = 2,
+ kMiddleType = 3,
+ kLastType = 4,
+
+ // For recycled log files
+ kRecyclableFullType = 5,
+ kRecyclableFirstType = 6,
+ kRecyclableMiddleType = 7,
+ kRecyclableLastType = 8,
+};
+static const int kMaxRecordType = kRecyclableLastType;
+
+static const unsigned int kBlockSize = 32768;
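+
+// A logical record that does not fit in the space remaining in the current
+// block is written as a kFirstType fragment followed by zero or more
+// kMiddleType fragments and a kLastType fragment; the reader reassembles
+// these into a single record (see log_reader.cc).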
+
+// Header is checksum (4 bytes), length (2 bytes), type (1 byte)
+static const int kHeaderSize = 4 + 2 + 1;
+
+// Recyclable header is checksum (4 bytes), length (2 bytes), type (1 byte),
+// log number (4 bytes).
+static const int kRecyclableHeaderSize = 4 + 2 + 1 + 4;
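+
+// Illustrative example (not part of the format definition): a legacy record
+// carrying the 3-byte payload "foo" occupies kHeaderSize + 3 = 10 bytes:
+//   [crc32c (4 bytes)][length = 0x0003, little-endian (2 bytes)]
+//   [type = kFullType (1 byte)]["foo"]
+// A recyclable record additionally stores the 4-byte log number between the
+// type byte and the payload.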
+
+} // namespace log
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/log_reader.cc b/src/rocksdb/db/log_reader.cc
new file mode 100644
index 000000000..c60a814b9
--- /dev/null
+++ b/src/rocksdb/db/log_reader.cc
@@ -0,0 +1,624 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+
+#include <stdio.h>
+#include "file/sequence_file_reader.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace log {
+
+Reader::Reporter::~Reporter() {
+}
+
+Reader::Reader(std::shared_ptr<Logger> info_log,
+ std::unique_ptr<SequentialFileReader>&& _file,
+ Reporter* reporter, bool checksum, uint64_t log_num)
+ : info_log_(info_log),
+ file_(std::move(_file)),
+ reporter_(reporter),
+ checksum_(checksum),
+ backing_store_(new char[kBlockSize]),
+ buffer_(),
+ eof_(false),
+ read_error_(false),
+ eof_offset_(0),
+ last_record_offset_(0),
+ end_of_buffer_offset_(0),
+ log_number_(log_num),
+ recycled_(false) {}
+
+Reader::~Reader() {
+ delete[] backing_store_;
+}
+
+// For kAbsoluteConsistency, on clean shutdown we don't expect any error
+// in the log files. For other modes, we can ignore only incomplete records
+// in the last log file, which are presumably due to a write in progress
+// during restart (or from log recycling).
+//
+// TODO krad: Evaluate if we need to move to a more strict mode where we
+// restrict the inconsistency to only the last log
+bool Reader::ReadRecord(Slice* record, std::string* scratch,
+ WALRecoveryMode wal_recovery_mode) {
+ scratch->clear();
+ record->clear();
+ bool in_fragmented_record = false;
+ // Record offset of the logical record that we're reading
+ // 0 is a dummy value to make compilers happy
+ uint64_t prospective_record_offset = 0;
+
+ Slice fragment;
+ while (true) {
+ uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
+ size_t drop_size = 0;
+ const unsigned int record_type = ReadPhysicalRecord(&fragment, &drop_size);
+ switch (record_type) {
+ case kFullType:
+ case kRecyclableFullType:
+ if (in_fragmented_record && !scratch->empty()) {
+ // Handle bug in earlier versions of log::Writer where
+ // it could emit an empty kFirstType record at the tail end
+ // of a block followed by a kFullType or kFirstType record
+ // at the beginning of the next block.
+ ReportCorruption(scratch->size(), "partial record without end(1)");
+ }
+ prospective_record_offset = physical_record_offset;
+ scratch->clear();
+ *record = fragment;
+ last_record_offset_ = prospective_record_offset;
+ return true;
+
+ case kFirstType:
+ case kRecyclableFirstType:
+ if (in_fragmented_record && !scratch->empty()) {
+ // Handle bug in earlier versions of log::Writer where
+ // it could emit an empty kFirstType record at the tail end
+ // of a block followed by a kFullType or kFirstType record
+ // at the beginning of the next block.
+ ReportCorruption(scratch->size(), "partial record without end(2)");
+ }
+ prospective_record_offset = physical_record_offset;
+ scratch->assign(fragment.data(), fragment.size());
+ in_fragmented_record = true;
+ break;
+
+ case kMiddleType:
+ case kRecyclableMiddleType:
+ if (!in_fragmented_record) {
+ ReportCorruption(fragment.size(),
+ "missing start of fragmented record(1)");
+ } else {
+ scratch->append(fragment.data(), fragment.size());
+ }
+ break;
+
+ case kLastType:
+ case kRecyclableLastType:
+ if (!in_fragmented_record) {
+ ReportCorruption(fragment.size(),
+ "missing start of fragmented record(2)");
+ } else {
+ scratch->append(fragment.data(), fragment.size());
+ *record = Slice(*scratch);
+ last_record_offset_ = prospective_record_offset;
+ return true;
+ }
+ break;
+
+ case kBadHeader:
+ if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) {
+ // in clean shutdown we don't expect any error in the log files
+ ReportCorruption(drop_size, "truncated header");
+ }
+ FALLTHROUGH_INTENDED;
+
+ case kEof:
+ if (in_fragmented_record) {
+ if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) {
+ // in clean shutdown we don't expect any error in the log files
+ ReportCorruption(scratch->size(), "error reading trailing data");
+ }
+ // This can be caused by the writer dying immediately after
+ // writing a physical record but before completing the next; don't
+ // treat it as a corruption, just ignore the entire logical record.
+ scratch->clear();
+ }
+ return false;
+
+ case kOldRecord:
+ if (wal_recovery_mode != WALRecoveryMode::kSkipAnyCorruptedRecords) {
+ // Treat a record from a previous instance of the log as EOF.
+ if (in_fragmented_record) {
+ if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) {
+ // in clean shutdown we don't expect any error in the log files
+ ReportCorruption(scratch->size(), "error reading trailing data");
+ }
+ // This can be caused by the writer dying immediately after
+ // writing a physical record but before completing the next; don't
+ // treat it as a corruption, just ignore the entire logical record.
+ scratch->clear();
+ }
+ return false;
+ }
+ FALLTHROUGH_INTENDED;
+
+ case kBadRecord:
+ if (in_fragmented_record) {
+ ReportCorruption(scratch->size(), "error in middle of record");
+ in_fragmented_record = false;
+ scratch->clear();
+ }
+ break;
+
+ case kBadRecordLen:
+ case kBadRecordChecksum:
+ if (recycled_ &&
+ wal_recovery_mode ==
+ WALRecoveryMode::kTolerateCorruptedTailRecords) {
+ scratch->clear();
+ return false;
+ }
+ if (record_type == kBadRecordLen) {
+ ReportCorruption(drop_size, "bad record length");
+ } else {
+ ReportCorruption(drop_size, "checksum mismatch");
+ }
+ if (in_fragmented_record) {
+ ReportCorruption(scratch->size(), "error in middle of record");
+ in_fragmented_record = false;
+ scratch->clear();
+ }
+ break;
+
+ default: {
+ char buf[40];
+ snprintf(buf, sizeof(buf), "unknown record type %u", record_type);
+ ReportCorruption(
+ (fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
+ buf);
+ in_fragmented_record = false;
+ scratch->clear();
+ break;
+ }
+ }
+ }
+ return false;
+}
+
+uint64_t Reader::LastRecordOffset() {
+ return last_record_offset_;
+}
+
+void Reader::UnmarkEOF() {
+ if (read_error_) {
+ return;
+ }
+ eof_ = false;
+ if (eof_offset_ == 0) {
+ return;
+ }
+ UnmarkEOFInternal();
+}
+
+void Reader::UnmarkEOFInternal() {
+ // If the EOF was in the middle of a block (a partial block was read) we have
+ // to read the rest of the block as ReadPhysicalRecord can only read full
+ // blocks and expects the file position indicator to be aligned to the start
+ // of a block.
+ //
+ // consumed_bytes + buffer_size() + remaining == kBlockSize
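+  //
+  // For example, if eof_offset_ == 100 and 40 unconsumed bytes remain in
+  // buffer_, then consumed_bytes == 60 and remaining == kBlockSize - 100.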
+
+ size_t consumed_bytes = eof_offset_ - buffer_.size();
+ size_t remaining = kBlockSize - eof_offset_;
+
+ // backing_store_ is used to concatenate what is left in buffer_ and
+ // the remainder of the block. If buffer_ already uses backing_store_,
+ // we just append the new data.
+ if (buffer_.data() != backing_store_ + consumed_bytes) {
+ // Buffer_ does not use backing_store_ for storage.
+    // Copy what is left in buffer_ to backing_store_.
+ memmove(backing_store_ + consumed_bytes, buffer_.data(), buffer_.size());
+ }
+
+ Slice read_buffer;
+ Status status = file_->Read(remaining, &read_buffer,
+ backing_store_ + eof_offset_);
+
+ size_t added = read_buffer.size();
+ end_of_buffer_offset_ += added;
+
+ if (!status.ok()) {
+ if (added > 0) {
+ ReportDrop(added, status);
+ }
+
+ read_error_ = true;
+ return;
+ }
+
+ if (read_buffer.data() != backing_store_ + eof_offset_) {
+ // Read did not write to backing_store_
+ memmove(backing_store_ + eof_offset_, read_buffer.data(),
+ read_buffer.size());
+ }
+
+ buffer_ = Slice(backing_store_ + consumed_bytes,
+ eof_offset_ + added - consumed_bytes);
+
+ if (added < remaining) {
+ eof_ = true;
+ eof_offset_ += added;
+ } else {
+ eof_offset_ = 0;
+ }
+}
+
+void Reader::ReportCorruption(size_t bytes, const char* reason) {
+ ReportDrop(bytes, Status::Corruption(reason));
+}
+
+void Reader::ReportDrop(size_t bytes, const Status& reason) {
+ if (reporter_ != nullptr) {
+ reporter_->Corruption(bytes, reason);
+ }
+}
+
+bool Reader::ReadMore(size_t* drop_size, int *error) {
+ if (!eof_ && !read_error_) {
+ // Last read was a full read, so this is a trailer to skip
+ buffer_.clear();
+ Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
+ end_of_buffer_offset_ += buffer_.size();
+ if (!status.ok()) {
+ buffer_.clear();
+ ReportDrop(kBlockSize, status);
+ read_error_ = true;
+ *error = kEof;
+ return false;
+ } else if (buffer_.size() < static_cast<size_t>(kBlockSize)) {
+ eof_ = true;
+ eof_offset_ = buffer_.size();
+ }
+ return true;
+ } else {
+ // Note that if buffer_ is non-empty, we have a truncated header at the
+ // end of the file, which can be caused by the writer crashing in the
+ // middle of writing the header. Unless explicitly requested we don't
+    // consider this an error, just report EOF.
+ if (buffer_.size()) {
+ *drop_size = buffer_.size();
+ buffer_.clear();
+ *error = kBadHeader;
+ return false;
+ }
+ buffer_.clear();
+ *error = kEof;
+ return false;
+ }
+}
+
+unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size) {
+ while (true) {
+ // We need at least the minimum header size
+ if (buffer_.size() < static_cast<size_t>(kHeaderSize)) {
+ // the default value of r is meaningless because ReadMore will overwrite
+ // it if it returns false; in case it returns true, the return value will
+ // not be used anyway
+ int r = kEof;
+ if (!ReadMore(drop_size, &r)) {
+ return r;
+ }
+ continue;
+ }
+
+ // Parse the header
+ const char* header = buffer_.data();
+ const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
+ const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
+ const unsigned int type = header[6];
+ const uint32_t length = a | (b << 8);
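+    // The 2-byte length is little-endian: e.g. header bytes 0x2A 0x01 at
+    // offsets 4-5 encode a payload length of 0x012A (298).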
+ int header_size = kHeaderSize;
+ if (type >= kRecyclableFullType && type <= kRecyclableLastType) {
+ if (end_of_buffer_offset_ - buffer_.size() == 0) {
+ recycled_ = true;
+ }
+ header_size = kRecyclableHeaderSize;
+ // We need enough for the larger header
+ if (buffer_.size() < static_cast<size_t>(kRecyclableHeaderSize)) {
+ int r = kEof;
+ if (!ReadMore(drop_size, &r)) {
+ return r;
+ }
+ continue;
+ }
+ const uint32_t log_num = DecodeFixed32(header + 7);
+ if (log_num != log_number_) {
+ return kOldRecord;
+ }
+ }
+ if (header_size + length > buffer_.size()) {
+ *drop_size = buffer_.size();
+ buffer_.clear();
+ if (!eof_) {
+ return kBadRecordLen;
+ }
+ // If the end of the file has been reached without reading |length|
+ // bytes of payload, assume the writer died in the middle of writing the
+ // record. Don't report a corruption unless requested.
+ if (*drop_size) {
+ return kBadHeader;
+ }
+ return kEof;
+ }
+
+ if (type == kZeroType && length == 0) {
+ // Skip zero length record without reporting any drops since
+ // such records are produced by the mmap based writing code in
+ // env_posix.cc that preallocates file regions.
+      // NOTE: this should never happen in a DB written by new RocksDB versions,
+ // since we turn off mmap writes to manifest and log files
+ buffer_.clear();
+ return kBadRecord;
+ }
+
+ // Check crc
+ if (checksum_) {
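+      // The stored CRC (the first four header bytes, masked) covers the type
+      // byte and everything after it: type + payload for legacy records,
+      // type + log number + payload for recyclable records.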
+ uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
+ uint32_t actual_crc = crc32c::Value(header + 6, length + header_size - 6);
+ if (actual_crc != expected_crc) {
+ // Drop the rest of the buffer since "length" itself may have
+ // been corrupted and if we trust it, we could find some
+ // fragment of a real log record that just happens to look
+ // like a valid log record.
+ *drop_size = buffer_.size();
+ buffer_.clear();
+ return kBadRecordChecksum;
+ }
+ }
+
+ buffer_.remove_prefix(header_size + length);
+
+ *result = Slice(header + header_size, length);
+ return type;
+ }
+}
+
+bool FragmentBufferedReader::ReadRecord(Slice* record, std::string* scratch,
+ WALRecoveryMode /*unused*/) {
+ assert(record != nullptr);
+ assert(scratch != nullptr);
+ record->clear();
+ scratch->clear();
+
+ uint64_t prospective_record_offset = 0;
+ uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
+ size_t drop_size = 0;
+ unsigned int fragment_type_or_err = 0; // Initialize to make compiler happy
+ Slice fragment;
+ while (TryReadFragment(&fragment, &drop_size, &fragment_type_or_err)) {
+ switch (fragment_type_or_err) {
+ case kFullType:
+ case kRecyclableFullType:
+ if (in_fragmented_record_ && !fragments_.empty()) {
+ ReportCorruption(fragments_.size(), "partial record without end(1)");
+ }
+ fragments_.clear();
+ *record = fragment;
+ prospective_record_offset = physical_record_offset;
+ last_record_offset_ = prospective_record_offset;
+ in_fragmented_record_ = false;
+ return true;
+
+ case kFirstType:
+ case kRecyclableFirstType:
+ if (in_fragmented_record_ || !fragments_.empty()) {
+ ReportCorruption(fragments_.size(), "partial record without end(2)");
+ }
+ prospective_record_offset = physical_record_offset;
+ fragments_.assign(fragment.data(), fragment.size());
+ in_fragmented_record_ = true;
+ break;
+
+ case kMiddleType:
+ case kRecyclableMiddleType:
+ if (!in_fragmented_record_) {
+ ReportCorruption(fragment.size(),
+ "missing start of fragmented record(1)");
+ } else {
+ fragments_.append(fragment.data(), fragment.size());
+ }
+ break;
+
+ case kLastType:
+ case kRecyclableLastType:
+ if (!in_fragmented_record_) {
+ ReportCorruption(fragment.size(),
+ "missing start of fragmented record(2)");
+ } else {
+ fragments_.append(fragment.data(), fragment.size());
+ scratch->assign(fragments_.data(), fragments_.size());
+ fragments_.clear();
+ *record = Slice(*scratch);
+ last_record_offset_ = prospective_record_offset;
+ in_fragmented_record_ = false;
+ return true;
+ }
+ break;
+
+ case kBadHeader:
+ case kBadRecord:
+ case kEof:
+ case kOldRecord:
+ if (in_fragmented_record_) {
+ ReportCorruption(fragments_.size(), "error in middle of record");
+ in_fragmented_record_ = false;
+ fragments_.clear();
+ }
+ break;
+
+ case kBadRecordChecksum:
+ if (recycled_) {
+ fragments_.clear();
+ return false;
+ }
+ ReportCorruption(drop_size, "checksum mismatch");
+ if (in_fragmented_record_) {
+ ReportCorruption(fragments_.size(), "error in middle of record");
+ in_fragmented_record_ = false;
+ fragments_.clear();
+ }
+ break;
+
+ default: {
+ char buf[40];
+ snprintf(buf, sizeof(buf), "unknown record type %u",
+ fragment_type_or_err);
+ ReportCorruption(
+ fragment.size() + (in_fragmented_record_ ? fragments_.size() : 0),
+ buf);
+ in_fragmented_record_ = false;
+ fragments_.clear();
+ break;
+ }
+ }
+ }
+ return false;
+}
+
+void FragmentBufferedReader::UnmarkEOF() {
+ if (read_error_) {
+ return;
+ }
+ eof_ = false;
+ UnmarkEOFInternal();
+}
+
+bool FragmentBufferedReader::TryReadMore(size_t* drop_size, int* error) {
+ if (!eof_ && !read_error_) {
+ // Last read was a full read, so this is a trailer to skip
+ buffer_.clear();
+ Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
+ end_of_buffer_offset_ += buffer_.size();
+ if (!status.ok()) {
+ buffer_.clear();
+ ReportDrop(kBlockSize, status);
+ read_error_ = true;
+ *error = kEof;
+ return false;
+ } else if (buffer_.size() < static_cast<size_t>(kBlockSize)) {
+ eof_ = true;
+ eof_offset_ = buffer_.size();
+ TEST_SYNC_POINT_CALLBACK(
+ "FragmentBufferedLogReader::TryReadMore:FirstEOF", nullptr);
+ }
+ return true;
+ } else if (!read_error_) {
+ UnmarkEOF();
+ }
+ if (!read_error_) {
+ return true;
+ }
+ *error = kEof;
+ *drop_size = buffer_.size();
+ if (buffer_.size() > 0) {
+ *error = kBadHeader;
+ }
+ buffer_.clear();
+ return false;
+}
+
+// return true if the caller should process the fragment_type_or_err.
+bool FragmentBufferedReader::TryReadFragment(
+ Slice* fragment, size_t* drop_size, unsigned int* fragment_type_or_err) {
+ assert(fragment != nullptr);
+ assert(drop_size != nullptr);
+ assert(fragment_type_or_err != nullptr);
+
+ while (buffer_.size() < static_cast<size_t>(kHeaderSize)) {
+ size_t old_size = buffer_.size();
+ int error = kEof;
+ if (!TryReadMore(drop_size, &error)) {
+ *fragment_type_or_err = error;
+ return false;
+ } else if (old_size == buffer_.size()) {
+ return false;
+ }
+ }
+ const char* header = buffer_.data();
+ const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
+ const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
+ const unsigned int type = header[6];
+ const uint32_t length = a | (b << 8);
+ int header_size = kHeaderSize;
+ if (type >= kRecyclableFullType && type <= kRecyclableLastType) {
+ if (end_of_buffer_offset_ - buffer_.size() == 0) {
+ recycled_ = true;
+ }
+ header_size = kRecyclableHeaderSize;
+ while (buffer_.size() < static_cast<size_t>(kRecyclableHeaderSize)) {
+ size_t old_size = buffer_.size();
+ int error = kEof;
+ if (!TryReadMore(drop_size, &error)) {
+ *fragment_type_or_err = error;
+ return false;
+ } else if (old_size == buffer_.size()) {
+ return false;
+ }
+ }
+ const uint32_t log_num = DecodeFixed32(header + 7);
+ if (log_num != log_number_) {
+ *fragment_type_or_err = kOldRecord;
+ return true;
+ }
+ }
+
+ while (header_size + length > buffer_.size()) {
+ size_t old_size = buffer_.size();
+ int error = kEof;
+ if (!TryReadMore(drop_size, &error)) {
+ *fragment_type_or_err = error;
+ return false;
+ } else if (old_size == buffer_.size()) {
+ return false;
+ }
+ }
+
+ if (type == kZeroType && length == 0) {
+ buffer_.clear();
+ *fragment_type_or_err = kBadRecord;
+ return true;
+ }
+
+ if (checksum_) {
+ uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
+ uint32_t actual_crc = crc32c::Value(header + 6, length + header_size - 6);
+ if (actual_crc != expected_crc) {
+ *drop_size = buffer_.size();
+ buffer_.clear();
+ *fragment_type_or_err = kBadRecordChecksum;
+ return true;
+ }
+ }
+
+ buffer_.remove_prefix(header_size + length);
+
+ *fragment = Slice(header + header_size, length);
+ *fragment_type_or_err = type;
+ return true;
+}
+
+} // namespace log
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/log_reader.h b/src/rocksdb/db/log_reader.h
new file mode 100644
index 000000000..293ae957c
--- /dev/null
+++ b/src/rocksdb/db/log_reader.h
@@ -0,0 +1,189 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <memory>
+#include <stdint.h>
+
+#include "db/log_format.h"
+#include "file/sequence_file_reader.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Logger;
+
+namespace log {
+
+/**
+ * Reader is a general purpose log stream reader implementation. The actual job
+ * of reading from the device is implemented by the SequentialFile interface.
+ *
+ * Please see Writer for details on the file and record layout.
+ */
+class Reader {
+ public:
+ // Interface for reporting errors.
+ class Reporter {
+ public:
+ virtual ~Reporter();
+
+ // Some corruption was detected. "size" is the approximate number
+ // of bytes dropped due to the corruption.
+ virtual void Corruption(size_t bytes, const Status& status) = 0;
+ };
+
+ // Create a reader that will return log records from "*file".
+ // "*file" must remain live while this Reader is in use.
+ //
+ // If "reporter" is non-nullptr, it is notified whenever some data is
+ // dropped due to a detected corruption. "*reporter" must remain
+ // live while this Reader is in use.
+ //
+ // If "checksum" is true, verify checksums if available.
+ Reader(std::shared_ptr<Logger> info_log,
+ // @lint-ignore TXT2 T25377293 Grandfathered in
+ std::unique_ptr<SequentialFileReader>&& file, Reporter* reporter,
+ bool checksum, uint64_t log_num);
+ // No copying allowed
+ Reader(const Reader&) = delete;
+ void operator=(const Reader&) = delete;
+
+ virtual ~Reader();
+
+ // Read the next record into *record. Returns true if read
+ // successfully, false if we hit end of the input. May use
+ // "*scratch" as temporary storage. The contents filled in *record
+ // will only be valid until the next mutating operation on this
+ // reader or the next mutation to *scratch.
+ virtual bool ReadRecord(Slice* record, std::string* scratch,
+ WALRecoveryMode wal_recovery_mode =
+ WALRecoveryMode::kTolerateCorruptedTailRecords);
+
+ // Returns the physical offset of the last record returned by ReadRecord.
+ //
+ // Undefined before the first call to ReadRecord.
+ uint64_t LastRecordOffset();
+
+ // returns true if the reader has encountered an eof condition.
+ bool IsEOF() {
+ return eof_;
+ }
+
+  // returns true if the reader has encountered a read error.
+ bool hasReadError() const { return read_error_; }
+
+  // When we know more data has been written to the file, we can use this
+  // function to force the reader to look again in the file.
+ // Also aligns the file position indicator to the start of the next block
+ // by reading the rest of the data from the EOF position to the end of the
+ // block that was partially read.
+ virtual void UnmarkEOF();
+
+ SequentialFileReader* file() { return file_.get(); }
+
+ Reporter* GetReporter() const { return reporter_; }
+
+ uint64_t GetLogNumber() const { return log_number_; }
+
+ size_t GetReadOffset() const {
+ return static_cast<size_t>(end_of_buffer_offset_);
+ }
+
+ protected:
+ std::shared_ptr<Logger> info_log_;
+ const std::unique_ptr<SequentialFileReader> file_;
+ Reporter* const reporter_;
+ bool const checksum_;
+ char* const backing_store_;
+
+ // Internal state variables used for reading records
+ Slice buffer_;
+ bool eof_; // Last Read() indicated EOF by returning < kBlockSize
+ bool read_error_; // Error occurred while reading from file
+
+ // Offset of the file position indicator within the last block when an
+ // EOF was detected.
+ size_t eof_offset_;
+
+ // Offset of the last record returned by ReadRecord.
+ uint64_t last_record_offset_;
+ // Offset of the first location past the end of buffer_.
+ uint64_t end_of_buffer_offset_;
+
+ // which log number this is
+ uint64_t const log_number_;
+
+ // Whether this is a recycled log file
+ bool recycled_;
+
+ // Extend record types with the following special values
+ enum {
+ kEof = kMaxRecordType + 1,
+ // Returned whenever we find an invalid physical record.
+    // Currently there are two situations in which this happens:
+ // * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
+ // * The record is a 0-length record (No drop is reported)
+ kBadRecord = kMaxRecordType + 2,
+ // Returned when we fail to read a valid header.
+ kBadHeader = kMaxRecordType + 3,
+ // Returned when we read an old record from a previous user of the log.
+ kOldRecord = kMaxRecordType + 4,
+ // Returned when we get a bad record length
+ kBadRecordLen = kMaxRecordType + 5,
+ // Returned when we get a bad record checksum
+ kBadRecordChecksum = kMaxRecordType + 6,
+ };
+
+ // Return type, or one of the preceding special values
+ unsigned int ReadPhysicalRecord(Slice* result, size_t* drop_size);
+
+ // Read some more
+ bool ReadMore(size_t* drop_size, int *error);
+
+ void UnmarkEOFInternal();
+
+ // Reports dropped bytes to the reporter.
+ // buffer_ must be updated to remove the dropped bytes prior to invocation.
+ void ReportCorruption(size_t bytes, const char* reason);
+ void ReportDrop(size_t bytes, const Status& reason);
+};
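+
+// A minimal usage sketch (illustrative only; assumes `file_reader` is an
+// already-constructed std::unique_ptr<SequentialFileReader> and `my_reporter`
+// implements Reader::Reporter):
+//
+//   log::Reader reader(nullptr /* info_log */, std::move(file_reader),
+//                      &my_reporter, true /* checksum */, 0 /* log_num */);
+//   Slice record;
+//   std::string scratch;
+//   while (reader.ReadRecord(&record, &scratch)) {
+//     // `record` holds one complete logical record; it is only valid until
+//     // the next ReadRecord() call or a mutation of `scratch`.
+//   }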
+
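+// Unlike the base Reader, FragmentBufferedReader keeps partially read
+// fragments buffered across ReadRecord() calls, so a read that stopped at
+// EOF mid-record can be resumed later (after more data has been appended)
+// without losing the fragments already seen.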
+class FragmentBufferedReader : public Reader {
+ public:
+ FragmentBufferedReader(std::shared_ptr<Logger> info_log,
+ // @lint-ignore TXT2 T25377293 Grandfathered in
+ std::unique_ptr<SequentialFileReader>&& _file,
+ Reporter* reporter, bool checksum, uint64_t log_num)
+ : Reader(info_log, std::move(_file), reporter, checksum, log_num),
+ fragments_(),
+ in_fragmented_record_(false) {}
+ ~FragmentBufferedReader() override {}
+ bool ReadRecord(Slice* record, std::string* scratch,
+ WALRecoveryMode wal_recovery_mode =
+ WALRecoveryMode::kTolerateCorruptedTailRecords) override;
+ void UnmarkEOF() override;
+
+ private:
+ std::string fragments_;
+ bool in_fragmented_record_;
+
+ bool TryReadFragment(Slice* result, size_t* drop_size,
+ unsigned int* fragment_type_or_err);
+
+ bool TryReadMore(size_t* drop_size, int* error);
+
+ // No copy allowed
+ FragmentBufferedReader(const FragmentBufferedReader&);
+ void operator=(const FragmentBufferedReader&);
+};
+
+} // namespace log
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/log_test.cc b/src/rocksdb/db/log_test.cc
new file mode 100644
index 000000000..849b89d8a
--- /dev/null
+++ b/src/rocksdb/db/log_test.cc
@@ -0,0 +1,928 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "env/composite_env_wrapper.h"
+#include "file/sequence_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/env.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace log {
+
+// Construct a string of the specified length made out of the supplied
+// partial string.
+static std::string BigString(const std::string& partial_string, size_t n) {
+ std::string result;
+ while (result.size() < n) {
+ result.append(partial_string);
+ }
+ result.resize(n);
+ return result;
+}
+
+// Construct a string from a number
+static std::string NumberString(int n) {
+ char buf[50];
+ snprintf(buf, sizeof(buf), "%d.", n);
+ return std::string(buf);
+}
+
+// Return a skewed potentially long string
+static std::string RandomSkewedString(int i, Random* rnd) {
+ return BigString(NumberString(i), rnd->Skewed(17));
+}
+
+// Param type is tuple<int, bool>
+// get<0>(tuple): non-zero if recycling log, zero if regular log
+// get<1>(tuple): true if allow retry after read EOF, false otherwise
+class LogTest : public ::testing::TestWithParam<std::tuple<int, bool>> {
+ private:
+ class StringSource : public SequentialFile {
+ public:
+ Slice& contents_;
+ bool force_error_;
+ size_t force_error_position_;
+ bool force_eof_;
+ size_t force_eof_position_;
+ bool returned_partial_;
+ bool fail_after_read_partial_;
+ explicit StringSource(Slice& contents, bool fail_after_read_partial)
+ : contents_(contents),
+ force_error_(false),
+ force_error_position_(0),
+ force_eof_(false),
+ force_eof_position_(0),
+ returned_partial_(false),
+ fail_after_read_partial_(fail_after_read_partial) {}
+
+ Status Read(size_t n, Slice* result, char* scratch) override {
+ if (fail_after_read_partial_) {
+ EXPECT_TRUE(!returned_partial_) << "must not Read() after eof/error";
+ }
+
+ if (force_error_) {
+ if (force_error_position_ >= n) {
+ force_error_position_ -= n;
+ } else {
+ *result = Slice(contents_.data(), force_error_position_);
+ contents_.remove_prefix(force_error_position_);
+ force_error_ = false;
+ returned_partial_ = true;
+ return Status::Corruption("read error");
+ }
+ }
+
+ if (contents_.size() < n) {
+ n = contents_.size();
+ returned_partial_ = true;
+ }
+
+ if (force_eof_) {
+ if (force_eof_position_ >= n) {
+ force_eof_position_ -= n;
+ } else {
+ force_eof_ = false;
+ n = force_eof_position_;
+ returned_partial_ = true;
+ }
+ }
+
+ // By using scratch we ensure that caller has control over the
+ // lifetime of result.data()
+ memcpy(scratch, contents_.data(), n);
+ *result = Slice(scratch, n);
+
+ contents_.remove_prefix(n);
+ return Status::OK();
+ }
+
+ Status Skip(uint64_t n) override {
+ if (n > contents_.size()) {
+ contents_.clear();
+        return Status::NotFound("in-memory file skipped past end");
+ }
+
+ contents_.remove_prefix(n);
+
+ return Status::OK();
+ }
+ };
+
+ inline StringSource* GetStringSourceFromLegacyReader(
+ SequentialFileReader* reader) {
+ LegacySequentialFileWrapper* file =
+ static_cast<LegacySequentialFileWrapper*>(reader->file());
+ return static_cast<StringSource*>(file->target());
+ }
+
+ class ReportCollector : public Reader::Reporter {
+ public:
+ size_t dropped_bytes_;
+ std::string message_;
+
+ ReportCollector() : dropped_bytes_(0) { }
+ void Corruption(size_t bytes, const Status& status) override {
+ dropped_bytes_ += bytes;
+ message_.append(status.ToString());
+ }
+ };
+
+ std::string& dest_contents() {
+ auto dest = test::GetStringSinkFromLegacyWriter(writer_.file());
+ assert(dest);
+ return dest->contents_;
+ }
+
+ const std::string& dest_contents() const {
+ auto dest = test::GetStringSinkFromLegacyWriter(writer_.file());
+ assert(dest);
+ return dest->contents_;
+ }
+
+ void reset_source_contents() {
+ auto src = GetStringSourceFromLegacyReader(reader_->file());
+ assert(src);
+ src->contents_ = dest_contents();
+ }
+
+ Slice reader_contents_;
+ std::unique_ptr<WritableFileWriter> dest_holder_;
+ std::unique_ptr<SequentialFileReader> source_holder_;
+ ReportCollector report_;
+ Writer writer_;
+ std::unique_ptr<Reader> reader_;
+
+ protected:
+ bool allow_retry_read_;
+
+ public:
+ LogTest()
+ : reader_contents_(),
+ dest_holder_(test::GetWritableFileWriter(
+ new test::StringSink(&reader_contents_), "" /* don't care */)),
+ source_holder_(test::GetSequentialFileReader(
+ new StringSource(reader_contents_, !std::get<1>(GetParam())),
+ "" /* file name */)),
+ writer_(std::move(dest_holder_), 123, std::get<0>(GetParam())),
+ allow_retry_read_(std::get<1>(GetParam())) {
+ if (allow_retry_read_) {
+ reader_.reset(new FragmentBufferedReader(
+ nullptr, std::move(source_holder_), &report_, true /* checksum */,
+ 123 /* log_number */));
+ } else {
+ reader_.reset(new Reader(nullptr, std::move(source_holder_), &report_,
+ true /* checksum */, 123 /* log_number */));
+ }
+ }
+
+ Slice* get_reader_contents() { return &reader_contents_; }
+
+ void Write(const std::string& msg) {
+ writer_.AddRecord(Slice(msg));
+ }
+
+ size_t WrittenBytes() const {
+ return dest_contents().size();
+ }
+
+ std::string Read(const WALRecoveryMode wal_recovery_mode =
+ WALRecoveryMode::kTolerateCorruptedTailRecords) {
+ std::string scratch;
+ Slice record;
+ bool ret = false;
+ ret = reader_->ReadRecord(&record, &scratch, wal_recovery_mode);
+ if (ret) {
+ return record.ToString();
+ } else {
+ return "EOF";
+ }
+ }
+
+ void IncrementByte(int offset, char delta) {
+ dest_contents()[offset] += delta;
+ }
+
+ void SetByte(int offset, char new_byte) {
+ dest_contents()[offset] = new_byte;
+ }
+
+ void ShrinkSize(int bytes) {
+ auto dest = test::GetStringSinkFromLegacyWriter(writer_.file());
+ assert(dest);
+ dest->Drop(bytes);
+ }
+
+ void FixChecksum(int header_offset, int len, bool recyclable) {
+ // Compute crc of type/len/data
+ int header_size = recyclable ? kRecyclableHeaderSize : kHeaderSize;
+ uint32_t crc = crc32c::Value(&dest_contents()[header_offset + 6],
+ header_size - 6 + len);
+ crc = crc32c::Mask(crc);
+ EncodeFixed32(&dest_contents()[header_offset], crc);
+ }
+
+ void ForceError(size_t position = 0) {
+ auto src = GetStringSourceFromLegacyReader(reader_->file());
+ src->force_error_ = true;
+ src->force_error_position_ = position;
+ }
+
+ size_t DroppedBytes() const {
+ return report_.dropped_bytes_;
+ }
+
+ std::string ReportMessage() const {
+ return report_.message_;
+ }
+
+ void ForceEOF(size_t position = 0) {
+ auto src = GetStringSourceFromLegacyReader(reader_->file());
+ src->force_eof_ = true;
+ src->force_eof_position_ = position;
+ }
+
+ void UnmarkEOF() {
+ auto src = GetStringSourceFromLegacyReader(reader_->file());
+ src->returned_partial_ = false;
+ reader_->UnmarkEOF();
+ }
+
+ bool IsEOF() { return reader_->IsEOF(); }
+
+ // Returns OK iff recorded error message contains "msg"
+ std::string MatchError(const std::string& msg) const {
+ if (report_.message_.find(msg) == std::string::npos) {
+ return report_.message_;
+ } else {
+ return "OK";
+ }
+ }
+};
+
+TEST_P(LogTest, Empty) { ASSERT_EQ("EOF", Read()); }
+
+TEST_P(LogTest, ReadWrite) {
+ Write("foo");
+ Write("bar");
+ Write("");
+ Write("xxxx");
+ ASSERT_EQ("foo", Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("", Read());
+ ASSERT_EQ("xxxx", Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ("EOF", Read()); // Make sure reads at eof work
+}
+
+TEST_P(LogTest, ManyBlocks) {
+ for (int i = 0; i < 100000; i++) {
+ Write(NumberString(i));
+ }
+ for (int i = 0; i < 100000; i++) {
+ ASSERT_EQ(NumberString(i), Read());
+ }
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, Fragmentation) {
+ Write("small");
+ Write(BigString("medium", 50000));
+ Write(BigString("large", 100000));
+ ASSERT_EQ("small", Read());
+ ASSERT_EQ(BigString("medium", 50000), Read());
+ ASSERT_EQ(BigString("large", 100000), Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, MarginalTrailer) {
+ // Make a trailer that is exactly the same length as an empty record.
+ int header_size =
+ std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize;
+ const int n = kBlockSize - 2 * header_size;
+ Write(BigString("foo", n));
+ ASSERT_EQ((unsigned int)(kBlockSize - header_size), WrittenBytes());
+ Write("");
+ Write("bar");
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_EQ("", Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, MarginalTrailer2) {
+ // Make a trailer that is exactly the same length as an empty record.
+ int header_size =
+ std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize;
+ const int n = kBlockSize - 2 * header_size;
+ Write(BigString("foo", n));
+ ASSERT_EQ((unsigned int)(kBlockSize - header_size), WrittenBytes());
+ Write("bar");
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(0U, DroppedBytes());
+ ASSERT_EQ("", ReportMessage());
+}
+
+TEST_P(LogTest, ShortTrailer) {
+ int header_size =
+ std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize;
+ const int n = kBlockSize - 2 * header_size + 4;
+ Write(BigString("foo", n));
+ ASSERT_EQ((unsigned int)(kBlockSize - header_size + 4), WrittenBytes());
+ Write("");
+ Write("bar");
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_EQ("", Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, AlignedEof) {
+ int header_size =
+ std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize;
+ const int n = kBlockSize - 2 * header_size + 4;
+ Write(BigString("foo", n));
+ ASSERT_EQ((unsigned int)(kBlockSize - header_size + 4), WrittenBytes());
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, RandomRead) {
+ const int N = 500;
+ Random write_rnd(301);
+ for (int i = 0; i < N; i++) {
+ Write(RandomSkewedString(i, &write_rnd));
+ }
+ Random read_rnd(301);
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read());
+ }
+ ASSERT_EQ("EOF", Read());
+}
+
+// Tests of all the error paths in log_reader.cc follow:
+
+TEST_P(LogTest, ReadError) {
+ Write("foo");
+ ForceError();
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ((unsigned int)kBlockSize, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("read error"));
+}
+
+TEST_P(LogTest, BadRecordType) {
+ Write("foo");
+ // Type is stored in header[6]
+ IncrementByte(6, 100);
+ FixChecksum(0, 3, false);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3U, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("unknown record type"));
+}
+
+TEST_P(LogTest, TruncatedTrailingRecordIsIgnored) {
+ Write("foo");
+ ShrinkSize(4); // Drop all payload as well as a header byte
+ ASSERT_EQ("EOF", Read());
+ // Truncated last record is ignored, not treated as an error
+ ASSERT_EQ(0U, DroppedBytes());
+ ASSERT_EQ("", ReportMessage());
+}
+
+TEST_P(LogTest, TruncatedTrailingRecordIsNotIgnored) {
+ if (allow_retry_read_) {
+ // If read retry is allowed, then truncated trailing record should not
+ // raise an error.
+ return;
+ }
+ Write("foo");
+ ShrinkSize(4); // Drop all payload as well as a header byte
+ ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
+  // In kAbsoluteConsistency mode the truncated last record is treated as an
+  // error and reported as dropped bytes.
+ ASSERT_GT(DroppedBytes(), 0U);
+ ASSERT_EQ("OK", MatchError("Corruption: truncated header"));
+}
+
+TEST_P(LogTest, BadLength) {
+ if (allow_retry_read_) {
+ // If read retry is allowed, then we should not raise an error when the
+ // record length specified in header is longer than data currently
+ // available. It's possible that the body of the record is not written yet.
+ return;
+ }
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ int header_size = recyclable_log ? kRecyclableHeaderSize : kHeaderSize;
+ const int kPayloadSize = kBlockSize - header_size;
+ Write(BigString("bar", kPayloadSize));
+ Write("foo");
+ // Least significant size byte is stored in header[4].
+ IncrementByte(4, 1);
+ if (!recyclable_log) {
+ ASSERT_EQ("foo", Read());
+ ASSERT_EQ(kBlockSize, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("bad record length"));
+ } else {
+ ASSERT_EQ("EOF", Read());
+ }
+}
+
+TEST_P(LogTest, BadLengthAtEndIsIgnored) {
+ if (allow_retry_read_) {
+ // If read retry is allowed, then we should not raise an error when the
+ // record length specified in header is longer than data currently
+ // available. It's possible that the body of the record is not written yet.
+ return;
+ }
+ Write("foo");
+ ShrinkSize(1);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(0U, DroppedBytes());
+ ASSERT_EQ("", ReportMessage());
+}
+
+TEST_P(LogTest, BadLengthAtEndIsNotIgnored) {
+ if (allow_retry_read_) {
+ // If read retry is allowed, then we should not raise an error when the
+ // record length specified in header is longer than data currently
+ // available. It's possible that the body of the record is not written yet.
+ return;
+ }
+ Write("foo");
+ ShrinkSize(1);
+ ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
+ ASSERT_GT(DroppedBytes(), 0U);
+ ASSERT_EQ("OK", MatchError("Corruption: truncated header"));
+}
+
+TEST_P(LogTest, ChecksumMismatch) {
+ Write("foooooo");
+ IncrementByte(0, 14);
+ ASSERT_EQ("EOF", Read());
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ if (!recyclable_log) {
+ ASSERT_EQ(14U, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("checksum mismatch"));
+ } else {
+ ASSERT_EQ(0U, DroppedBytes());
+ ASSERT_EQ("", ReportMessage());
+ }
+}
+
+TEST_P(LogTest, UnexpectedMiddleType) {
+ Write("foo");
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ SetByte(6, static_cast<char>(recyclable_log ? kRecyclableMiddleType
+ : kMiddleType));
+ FixChecksum(0, 3, !!recyclable_log);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3U, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+TEST_P(LogTest, UnexpectedLastType) {
+ Write("foo");
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ SetByte(6,
+ static_cast<char>(recyclable_log ? kRecyclableLastType : kLastType));
+ FixChecksum(0, 3, !!recyclable_log);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3U, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+TEST_P(LogTest, UnexpectedFullType) {
+ Write("foo");
+ Write("bar");
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ SetByte(
+ 6, static_cast<char>(recyclable_log ? kRecyclableFirstType : kFirstType));
+ FixChecksum(0, 3, !!recyclable_log);
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3U, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST_P(LogTest, UnexpectedFirstType) {
+ Write("foo");
+ Write(BigString("bar", 100000));
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ SetByte(
+ 6, static_cast<char>(recyclable_log ? kRecyclableFirstType : kFirstType));
+ FixChecksum(0, 3, !!recyclable_log);
+ ASSERT_EQ(BigString("bar", 100000), Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3U, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST_P(LogTest, MissingLastIsIgnored) {
+ Write(BigString("bar", kBlockSize));
+ // Remove the LAST block, including header.
+ ShrinkSize(14);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ("", ReportMessage());
+ ASSERT_EQ(0U, DroppedBytes());
+}
+
+TEST_P(LogTest, MissingLastIsNotIgnored) {
+ if (allow_retry_read_) {
+ // If read retry is allowed, then truncated trailing record should not
+ // raise an error.
+ return;
+ }
+ Write(BigString("bar", kBlockSize));
+ // Remove the LAST block, including header.
+ ShrinkSize(14);
+ ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
+ ASSERT_GT(DroppedBytes(), 0U);
+ ASSERT_EQ("OK", MatchError("Corruption: error reading trailing data"));
+}
+
+TEST_P(LogTest, PartialLastIsIgnored) {
+ Write(BigString("bar", kBlockSize));
+ // Cause a bad record length in the LAST block.
+ ShrinkSize(1);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ("", ReportMessage());
+ ASSERT_EQ(0U, DroppedBytes());
+}
+
+TEST_P(LogTest, PartialLastIsNotIgnored) {
+ if (allow_retry_read_) {
+ // If read retry is allowed, then truncated trailing record should not
+ // raise an error.
+ return;
+ }
+ Write(BigString("bar", kBlockSize));
+ // Cause a bad record length in the LAST block.
+ ShrinkSize(1);
+ ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
+ ASSERT_GT(DroppedBytes(), 0U);
+ ASSERT_EQ("OK", MatchError(
+ "Corruption: truncated headerCorruption: "
+ "error reading trailing data"));
+}
+
+TEST_P(LogTest, ErrorJoinsRecords) {
+ // Consider two fragmented records:
+ // first(R1) last(R1) first(R2) last(R2)
+ // where the middle two fragments disappear. We do not want
+ // first(R1),last(R2) to get joined and returned as a valid record.
+
+ // Write records that span two blocks
+ Write(BigString("foo", kBlockSize));
+ Write(BigString("bar", kBlockSize));
+ Write("correct");
+
+ // Wipe the middle block
+ for (unsigned int offset = kBlockSize; offset < 2*kBlockSize; offset++) {
+ SetByte(offset, 'x');
+ }
+
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ if (!recyclable_log) {
+ ASSERT_EQ("correct", Read());
+ ASSERT_EQ("EOF", Read());
+ size_t dropped = DroppedBytes();
+ ASSERT_LE(dropped, 2 * kBlockSize + 100);
+ ASSERT_GE(dropped, 2 * kBlockSize);
+ } else {
+ ASSERT_EQ("EOF", Read());
+ }
+}
+
+TEST_P(LogTest, ClearEofSingleBlock) {
+ Write("foo");
+ Write("bar");
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ int header_size = recyclable_log ? kRecyclableHeaderSize : kHeaderSize;
+ ForceEOF(3 + header_size + 2);
+ ASSERT_EQ("foo", Read());
+ UnmarkEOF();
+ ASSERT_EQ("bar", Read());
+ ASSERT_TRUE(IsEOF());
+ ASSERT_EQ("EOF", Read());
+ Write("xxx");
+ UnmarkEOF();
+ ASSERT_EQ("xxx", Read());
+ ASSERT_TRUE(IsEOF());
+}
+
+TEST_P(LogTest, ClearEofMultiBlock) {
+ size_t num_full_blocks = 5;
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ int header_size = recyclable_log ? kRecyclableHeaderSize : kHeaderSize;
+ size_t n = (kBlockSize - header_size) * num_full_blocks + 25;
+ Write(BigString("foo", n));
+ Write(BigString("bar", n));
+ ForceEOF(n + num_full_blocks * header_size + header_size + 3);
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_TRUE(IsEOF());
+ UnmarkEOF();
+ ASSERT_EQ(BigString("bar", n), Read());
+ ASSERT_TRUE(IsEOF());
+ Write(BigString("xxx", n));
+ UnmarkEOF();
+ ASSERT_EQ(BigString("xxx", n), Read());
+ ASSERT_TRUE(IsEOF());
+}
+
+TEST_P(LogTest, ClearEofError) {
+ // If an error occurs during Read() in UnmarkEOF(), the records contained
+ // in the buffer should be returned on subsequent calls of ReadRecord()
+ // until no more full records are left, whereafter ReadRecord() should return
+ // false to indicate that it cannot read any further.
+
+ Write("foo");
+ Write("bar");
+ UnmarkEOF();
+ ASSERT_EQ("foo", Read());
+ ASSERT_TRUE(IsEOF());
+ Write("xxx");
+ ForceError(0);
+ UnmarkEOF();
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, ClearEofError2) {
+ Write("foo");
+ Write("bar");
+ UnmarkEOF();
+ ASSERT_EQ("foo", Read());
+ Write("xxx");
+ ForceError(3);
+ UnmarkEOF();
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3U, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("read error"));
+}
+
+TEST_P(LogTest, Recycle) {
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ if (!recyclable_log) {
+ return; // test is only valid for recycled logs
+ }
+ Write("foo");
+ Write("bar");
+ Write("baz");
+ Write("bif");
+ Write("blitz");
+ while (get_reader_contents()->size() < log::kBlockSize * 2) {
+ Write("xxxxxxxxxxxxxxxx");
+ }
+ std::unique_ptr<WritableFileWriter> dest_holder(test::GetWritableFileWriter(
+ new test::OverwritingStringSink(get_reader_contents()),
+ "" /* don't care */));
+ Writer recycle_writer(std::move(dest_holder), 123, true);
+ recycle_writer.AddRecord(Slice("foooo"));
+ recycle_writer.AddRecord(Slice("bar"));
+ ASSERT_GE(get_reader_contents()->size(), log::kBlockSize * 2);
+ ASSERT_EQ("foooo", Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+INSTANTIATE_TEST_CASE_P(bool, LogTest,
+ ::testing::Values(std::make_tuple(0, false),
+ std::make_tuple(0, true),
+ std::make_tuple(1, false),
+ std::make_tuple(1, true)));
+
+class RetriableLogTest : public ::testing::TestWithParam<int> {
+ private:
+ class ReportCollector : public Reader::Reporter {
+ public:
+ size_t dropped_bytes_;
+ std::string message_;
+
+ ReportCollector() : dropped_bytes_(0) {}
+ void Corruption(size_t bytes, const Status& status) override {
+ dropped_bytes_ += bytes;
+ message_.append(status.ToString());
+ }
+ };
+
+ Slice contents_;
+ std::unique_ptr<WritableFileWriter> dest_holder_;
+ std::unique_ptr<Writer> log_writer_;
+ Env* env_;
+ EnvOptions env_options_;
+ const std::string test_dir_;
+ const std::string log_file_;
+ std::unique_ptr<WritableFileWriter> writer_;
+ std::unique_ptr<SequentialFileReader> reader_;
+ ReportCollector report_;
+ std::unique_ptr<FragmentBufferedReader> log_reader_;
+
+ public:
+ RetriableLogTest()
+ : contents_(),
+ dest_holder_(nullptr),
+ log_writer_(nullptr),
+ env_(Env::Default()),
+ test_dir_(test::PerThreadDBPath("retriable_log_test")),
+ log_file_(test_dir_ + "/log"),
+ writer_(nullptr),
+ reader_(nullptr),
+ log_reader_(nullptr) {}
+
+ Status SetupTestEnv() {
+ dest_holder_.reset(test::GetWritableFileWriter(
+ new test::StringSink(&contents_), "" /* file name */));
+ assert(dest_holder_ != nullptr);
+ log_writer_.reset(new Writer(std::move(dest_holder_), 123, GetParam()));
+ assert(log_writer_ != nullptr);
+
+ Status s;
+ s = env_->CreateDirIfMissing(test_dir_);
+ std::unique_ptr<WritableFile> writable_file;
+ if (s.ok()) {
+ s = env_->NewWritableFile(log_file_, &writable_file, env_options_);
+ }
+ if (s.ok()) {
+ writer_.reset(new WritableFileWriter(
+ NewLegacyWritableFileWrapper(std::move(writable_file)), log_file_,
+ env_options_));
+ assert(writer_ != nullptr);
+ }
+ std::unique_ptr<SequentialFile> seq_file;
+ if (s.ok()) {
+ s = env_->NewSequentialFile(log_file_, &seq_file, env_options_);
+ }
+ if (s.ok()) {
+ reader_.reset(new SequentialFileReader(
+ NewLegacySequentialFileWrapper(seq_file), log_file_));
+ assert(reader_ != nullptr);
+ log_reader_.reset(new FragmentBufferedReader(
+ nullptr, std::move(reader_), &report_, true /* checksum */,
+ 123 /* log_number */));
+ assert(log_reader_ != nullptr);
+ }
+ return s;
+ }
+
+ std::string contents() {
+ auto file = test::GetStringSinkFromLegacyWriter(log_writer_->file());
+ assert(file != nullptr);
+ return file->contents_;
+ }
+
+ void Encode(const std::string& msg) { log_writer_->AddRecord(Slice(msg)); }
+
+ void Write(const Slice& data) {
+ writer_->Append(data);
+ writer_->Sync(true);
+ }
+
+ bool TryRead(std::string* result) {
+ assert(result != nullptr);
+ result->clear();
+ std::string scratch;
+ Slice record;
+ bool r = log_reader_->ReadRecord(&record, &scratch);
+ if (r) {
+ result->assign(record.data(), record.size());
+ return true;
+ } else {
+ return false;
+ }
+ }
+};
+
+TEST_P(RetriableLogTest, TailLog_PartialHeader) {
+ ASSERT_OK(SetupTestEnv());
+ std::vector<int> remaining_bytes_in_last_record;
+ size_t header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
+ bool eof = false;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"RetriableLogTest::TailLog:AfterPart1",
+ "RetriableLogTest::TailLog:BeforeReadRecord"},
+ {"FragmentBufferedLogReader::TryReadMore:FirstEOF",
+ "RetriableLogTest::TailLog:BeforePart2"}});
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "FragmentBufferedLogReader::TryReadMore:FirstEOF",
+ [&](void* /*arg*/) { eof = true; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ size_t delta = header_size - 1;
+ port::Thread log_writer_thread([&]() {
+ size_t old_sz = contents().size();
+ Encode("foo");
+ size_t new_sz = contents().size();
+ std::string part1 = contents().substr(old_sz, delta);
+ std::string part2 =
+ contents().substr(old_sz + delta, new_sz - old_sz - delta);
+ Write(Slice(part1));
+ TEST_SYNC_POINT("RetriableLogTest::TailLog:AfterPart1");
+ TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforePart2");
+ Write(Slice(part2));
+ });
+
+ std::string record;
+ port::Thread log_reader_thread([&]() {
+ TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforeReadRecord");
+ while (!TryRead(&record)) {
+ }
+ });
+ log_reader_thread.join();
+ log_writer_thread.join();
+ ASSERT_EQ("foo", record);
+ ASSERT_TRUE(eof);
+}
+
+TEST_P(RetriableLogTest, TailLog_FullHeader) {
+ ASSERT_OK(SetupTestEnv());
+ std::vector<int> remaining_bytes_in_last_record;
+ size_t header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
+ bool eof = false;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"RetriableLogTest::TailLog:AfterPart1",
+ "RetriableLogTest::TailLog:BeforeReadRecord"},
+ {"FragmentBufferedLogReader::TryReadMore:FirstEOF",
+ "RetriableLogTest::TailLog:BeforePart2"}});
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "FragmentBufferedLogReader::TryReadMore:FirstEOF",
+ [&](void* /*arg*/) { eof = true; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ size_t delta = header_size + 1;
+ port::Thread log_writer_thread([&]() {
+ size_t old_sz = contents().size();
+ Encode("foo");
+ size_t new_sz = contents().size();
+ std::string part1 = contents().substr(old_sz, delta);
+ std::string part2 =
+ contents().substr(old_sz + delta, new_sz - old_sz - delta);
+ Write(Slice(part1));
+ TEST_SYNC_POINT("RetriableLogTest::TailLog:AfterPart1");
+ TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforePart2");
+ Write(Slice(part2));
+ ASSERT_TRUE(eof);
+ });
+
+ std::string record;
+ port::Thread log_reader_thread([&]() {
+ TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforeReadRecord");
+ while (!TryRead(&record)) {
+ }
+ });
+ log_reader_thread.join();
+ log_writer_thread.join();
+ ASSERT_EQ("foo", record);
+}
+
+TEST_P(RetriableLogTest, NonBlockingReadFullRecord) {
+  // Clear all sync point callbacks even if this test does not use sync points.
+  // This is necessary; otherwise the execution of this test may hit a sync
+  // point with which a callback is registered. The registered callback may
+  // access some dead variable, causing a segfault.
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ ASSERT_OK(SetupTestEnv());
+ size_t header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
+ size_t delta = header_size - 1;
+ size_t old_sz = contents().size();
+ Encode("foo-bar");
+ size_t new_sz = contents().size();
+ std::string part1 = contents().substr(old_sz, delta);
+ std::string part2 =
+ contents().substr(old_sz + delta, new_sz - old_sz - delta);
+ Write(Slice(part1));
+ std::string record;
+ ASSERT_FALSE(TryRead(&record));
+ ASSERT_TRUE(record.empty());
+ Write(Slice(part2));
+ ASSERT_TRUE(TryRead(&record));
+ ASSERT_EQ("foo-bar", record);
+}
+
+INSTANTIATE_TEST_CASE_P(bool, RetriableLogTest, ::testing::Values(0, 2));
+
+} // namespace log
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/log_writer.cc b/src/rocksdb/db/log_writer.cc
new file mode 100644
index 000000000..0222ee2a7
--- /dev/null
+++ b/src/rocksdb/db/log_writer.cc
@@ -0,0 +1,162 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_writer.h"
+
+#include <stdint.h>
+#include "file/writable_file_writer.h"
+#include "rocksdb/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace log {
+
+Writer::Writer(std::unique_ptr<WritableFileWriter>&& dest, uint64_t log_number,
+ bool recycle_log_files, bool manual_flush)
+ : dest_(std::move(dest)),
+ block_offset_(0),
+ log_number_(log_number),
+ recycle_log_files_(recycle_log_files),
+ manual_flush_(manual_flush) {
+ for (int i = 0; i <= kMaxRecordType; i++) {
+ char t = static_cast<char>(i);
+ type_crc_[i] = crc32c::Value(&t, 1);
+ }
+}
+
+Writer::~Writer() {
+ if (dest_) {
+ WriteBuffer();
+ }
+}
+
+Status Writer::WriteBuffer() { return dest_->Flush(); }
+
+Status Writer::Close() {
+ Status s;
+ if (dest_) {
+ s = dest_->Close();
+ dest_.reset();
+ }
+ return s;
+}
+
+Status Writer::AddRecord(const Slice& slice) {
+ const char* ptr = slice.data();
+ size_t left = slice.size();
+
+ // Header size varies depending on whether we are recycling or not.
+ const int header_size =
+ recycle_log_files_ ? kRecyclableHeaderSize : kHeaderSize;
+
+ // Fragment the record if necessary and emit it. Note that if slice
+ // is empty, we still want to iterate once to emit a single
+ // zero-length record
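+  //
+  // Illustrative example (assuming the 32KB kBlockSize and the 7-byte legacy
+  // header): a 50000-byte record added at block_offset_ == 0 is emitted as a
+  // kFirstType fragment of 32761 bytes that fills the first block, followed
+  // by a kLastType fragment of 17239 bytes at the start of the next block.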
+ Status s;
+ bool begin = true;
+ do {
+ const int64_t leftover = kBlockSize - block_offset_;
+ assert(leftover >= 0);
+ if (leftover < header_size) {
+ // Switch to a new block
+ if (leftover > 0) {
+ // Fill the trailer (literal below relies on kHeaderSize and
+ // kRecyclableHeaderSize being <= 11)
+ assert(header_size <= 11);
+ s = dest_->Append(Slice("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+ static_cast<size_t>(leftover)));
+ if (!s.ok()) {
+ break;
+ }
+ }
+ block_offset_ = 0;
+ }
+
+ // Invariant: we never leave < header_size bytes in a block.
+ assert(static_cast<int64_t>(kBlockSize - block_offset_) >= header_size);
+
+ const size_t avail = kBlockSize - block_offset_ - header_size;
+ const size_t fragment_length = (left < avail) ? left : avail;
+
+ RecordType type;
+ const bool end = (left == fragment_length);
+ if (begin && end) {
+ type = recycle_log_files_ ? kRecyclableFullType : kFullType;
+ } else if (begin) {
+ type = recycle_log_files_ ? kRecyclableFirstType : kFirstType;
+ } else if (end) {
+ type = recycle_log_files_ ? kRecyclableLastType : kLastType;
+ } else {
+ type = recycle_log_files_ ? kRecyclableMiddleType : kMiddleType;
+ }
+
+ s = EmitPhysicalRecord(type, ptr, fragment_length);
+ ptr += fragment_length;
+ left -= fragment_length;
+ begin = false;
+ } while (s.ok() && left > 0);
+
+ if (s.ok()) {
+ if (!manual_flush_) {
+ s = dest_->Flush();
+ }
+ }
+
+ return s;
+}
+
+bool Writer::TEST_BufferIsEmpty() { return dest_->TEST_BufferIsEmpty(); }
+
+Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
+ assert(n <= 0xffff); // Must fit in two bytes
+
+ size_t header_size;
+ char buf[kRecyclableHeaderSize];
+
+ // Format the header
+ buf[4] = static_cast<char>(n & 0xff);
+ buf[5] = static_cast<char>(n >> 8);
+ buf[6] = static_cast<char>(t);
+
+ uint32_t crc = type_crc_[t];
+ if (t < kRecyclableFullType) {
+ // Legacy record format
+ assert(block_offset_ + kHeaderSize + n <= kBlockSize);
+ header_size = kHeaderSize;
+ } else {
+ // Recyclable record format
+ assert(block_offset_ + kRecyclableHeaderSize + n <= kBlockSize);
+ header_size = kRecyclableHeaderSize;
+
+ // Only encode low 32-bits of the 64-bit log number. This means
+ // we will fail to detect an old record if we recycled a log from
+ // ~4 billion logs ago, but that is effectively impossible, and
+    // even if it were we'd be far more likely to see a false positive
+ // on the 32-bit CRC.
+ EncodeFixed32(buf + 7, static_cast<uint32_t>(log_number_));
+ crc = crc32c::Extend(crc, buf + 7, 4);
+ }
+
+ // Compute the crc of the record type and the payload.
+ crc = crc32c::Extend(crc, ptr, n);
+ crc = crc32c::Mask(crc); // Adjust for storage
+ EncodeFixed32(buf, crc);
+
+ // Write the header and the payload
+ Status s = dest_->Append(Slice(buf, header_size));
+ if (s.ok()) {
+ s = dest_->Append(Slice(ptr, n));
+ }
+ block_offset_ += header_size + n;
+ return s;
+}
+
+} // namespace log
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/log_writer.h b/src/rocksdb/db/log_writer.h
new file mode 100644
index 000000000..a7f952edd
--- /dev/null
+++ b/src/rocksdb/db/log_writer.h
@@ -0,0 +1,114 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <stdint.h>
+
+#include <memory>
+
+#include "db/log_format.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WritableFileWriter;
+
+namespace log {
+
+/**
+ * Writer is a general purpose log stream writer. It provides an append-only
+ * abstraction for writing data. The details of how the data is written are
+ * handled by the WritableFile sub-class implementation.
+ *
+ * File format:
+ *
+ * File is broken down into variable sized records. The format of each record
+ * is described below.
+ * +-----+-------------+--+----+----------+------+-- ... ----+
+ * File | r0 | r1 |P | r2 | r3 | r4 | |
+ * +-----+-------------+--+----+----------+------+-- ... ----+
+ * <--- kBlockSize ------>|<-- kBlockSize ------>|
+ * rn = variable size records
+ * P = Padding
+ *
+ * Data is written out in kBlockSize chunks. If the next record does not fit
+ * into the space left, the leftover space will be padded with \0.
+ *
+ * Legacy record format:
+ *
+ * +---------+-----------+-----------+--- ... ---+
+ * |CRC (4B) | Size (2B) | Type (1B) | Payload |
+ * +---------+-----------+-----------+--- ... ---+
+ *
+ * CRC = 32bit hash computed over the record type and payload using CRC
+ * Size = Length of the payload data
+ * Type = Type of record
+ * (kZeroType, kFullType, kFirstType, kLastType, kMiddleType )
+ * The type is used to group a bunch of records together to represent
+ * blocks that are larger than kBlockSize
+ * Payload = Byte stream as long as specified by the payload size
+ *
+ * Recyclable record format:
+ *
+ * +---------+-----------+-----------+----------------+--- ... ---+
+ * |CRC (4B) | Size (2B) | Type (1B) | Log number (4B)| Payload |
+ * +---------+-----------+-----------+----------------+--- ... ---+
+ *
+ * Same as above, with the addition of
+ * Log number = 32bit log file number, so that we can distinguish between
+ * records written by the most recent log writer vs a previous one.
+ */
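+// Usage sketch (illustrative; the locals below are hypothetical and the file
+// writer is assumed to be opened elsewhere):
+//
+//   std::unique_ptr<WritableFileWriter> file = ...;
+//   log::Writer writer(std::move(file), 1 /* log_number */,
+//                      false /* recycle_log_files */);
+//   Status s = writer.AddRecord(Slice("payload"));
+//   if (s.ok()) {
+//     s = writer.Close();
+//   }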
+class Writer {
+ public:
+ // Create a writer that will append data to "*dest".
+ // "*dest" must be initially empty.
+ // "*dest" must remain live while this Writer is in use.
+ explicit Writer(std::unique_ptr<WritableFileWriter>&& dest,
+ uint64_t log_number, bool recycle_log_files,
+ bool manual_flush = false);
+ // No copying allowed
+ Writer(const Writer&) = delete;
+ void operator=(const Writer&) = delete;
+
+ ~Writer();
+
+ Status AddRecord(const Slice& slice);
+
+ WritableFileWriter* file() { return dest_.get(); }
+ const WritableFileWriter* file() const { return dest_.get(); }
+
+ uint64_t get_log_number() const { return log_number_; }
+
+ Status WriteBuffer();
+
+ Status Close();
+
+ bool TEST_BufferIsEmpty();
+
+ private:
+ std::unique_ptr<WritableFileWriter> dest_;
+ size_t block_offset_; // Current offset in block
+ uint64_t log_number_;
+ bool recycle_log_files_;
+
+ // crc32c values for all supported record types. These are
+ // pre-computed to reduce the overhead of computing the crc of the
+ // record type stored in the header.
+ uint32_t type_crc_[kMaxRecordType + 1];
+
+ Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length);
+
+  // If true, it does not flush after each write. Instead it relies on the
+  // upper layer to manually flush by calling ::WriteBuffer().
+ bool manual_flush_;
+};
+
+} // namespace log
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/logs_with_prep_tracker.cc b/src/rocksdb/db/logs_with_prep_tracker.cc
new file mode 100644
index 000000000..ff98155c4
--- /dev/null
+++ b/src/rocksdb/db/logs_with_prep_tracker.cc
@@ -0,0 +1,67 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/logs_with_prep_tracker.h"
+
+#include "port/likely.h"
+
+namespace ROCKSDB_NAMESPACE {
+void LogsWithPrepTracker::MarkLogAsHavingPrepSectionFlushed(uint64_t log) {
+ assert(log != 0);
+ std::lock_guard<std::mutex> lock(prepared_section_completed_mutex_);
+ auto it = prepared_section_completed_.find(log);
+ if (UNLIKELY(it == prepared_section_completed_.end())) {
+ prepared_section_completed_[log] = 1;
+ } else {
+ it->second += 1;
+ }
+}
+
+void LogsWithPrepTracker::MarkLogAsContainingPrepSection(uint64_t log) {
+ assert(log != 0);
+ std::lock_guard<std::mutex> lock(logs_with_prep_mutex_);
+
+ auto rit = logs_with_prep_.rbegin();
+ bool updated = false;
+ // Most probably the last log is the one that is being marked for
+ // having a prepare section; so search from the end.
+ for (; rit != logs_with_prep_.rend() && rit->log >= log; ++rit) {
+ if (rit->log == log) {
+ rit->cnt++;
+ updated = true;
+ break;
+ }
+ }
+ if (!updated) {
+ // We are either at the start, or at a position with rit->log < log
+ logs_with_prep_.insert(rit.base(), {log, 1});
+ }
+}
+
+uint64_t LogsWithPrepTracker::FindMinLogContainingOutstandingPrep() {
+ std::lock_guard<std::mutex> lock(logs_with_prep_mutex_);
+ auto it = logs_with_prep_.begin();
+ // start with the smallest log
+ for (; it != logs_with_prep_.end();) {
+ auto min_log = it->log;
+ {
+ std::lock_guard<std::mutex> lock2(prepared_section_completed_mutex_);
+ auto completed_it = prepared_section_completed_.find(min_log);
+ if (completed_it == prepared_section_completed_.end() ||
+ completed_it->second < it->cnt) {
+ return min_log;
+ }
+ assert(completed_it != prepared_section_completed_.end() &&
+ completed_it->second == it->cnt);
+ prepared_section_completed_.erase(completed_it);
+ }
+    // Erasing from the beginning of a vector is not efficient, but this
+    // function is not on the fast path.
+ it = logs_with_prep_.erase(it);
+ }
+ // no such log found
+ return 0;
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/logs_with_prep_tracker.h b/src/rocksdb/db/logs_with_prep_tracker.h
new file mode 100644
index 000000000..86c88012a
--- /dev/null
+++ b/src/rocksdb/db/logs_with_prep_tracker.h
@@ -0,0 +1,63 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <stdint.h>
+#include <cassert>
+#include <cstdlib>
+#include <mutex>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This class is used to track the log files with outstanding prepare entries.
+class LogsWithPrepTracker {
+ public:
+ // Called when a transaction prepared in `log` has been committed or aborted.
+ void MarkLogAsHavingPrepSectionFlushed(uint64_t log);
+ // Called when a transaction is prepared in `log`.
+ void MarkLogAsContainingPrepSection(uint64_t log);
+ // Return the earliest log file with outstanding prepare entries.
+ uint64_t FindMinLogContainingOutstandingPrep();
+ size_t TEST_PreparedSectionCompletedSize() {
+ return prepared_section_completed_.size();
+ }
+ size_t TEST_LogsWithPrepSize() { return logs_with_prep_.size(); }
+
+ private:
+ // REQUIRES: logs_with_prep_mutex_ held
+ //
+ // sorted list of log numbers still containing prepared data.
+ // this is used by FindObsoleteFiles to determine which
+ // flushed logs we must keep around because they still
+ // contain prepared data which has not been committed or rolled back
+ struct LogCnt {
+ uint64_t log; // the log number
+ uint64_t cnt; // number of prepared sections in the log
+ };
+ std::vector<LogCnt> logs_with_prep_;
+ std::mutex logs_with_prep_mutex_;
+
+ // REQUIRES: prepared_section_completed_mutex_ held
+ //
+ // to be used in conjunction with logs_with_prep_.
+  // once a transaction with data in log L is committed or rolled back,
+  // rather than updating logs_with_prep_ directly, we keep track of that
+  // in prepared_section_completed_, which maps LOG -> instance_count. This
+  // helps avoid contention between a commit thread and the prepare threads.
+ //
+  // when trying to determine the minimum log still active we first
+  // consult logs_with_prep_. While the count of its smallest entry maps to
+  // an equal value in prepared_section_completed_, we erase the log from
+  // both logs_with_prep_ and prepared_section_completed_.
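+  //
+  // Illustrative example: two transactions prepare in log 5, so
+  // logs_with_prep_ holds {log=5, cnt=2}. After one of them commits,
+  // prepared_section_completed_[5] == 1 and
+  // FindMinLogContainingOutstandingPrep() still returns 5. Once the second
+  // one is flushed (the count reaches 2), the next call erases both entries
+  // and moves on to the next log, or returns 0 if none is left.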
+ std::unordered_map<uint64_t, uint64_t> prepared_section_completed_;
+ std::mutex prepared_section_completed_mutex_;
+
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/lookup_key.h b/src/rocksdb/db/lookup_key.h
new file mode 100644
index 000000000..51e5daed1
--- /dev/null
+++ b/src/rocksdb/db/lookup_key.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <string>
+#include <utility>
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A helper class useful for DBImpl::Get()
+class LookupKey {
+ public:
+ // Initialize *this for looking up user_key at a snapshot with
+ // the specified sequence number.
+ LookupKey(const Slice& _user_key, SequenceNumber sequence,
+ const Slice* ts = nullptr);
+
+ ~LookupKey();
+
+ // Return a key suitable for lookup in a MemTable.
+ Slice memtable_key() const {
+ return Slice(start_, static_cast<size_t>(end_ - start_));
+ }
+
+ // Return an internal key (suitable for passing to an internal iterator)
+ Slice internal_key() const {
+ return Slice(kstart_, static_cast<size_t>(end_ - kstart_));
+ }
+
+ // Return the user key
+ Slice user_key() const {
+ return Slice(kstart_, static_cast<size_t>(end_ - kstart_ - 8));
+ }
+
+ private:
+ // We construct a char array of the form:
+ // klength varint32 <-- start_
+ // userkey char[klength] <-- kstart_
+ // tag uint64
+ // <-- end_
+ // The array is a suitable MemTable key.
+ // The suffix starting with "userkey" can be used as an InternalKey.
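+  //
+  // Illustrative example: for the 3-byte user key "abc", klength is
+  // 3 + 8 = 11, so the buffer holds varint32(11) (one byte), "abc", and the
+  // 8-byte (sequence << 8 | type) tag; memtable_key() then spans 12 bytes,
+  // internal_key() 11 bytes and user_key() 3 bytes.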
+ const char* start_;
+ const char* kstart_;
+ const char* end_;
+ char space_[200]; // Avoid allocation for short keys
+
+ // No copying allowed
+ LookupKey(const LookupKey&);
+ void operator=(const LookupKey&);
+};
+
+inline LookupKey::~LookupKey() {
+ if (start_ != space_) delete[] start_;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/malloc_stats.cc b/src/rocksdb/db/malloc_stats.cc
new file mode 100644
index 000000000..12824e516
--- /dev/null
+++ b/src/rocksdb/db/malloc_stats.cc
@@ -0,0 +1,54 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/malloc_stats.h"
+
+#ifndef ROCKSDB_LITE
+#include <memory>
+#include <string.h>
+
+#include "port/jemalloc_helper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifdef ROCKSDB_JEMALLOC
+
+typedef struct {
+ char* cur;
+ char* end;
+} MallocStatus;
+
+static void GetJemallocStatus(void* mstat_arg, const char* status) {
+ MallocStatus* mstat = reinterpret_cast<MallocStatus*>(mstat_arg);
+ size_t status_len = status ? strlen(status) : 0;
+ size_t buf_size = (size_t)(mstat->end - mstat->cur);
+ if (!status_len || status_len > buf_size) {
+ return;
+ }
+
+ snprintf(mstat->cur, buf_size, "%s", status);
+ mstat->cur += status_len;
+}
+void DumpMallocStats(std::string* stats) {
+ if (!HasJemalloc()) {
+ return;
+ }
+ MallocStatus mstat;
+ const unsigned int kMallocStatusLen = 1000000;
+ std::unique_ptr<char[]> buf{new char[kMallocStatusLen + 1]};
+ mstat.cur = buf.get();
+ mstat.end = buf.get() + kMallocStatusLen;
+ malloc_stats_print(GetJemallocStatus, &mstat, "");
+ stats->append(buf.get());
+}
+#else
+void DumpMallocStats(std::string*) {}
+#endif // ROCKSDB_JEMALLOC
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/malloc_stats.h b/src/rocksdb/db/malloc_stats.h
new file mode 100644
index 000000000..18aff3ad0
--- /dev/null
+++ b/src/rocksdb/db/malloc_stats.h
@@ -0,0 +1,24 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void DumpMallocStats(std::string*);
+
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/manual_compaction_test.cc b/src/rocksdb/db/manual_compaction_test.cc
new file mode 100644
index 000000000..22cd919b5
--- /dev/null
+++ b/src/rocksdb/db/manual_compaction_test.cc
@@ -0,0 +1,160 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Test for issue 178: a manual compaction causes deleted data to reappear.
+#include <iostream>
+#include <sstream>
+#include <cstdlib>
+
+#include "port/port.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/write_batch.h"
+#include "test_util/testharness.h"
+
+using namespace ROCKSDB_NAMESPACE;
+
+namespace {
+
+// Reasoning: previously the number was 1100000. Since the keys are written to
+// the batch in one write, each write will result in one SST file. We reduced
+// the write_buffer_size to 1K to basically have the same effect with far
+// fewer keys, which results in a shorter test runtime.
+const int kNumKeys = 1100;
+
+std::string Key1(int i) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "my_key_%d", i);
+ return buf;
+}
+
+std::string Key2(int i) {
+ return Key1(i) + "_xxx";
+}
+
+class ManualCompactionTest : public testing::Test {
+ public:
+ ManualCompactionTest() {
+ // Get rid of any state from an old run.
+ dbname_ = ROCKSDB_NAMESPACE::test::PerThreadDBPath("rocksdb_cbug_test");
+ DestroyDB(dbname_, ROCKSDB_NAMESPACE::Options());
+ }
+
+ std::string dbname_;
+};
+
+class DestroyAllCompactionFilter : public CompactionFilter {
+ public:
+ DestroyAllCompactionFilter() {}
+
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& existing_value,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ return existing_value.ToString() == "destroy";
+ }
+
+ const char* Name() const override { return "DestroyAllCompactionFilter"; }
+};
+
+TEST_F(ManualCompactionTest, CompactTouchesAllKeys) {
+ for (int iter = 0; iter < 2; ++iter) {
+ DB* db;
+ Options options;
+ if (iter == 0) { // level compaction
+ options.num_levels = 3;
+ options.compaction_style = kCompactionStyleLevel;
+ } else { // universal compaction
+ options.compaction_style = kCompactionStyleUniversal;
+ }
+ options.create_if_missing = true;
+ options.compression = ROCKSDB_NAMESPACE::kNoCompression;
+ options.compaction_filter = new DestroyAllCompactionFilter();
+ ASSERT_OK(DB::Open(options, dbname_, &db));
+
+ db->Put(WriteOptions(), Slice("key1"), Slice("destroy"));
+ db->Put(WriteOptions(), Slice("key2"), Slice("destroy"));
+ db->Put(WriteOptions(), Slice("key3"), Slice("value3"));
+ db->Put(WriteOptions(), Slice("key4"), Slice("destroy"));
+
+ Slice key4("key4");
+ db->CompactRange(CompactRangeOptions(), nullptr, &key4);
+ Iterator* itr = db->NewIterator(ReadOptions());
+ itr->SeekToFirst();
+ ASSERT_TRUE(itr->Valid());
+ ASSERT_EQ("key3", itr->key().ToString());
+ itr->Next();
+ ASSERT_TRUE(!itr->Valid());
+ delete itr;
+
+ delete options.compaction_filter;
+ delete db;
+ DestroyDB(dbname_, options);
+ }
+}
+
+TEST_F(ManualCompactionTest, Test) {
+ // Open database. Disable compression since it affects the creation
+ // of layers and the code below is trying to test against a very
+ // specific scenario.
+ ROCKSDB_NAMESPACE::DB* db;
+ ROCKSDB_NAMESPACE::Options db_options;
+ db_options.write_buffer_size = 1024;
+ db_options.create_if_missing = true;
+ db_options.compression = ROCKSDB_NAMESPACE::kNoCompression;
+ ASSERT_OK(ROCKSDB_NAMESPACE::DB::Open(db_options, dbname_, &db));
+
+ // create first key range
+ ROCKSDB_NAMESPACE::WriteBatch batch;
+ for (int i = 0; i < kNumKeys; i++) {
+ batch.Put(Key1(i), "value for range 1 key");
+ }
+ ASSERT_OK(db->Write(ROCKSDB_NAMESPACE::WriteOptions(), &batch));
+
+ // create second key range
+ batch.Clear();
+ for (int i = 0; i < kNumKeys; i++) {
+ batch.Put(Key2(i), "value for range 2 key");
+ }
+ ASSERT_OK(db->Write(ROCKSDB_NAMESPACE::WriteOptions(), &batch));
+
+ // delete second key range
+ batch.Clear();
+ for (int i = 0; i < kNumKeys; i++) {
+ batch.Delete(Key2(i));
+ }
+ ASSERT_OK(db->Write(ROCKSDB_NAMESPACE::WriteOptions(), &batch));
+
+ // compact database
+ std::string start_key = Key1(0);
+ std::string end_key = Key1(kNumKeys - 1);
+ ROCKSDB_NAMESPACE::Slice least(start_key.data(), start_key.size());
+ ROCKSDB_NAMESPACE::Slice greatest(end_key.data(), end_key.size());
+
+ // commenting out the line below causes the example to work correctly
+ db->CompactRange(CompactRangeOptions(), &least, &greatest);
+
+ // count the keys
+ ROCKSDB_NAMESPACE::Iterator* iter =
+ db->NewIterator(ROCKSDB_NAMESPACE::ReadOptions());
+ int num_keys = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ num_keys++;
+ }
+ delete iter;
+ ASSERT_EQ(kNumKeys, num_keys) << "Bad number of keys";
+
+ // close database
+ delete db;
+ DestroyDB(dbname_, ROCKSDB_NAMESPACE::Options());
+}
+
+} // anonymous namespace
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/memtable.cc b/src/rocksdb/db/memtable.cc
new file mode 100644
index 000000000..45483ea09
--- /dev/null
+++ b/src/rocksdb/db/memtable.cc
@@ -0,0 +1,1122 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/memtable.h"
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <memory>
+#include "db/dbformat.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/read_callback.h"
+#include "memory/arena.h"
+#include "memory/memory_usage.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/statistics.h"
+#include "port/port.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+#include "table/merging_iterator.h"
+#include "util/autovector.h"
+#include "util/coding.h"
+#include "util/mutexlock.h"
+#include "util/util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ImmutableMemTableOptions::ImmutableMemTableOptions(
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options)
+ : arena_block_size(mutable_cf_options.arena_block_size),
+ memtable_prefix_bloom_bits(
+ static_cast<uint32_t>(
+ static_cast<double>(mutable_cf_options.write_buffer_size) *
+ mutable_cf_options.memtable_prefix_bloom_size_ratio) *
+ 8u),
+ memtable_huge_page_size(mutable_cf_options.memtable_huge_page_size),
+ memtable_whole_key_filtering(
+ mutable_cf_options.memtable_whole_key_filtering),
+ inplace_update_support(ioptions.inplace_update_support),
+ inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks),
+ inplace_callback(ioptions.inplace_callback),
+ max_successive_merges(mutable_cf_options.max_successive_merges),
+ statistics(ioptions.statistics),
+ merge_operator(ioptions.merge_operator),
+ info_log(ioptions.info_log) {}
+
+MemTable::MemTable(const InternalKeyComparator& cmp,
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ WriteBufferManager* write_buffer_manager,
+ SequenceNumber latest_seq, uint32_t column_family_id)
+ : comparator_(cmp),
+ moptions_(ioptions, mutable_cf_options),
+ refs_(0),
+ kArenaBlockSize(OptimizeBlockSize(moptions_.arena_block_size)),
+ mem_tracker_(write_buffer_manager),
+ arena_(moptions_.arena_block_size,
+ (write_buffer_manager != nullptr &&
+ (write_buffer_manager->enabled() ||
+ write_buffer_manager->cost_to_cache()))
+ ? &mem_tracker_
+ : nullptr,
+ mutable_cf_options.memtable_huge_page_size),
+ table_(ioptions.memtable_factory->CreateMemTableRep(
+ comparator_, &arena_, mutable_cf_options.prefix_extractor.get(),
+ ioptions.info_log, column_family_id)),
+ range_del_table_(SkipListFactory().CreateMemTableRep(
+ comparator_, &arena_, nullptr /* transform */, ioptions.info_log,
+ column_family_id)),
+ is_range_del_table_empty_(true),
+ data_size_(0),
+ num_entries_(0),
+ num_deletes_(0),
+ write_buffer_size_(mutable_cf_options.write_buffer_size),
+ flush_in_progress_(false),
+ flush_completed_(false),
+ file_number_(0),
+ first_seqno_(0),
+ earliest_seqno_(latest_seq),
+ creation_seq_(latest_seq),
+ mem_next_logfile_number_(0),
+ min_prep_log_referenced_(0),
+ locks_(moptions_.inplace_update_support
+ ? moptions_.inplace_update_num_locks
+ : 0),
+ prefix_extractor_(mutable_cf_options.prefix_extractor.get()),
+ flush_state_(FLUSH_NOT_REQUESTED),
+ env_(ioptions.env),
+ insert_with_hint_prefix_extractor_(
+ ioptions.memtable_insert_with_hint_prefix_extractor),
+ oldest_key_time_(std::numeric_limits<uint64_t>::max()),
+ atomic_flush_seqno_(kMaxSequenceNumber),
+ approximate_memory_usage_(0) {
+ UpdateFlushState();
+ // something went wrong if we need to flush before inserting anything
+ assert(!ShouldScheduleFlush());
+
+ // use bloom_filter_ for both whole key and prefix bloom filter
+ if ((prefix_extractor_ || moptions_.memtable_whole_key_filtering) &&
+ moptions_.memtable_prefix_bloom_bits > 0) {
+ bloom_filter_.reset(
+ new DynamicBloom(&arena_, moptions_.memtable_prefix_bloom_bits,
+ 6 /* hard coded 6 probes */,
+ moptions_.memtable_huge_page_size, ioptions.info_log));
+ }
+}
+
+MemTable::~MemTable() {
+ mem_tracker_.FreeMem();
+ assert(refs_ == 0);
+}
+
+size_t MemTable::ApproximateMemoryUsage() {
+ autovector<size_t> usages = {
+ arena_.ApproximateMemoryUsage(), table_->ApproximateMemoryUsage(),
+ range_del_table_->ApproximateMemoryUsage(),
+ ROCKSDB_NAMESPACE::ApproximateMemoryUsage(insert_hints_)};
+ size_t total_usage = 0;
+ for (size_t usage : usages) {
+    // If usage + total_usage >= kMaxSizet, return kMaxSizet.
+    // The comparison below is written this way to avoid numeric overflow.
+ if (usage >= port::kMaxSizet - total_usage) {
+ return port::kMaxSizet;
+ }
+ total_usage += usage;
+ }
+ approximate_memory_usage_.store(total_usage, std::memory_order_relaxed);
+ // otherwise, return the actual usage
+ return total_usage;
+}
+
+bool MemTable::ShouldFlushNow() {
+ size_t write_buffer_size = write_buffer_size_.load(std::memory_order_relaxed);
+  // Often we cannot allocate arena blocks that exactly match the buffer size.
+  // Thus we have to decide whether we should over-allocate or under-allocate.
+  // This constant can be interpreted as: if we still have more than
+  // "kAllowOverAllocationRatio * kArenaBlockSize" space left, we'd try to
+  // over-allocate one more block.
+ const double kAllowOverAllocationRatio = 0.6;
+
+  // If the arena still has room for a new block allocation, we can safely say
+  // it shouldn't flush.
+ auto allocated_memory = table_->ApproximateMemoryUsage() +
+ range_del_table_->ApproximateMemoryUsage() +
+ arena_.MemoryAllocatedBytes();
+
+ approximate_memory_usage_.store(allocated_memory, std::memory_order_relaxed);
+
+ // if we can still allocate one more block without exceeding the
+ // over-allocation ratio, then we should not flush.
+ if (allocated_memory + kArenaBlockSize <
+ write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
+ return false;
+ }
+
+  // if the user keeps adding entries that exceed write_buffer_size, we need to
+ // flush earlier even though we still have much available memory left.
+ if (allocated_memory >
+ write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
+ return true;
+ }
+
+  // In this code path, Arena has already allocated its "last block", which
+  // means the total allocated memory size is either:
+  //  (1) "moderately" over-allocated (no more than `0.6 * arena block
+  //  size`). Or,
+ // (2) the allocated memory is less than write buffer size, but we'll stop
+ // here since if we allocate a new arena block, we'll over allocate too much
+ // more (half of the arena block size) memory.
+ //
+ // In either case, to avoid over-allocate, the last block will stop allocation
+ // when its usage reaches a certain ratio, which we carefully choose "0.75
+ // full" as the stop condition because it addresses the following issue with
+ // great simplicity: What if the next inserted entry's size is
+ // bigger than AllocatedAndUnused()?
+ //
+ // The answer is: if the entry size is also bigger than 0.25 *
+ // kArenaBlockSize, a dedicated block will be allocated for it; otherwise
+ // arena will anyway skip the AllocatedAndUnused() and allocate a new, empty
+ // and regular block. In either case, we *overly* over-allocated.
+ //
+ // Therefore, setting the last block to be at most "0.75 full" avoids both
+ // cases.
+ //
+  // NOTE: the average percentage of wasted space of this approach can be
+  // computed as: "arena block size * 0.25 / write buffer size". Users who
+  // specify a small write buffer size and/or big arena block size may suffer.
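+  //
+  // Illustrative numbers: with a 64MB write buffer and an 8MB arena block,
+  // the checks above keep allocating while allocated_memory + 8MB is below
+  // 64MB + 4.8MB, flush immediately once allocated_memory exceeds 68.8MB,
+  // and otherwise flush only when the last block has less than 2MB
+  // (kArenaBlockSize / 4) left unused.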
+ return arena_.AllocatedAndUnused() < kArenaBlockSize / 4;
+}
+
+void MemTable::UpdateFlushState() {
+ auto state = flush_state_.load(std::memory_order_relaxed);
+ if (state == FLUSH_NOT_REQUESTED && ShouldFlushNow()) {
+ // ignore CAS failure, because that means somebody else requested
+ // a flush
+ flush_state_.compare_exchange_strong(state, FLUSH_REQUESTED,
+ std::memory_order_relaxed,
+ std::memory_order_relaxed);
+ }
+}
+
+void MemTable::UpdateOldestKeyTime() {
+ uint64_t oldest_key_time = oldest_key_time_.load(std::memory_order_relaxed);
+ if (oldest_key_time == std::numeric_limits<uint64_t>::max()) {
+ int64_t current_time = 0;
+ auto s = env_->GetCurrentTime(&current_time);
+ if (s.ok()) {
+ assert(current_time >= 0);
+ // If fail, the timestamp is already set.
+ oldest_key_time_.compare_exchange_strong(
+ oldest_key_time, static_cast<uint64_t>(current_time),
+ std::memory_order_relaxed, std::memory_order_relaxed);
+ }
+ }
+}
+
+int MemTable::KeyComparator::operator()(const char* prefix_len_key1,
+ const char* prefix_len_key2) const {
+ // Internal keys are encoded as length-prefixed strings.
+ Slice k1 = GetLengthPrefixedSlice(prefix_len_key1);
+ Slice k2 = GetLengthPrefixedSlice(prefix_len_key2);
+ return comparator.CompareKeySeq(k1, k2);
+}
+
+int MemTable::KeyComparator::operator()(const char* prefix_len_key,
+ const KeyComparator::DecodedType& key)
+ const {
+ // Internal keys are encoded as length-prefixed strings.
+ Slice a = GetLengthPrefixedSlice(prefix_len_key);
+ return comparator.CompareKeySeq(a, key);
+}
+
+void MemTableRep::InsertConcurrently(KeyHandle /*handle*/) {
+#ifndef ROCKSDB_LITE
+ throw std::runtime_error("concurrent insert not supported");
+#else
+ abort();
+#endif
+}
+
+Slice MemTableRep::UserKey(const char* key) const {
+ Slice slice = GetLengthPrefixedSlice(key);
+ return Slice(slice.data(), slice.size() - 8);
+}
+
+KeyHandle MemTableRep::Allocate(const size_t len, char** buf) {
+ *buf = allocator_->Allocate(len);
+ return static_cast<KeyHandle>(*buf);
+}
+
+// Encode a suitable internal key target for "target" and return it.
+// Uses *scratch as scratch space, and the returned pointer will point
+// into this scratch space.
+const char* EncodeKey(std::string* scratch, const Slice& target) {
+ scratch->clear();
+ PutVarint32(scratch, static_cast<uint32_t>(target.size()));
+ scratch->append(target.data(), target.size());
+ return scratch->data();
+}
+
+class MemTableIterator : public InternalIterator {
+ public:
+ MemTableIterator(const MemTable& mem, const ReadOptions& read_options,
+ Arena* arena, bool use_range_del_table = false)
+ : bloom_(nullptr),
+ prefix_extractor_(mem.prefix_extractor_),
+ comparator_(mem.comparator_),
+ valid_(false),
+ arena_mode_(arena != nullptr),
+ value_pinned_(
+ !mem.GetImmutableMemTableOptions()->inplace_update_support) {
+ if (use_range_del_table) {
+ iter_ = mem.range_del_table_->GetIterator(arena);
+ } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek &&
+ !read_options.auto_prefix_mode) {
+ // Auto prefix mode is not implemented in memtable yet.
+ bloom_ = mem.bloom_filter_.get();
+ iter_ = mem.table_->GetDynamicPrefixIterator(arena);
+ } else {
+ iter_ = mem.table_->GetIterator(arena);
+ }
+ }
+ // No copying allowed
+ MemTableIterator(const MemTableIterator&) = delete;
+ void operator=(const MemTableIterator&) = delete;
+
+ ~MemTableIterator() override {
+#ifndef NDEBUG
+ // Assert that the MemTableIterator is never deleted while
+ // Pinning is Enabled.
+ assert(!pinned_iters_mgr_ || !pinned_iters_mgr_->PinningEnabled());
+#endif
+ if (arena_mode_) {
+ iter_->~Iterator();
+ } else {
+ delete iter_;
+ }
+ }
+
+#ifndef NDEBUG
+ void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+ pinned_iters_mgr_ = pinned_iters_mgr;
+ }
+ PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
+#endif
+
+ bool Valid() const override { return valid_; }
+ void Seek(const Slice& k) override {
+ PERF_TIMER_GUARD(seek_on_memtable_time);
+ PERF_COUNTER_ADD(seek_on_memtable_count, 1);
+ if (bloom_) {
+ // iterator should only use prefix bloom filter
+ Slice user_k(ExtractUserKey(k));
+ if (prefix_extractor_->InDomain(user_k) &&
+ !bloom_->MayContain(prefix_extractor_->Transform(user_k))) {
+ PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
+ valid_ = false;
+ return;
+ } else {
+ PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
+ }
+ }
+ iter_->Seek(k, nullptr);
+ valid_ = iter_->Valid();
+ }
+ void SeekForPrev(const Slice& k) override {
+ PERF_TIMER_GUARD(seek_on_memtable_time);
+ PERF_COUNTER_ADD(seek_on_memtable_count, 1);
+ if (bloom_) {
+ Slice user_k(ExtractUserKey(k));
+ if (prefix_extractor_->InDomain(user_k) &&
+ !bloom_->MayContain(prefix_extractor_->Transform(user_k))) {
+ PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
+ valid_ = false;
+ return;
+ } else {
+ PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
+ }
+ }
+ iter_->Seek(k, nullptr);
+ valid_ = iter_->Valid();
+ if (!Valid()) {
+ SeekToLast();
+ }
+ while (Valid() && comparator_.comparator.Compare(k, key()) < 0) {
+ Prev();
+ }
+ }
+ void SeekToFirst() override {
+ iter_->SeekToFirst();
+ valid_ = iter_->Valid();
+ }
+ void SeekToLast() override {
+ iter_->SeekToLast();
+ valid_ = iter_->Valid();
+ }
+ void Next() override {
+ PERF_COUNTER_ADD(next_on_memtable_count, 1);
+ assert(Valid());
+ iter_->Next();
+ valid_ = iter_->Valid();
+ }
+ void Prev() override {
+ PERF_COUNTER_ADD(prev_on_memtable_count, 1);
+ assert(Valid());
+ iter_->Prev();
+ valid_ = iter_->Valid();
+ }
+ Slice key() const override {
+ assert(Valid());
+ return GetLengthPrefixedSlice(iter_->key());
+ }
+ Slice value() const override {
+ assert(Valid());
+ Slice key_slice = GetLengthPrefixedSlice(iter_->key());
+ return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
+ }
+
+ Status status() const override { return Status::OK(); }
+
+ bool IsKeyPinned() const override {
+ // memtable data is always pinned
+ return true;
+ }
+
+ bool IsValuePinned() const override {
+ // memtable value is always pinned, except if we allow inplace update.
+ return value_pinned_;
+ }
+
+ private:
+ DynamicBloom* bloom_;
+ const SliceTransform* const prefix_extractor_;
+ const MemTable::KeyComparator comparator_;
+ MemTableRep::Iterator* iter_;
+ bool valid_;
+ bool arena_mode_;
+ bool value_pinned_;
+};
+
+InternalIterator* MemTable::NewIterator(const ReadOptions& read_options,
+ Arena* arena) {
+ assert(arena != nullptr);
+ auto mem = arena->AllocateAligned(sizeof(MemTableIterator));
+ return new (mem) MemTableIterator(*this, read_options, arena);
+}
+
+FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator(
+ const ReadOptions& read_options, SequenceNumber read_seq) {
+ if (read_options.ignore_range_deletions ||
+ is_range_del_table_empty_.load(std::memory_order_relaxed)) {
+ return nullptr;
+ }
+ auto* unfragmented_iter = new MemTableIterator(
+ *this, read_options, nullptr /* arena */, true /* use_range_del_table */);
+ if (unfragmented_iter == nullptr) {
+ return nullptr;
+ }
+ auto fragmented_tombstone_list =
+ std::make_shared<FragmentedRangeTombstoneList>(
+ std::unique_ptr<InternalIterator>(unfragmented_iter),
+ comparator_.comparator);
+
+ auto* fragmented_iter = new FragmentedRangeTombstoneIterator(
+ fragmented_tombstone_list, comparator_.comparator, read_seq);
+ return fragmented_iter;
+}
+
+port::RWMutex* MemTable::GetLock(const Slice& key) {
+ return &locks_[fastrange64(GetSliceNPHash64(key), locks_.size())];
+}
+
+MemTable::MemTableStats MemTable::ApproximateStats(const Slice& start_ikey,
+ const Slice& end_ikey) {
+ uint64_t entry_count = table_->ApproximateNumEntries(start_ikey, end_ikey);
+ entry_count += range_del_table_->ApproximateNumEntries(start_ikey, end_ikey);
+ if (entry_count == 0) {
+ return {0, 0};
+ }
+ uint64_t n = num_entries_.load(std::memory_order_relaxed);
+ if (n == 0) {
+ return {0, 0};
+ }
+ if (entry_count > n) {
+    // (range_del_)table_->ApproximateNumEntries() is just an estimate, so it
+    // can be larger than the number of entries we actually have. Cap it to the
+    // number of entries we have to limit the inaccuracy.
+ entry_count = n;
+ }
+ uint64_t data_size = data_size_.load(std::memory_order_relaxed);
+ return {entry_count * (data_size / n), entry_count};
+}
+
+bool MemTable::Add(SequenceNumber s, ValueType type,
+ const Slice& key, /* user key */
+ const Slice& value, bool allow_concurrent,
+ MemTablePostProcessInfo* post_process_info, void** hint) {
+ // Format of an entry is concatenation of:
+ // key_size : varint32 of internal_key.size()
+ // key bytes : char[internal_key.size()]
+ // value_size : varint32 of value.size()
+ // value bytes : char[value.size()]
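+  //
+  // Illustrative example: for user key "k1" and value "v", the internal key
+  // is 2 + 8 = 10 bytes, so encoded_len is
+  // varint32(10) + 10 key bytes + varint32(1) + 1 value byte = 13 bytes.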
+ uint32_t key_size = static_cast<uint32_t>(key.size());
+ uint32_t val_size = static_cast<uint32_t>(value.size());
+ uint32_t internal_key_size = key_size + 8;
+ const uint32_t encoded_len = VarintLength(internal_key_size) +
+ internal_key_size + VarintLength(val_size) +
+ val_size;
+ char* buf = nullptr;
+ std::unique_ptr<MemTableRep>& table =
+ type == kTypeRangeDeletion ? range_del_table_ : table_;
+ KeyHandle handle = table->Allocate(encoded_len, &buf);
+
+ char* p = EncodeVarint32(buf, internal_key_size);
+ memcpy(p, key.data(), key_size);
+ Slice key_slice(p, key_size);
+ p += key_size;
+ uint64_t packed = PackSequenceAndType(s, type);
+ EncodeFixed64(p, packed);
+ p += 8;
+ p = EncodeVarint32(p, val_size);
+ memcpy(p, value.data(), val_size);
+ assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len);
+ size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size();
+
+ if (!allow_concurrent) {
+ // Extract prefix for insert with hint.
+ if (insert_with_hint_prefix_extractor_ != nullptr &&
+ insert_with_hint_prefix_extractor_->InDomain(key_slice)) {
+ Slice prefix = insert_with_hint_prefix_extractor_->Transform(key_slice);
+ bool res = table->InsertKeyWithHint(handle, &insert_hints_[prefix]);
+ if (UNLIKELY(!res)) {
+ return res;
+ }
+ } else {
+ bool res = table->InsertKey(handle);
+ if (UNLIKELY(!res)) {
+ return res;
+ }
+ }
+
+ // this is a bit ugly, but is the way to avoid locked instructions
+ // when incrementing an atomic
+ num_entries_.store(num_entries_.load(std::memory_order_relaxed) + 1,
+ std::memory_order_relaxed);
+ data_size_.store(data_size_.load(std::memory_order_relaxed) + encoded_len,
+ std::memory_order_relaxed);
+ if (type == kTypeDeletion) {
+ num_deletes_.store(num_deletes_.load(std::memory_order_relaxed) + 1,
+ std::memory_order_relaxed);
+ }
+
+ if (bloom_filter_ && prefix_extractor_ &&
+ prefix_extractor_->InDomain(key)) {
+ bloom_filter_->Add(prefix_extractor_->Transform(key));
+ }
+ if (bloom_filter_ && moptions_.memtable_whole_key_filtering) {
+ bloom_filter_->Add(StripTimestampFromUserKey(key, ts_sz));
+ }
+
+ // The first sequence number inserted into the memtable
+ assert(first_seqno_ == 0 || s >= first_seqno_);
+ if (first_seqno_ == 0) {
+ first_seqno_.store(s, std::memory_order_relaxed);
+
+ if (earliest_seqno_ == kMaxSequenceNumber) {
+ earliest_seqno_.store(GetFirstSequenceNumber(),
+ std::memory_order_relaxed);
+ }
+ assert(first_seqno_.load() >= earliest_seqno_.load());
+ }
+ assert(post_process_info == nullptr);
+ UpdateFlushState();
+ } else {
+ bool res = (hint == nullptr)
+ ? table->InsertKeyConcurrently(handle)
+ : table->InsertKeyWithHintConcurrently(handle, hint);
+ if (UNLIKELY(!res)) {
+ return res;
+ }
+
+ assert(post_process_info != nullptr);
+ post_process_info->num_entries++;
+ post_process_info->data_size += encoded_len;
+ if (type == kTypeDeletion) {
+ post_process_info->num_deletes++;
+ }
+
+ if (bloom_filter_ && prefix_extractor_ &&
+ prefix_extractor_->InDomain(key)) {
+ bloom_filter_->AddConcurrently(prefix_extractor_->Transform(key));
+ }
+ if (bloom_filter_ && moptions_.memtable_whole_key_filtering) {
+ bloom_filter_->AddConcurrently(StripTimestampFromUserKey(key, ts_sz));
+ }
+
+ // atomically update first_seqno_ and earliest_seqno_.
+ uint64_t cur_seq_num = first_seqno_.load(std::memory_order_relaxed);
+ while ((cur_seq_num == 0 || s < cur_seq_num) &&
+ !first_seqno_.compare_exchange_weak(cur_seq_num, s)) {
+ }
+ uint64_t cur_earliest_seqno =
+ earliest_seqno_.load(std::memory_order_relaxed);
+    while (
+        (cur_earliest_seqno == kMaxSequenceNumber || s < cur_earliest_seqno) &&
+        !earliest_seqno_.compare_exchange_weak(cur_earliest_seqno, s)) {
+    }
+ }
+ if (type == kTypeRangeDeletion) {
+ is_range_del_table_empty_.store(false, std::memory_order_relaxed);
+ }
+ UpdateOldestKeyTime();
+ return true;
+}
+
+// Callback from MemTable::Get()
+namespace {
+
+struct Saver {
+ Status* status;
+ const LookupKey* key;
+ bool* found_final_value; // Is value set correctly? Used by KeyMayExist
+ bool* merge_in_progress;
+ std::string* value;
+ SequenceNumber seq;
+ const MergeOperator* merge_operator;
+ // the merge operations encountered;
+ MergeContext* merge_context;
+ SequenceNumber max_covering_tombstone_seq;
+ MemTable* mem;
+ Logger* logger;
+ Statistics* statistics;
+ bool inplace_update_support;
+ bool do_merge;
+ Env* env_;
+ ReadCallback* callback_;
+ bool* is_blob_index;
+
+ bool CheckCallback(SequenceNumber _seq) {
+ if (callback_) {
+ return callback_->IsVisible(_seq);
+ }
+ return true;
+ }
+};
+} // namespace
+
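+// SaveValue is invoked by MemTableRep::Get() for each entry starting at the
+// lookup position, newest first within a user key. Returning true continues
+// the scan (e.g. after consuming a merge operand); returning false stops it
+// because a final value, a deletion, a different user key, or an error was
+// found.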
+static bool SaveValue(void* arg, const char* entry) {
+ Saver* s = reinterpret_cast<Saver*>(arg);
+ assert(s != nullptr);
+ MergeContext* merge_context = s->merge_context;
+ SequenceNumber max_covering_tombstone_seq = s->max_covering_tombstone_seq;
+ const MergeOperator* merge_operator = s->merge_operator;
+
+ assert(merge_context != nullptr);
+
+ // entry format is:
+ // klength varint32
+ // userkey char[klength-8]
+ // tag uint64
+  //  vlength  varint32
+ // value char[vlength]
+  // Check that it belongs to the same user key. We do not check the
+ // sequence number since the Seek() call above should have skipped
+ // all entries with overly large sequence numbers.
+ uint32_t key_length;
+ const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+ Slice user_key_slice = Slice(key_ptr, key_length - 8);
+ if (s->mem->GetInternalKeyComparator()
+ .user_comparator()
+ ->CompareWithoutTimestamp(user_key_slice, s->key->user_key()) == 0) {
+ // Correct user key
+ const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+ ValueType type;
+ SequenceNumber seq;
+ UnPackSequenceAndType(tag, &seq, &type);
+ // If the value is not in the snapshot, skip it
+ if (!s->CheckCallback(seq)) {
+ return true; // to continue to the next seq
+ }
+
+ s->seq = seq;
+
+ if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex) &&
+ max_covering_tombstone_seq > seq) {
+ type = kTypeRangeDeletion;
+ }
+ switch (type) {
+ case kTypeBlobIndex:
+ if (s->is_blob_index == nullptr) {
+ ROCKS_LOG_ERROR(s->logger, "Encounter unexpected blob index.");
+ *(s->status) = Status::NotSupported(
+ "Encounter unsupported blob value. Please open DB with "
+ "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
+ } else if (*(s->merge_in_progress)) {
+ *(s->status) =
+ Status::NotSupported("Blob DB does not support merge operator.");
+ }
+ if (!s->status->ok()) {
+ *(s->found_final_value) = true;
+ return false;
+ }
+ FALLTHROUGH_INTENDED;
+ case kTypeValue: {
+ if (s->inplace_update_support) {
+ s->mem->GetLock(s->key->user_key())->ReadLock();
+ }
+ Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
+ *(s->status) = Status::OK();
+ if (*(s->merge_in_progress)) {
+ if (s->do_merge) {
+ if (s->value != nullptr) {
+ *(s->status) = MergeHelper::TimedFullMerge(
+ merge_operator, s->key->user_key(), &v,
+ merge_context->GetOperands(), s->value, s->logger,
+ s->statistics, s->env_, nullptr /* result_operand */, true);
+ }
+ } else {
+ // Preserve the value with the goal of returning it as part of
+ // raw merge operands to the user
+ merge_context->PushOperand(
+ v, s->inplace_update_support == false /* operand_pinned */);
+ }
+ } else if (!s->do_merge) {
+ // Preserve the value with the goal of returning it as part of
+ // raw merge operands to the user
+ merge_context->PushOperand(
+ v, s->inplace_update_support == false /* operand_pinned */);
+ } else if (s->value != nullptr) {
+ s->value->assign(v.data(), v.size());
+ }
+ if (s->inplace_update_support) {
+ s->mem->GetLock(s->key->user_key())->ReadUnlock();
+ }
+ *(s->found_final_value) = true;
+ if (s->is_blob_index != nullptr) {
+ *(s->is_blob_index) = (type == kTypeBlobIndex);
+ }
+ return false;
+ }
+ case kTypeDeletion:
+ case kTypeSingleDeletion:
+ case kTypeRangeDeletion: {
+ if (*(s->merge_in_progress)) {
+ if (s->value != nullptr) {
+ *(s->status) = MergeHelper::TimedFullMerge(
+ merge_operator, s->key->user_key(), nullptr,
+ merge_context->GetOperands(), s->value, s->logger,
+ s->statistics, s->env_, nullptr /* result_operand */, true);
+ }
+ } else {
+ *(s->status) = Status::NotFound();
+ }
+ *(s->found_final_value) = true;
+ return false;
+ }
+ case kTypeMerge: {
+ if (!merge_operator) {
+ *(s->status) = Status::InvalidArgument(
+ "merge_operator is not properly initialized.");
+ // Normally we continue the loop (return true) when we see a merge
+ // operand. But in case of an error, we should stop the loop
+ // immediately and pretend we have found the value to stop further
+ // seek. Otherwise, the later call will override this error status.
+ *(s->found_final_value) = true;
+ return false;
+ }
+ Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
+ *(s->merge_in_progress) = true;
+ merge_context->PushOperand(
+ v, s->inplace_update_support == false /* operand_pinned */);
+ if (s->do_merge && merge_operator->ShouldMerge(
+ merge_context->GetOperandsDirectionBackward())) {
+ *(s->status) = MergeHelper::TimedFullMerge(
+ merge_operator, s->key->user_key(), nullptr,
+ merge_context->GetOperands(), s->value, s->logger, s->statistics,
+ s->env_, nullptr /* result_operand */, true);
+ *(s->found_final_value) = true;
+ return false;
+ }
+ return true;
+ }
+ default:
+ assert(false);
+ return true;
+ }
+ }
+
+  // s->status could be an error, merge-in-progress or not-found at this point
+ return false;
+}
+
+bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ SequenceNumber* seq, const ReadOptions& read_opts,
+ ReadCallback* callback, bool* is_blob_index, bool do_merge) {
+ // The sequence number is updated synchronously in version_set.h
+ if (IsEmpty()) {
+    // Avoid recording stats for speed.
+ return false;
+ }
+ PERF_TIMER_GUARD(get_from_memtable_time);
+
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ NewRangeTombstoneIterator(read_opts,
+ GetInternalKeySeqno(key.internal_key())));
+ if (range_del_iter != nullptr) {
+ *max_covering_tombstone_seq =
+ std::max(*max_covering_tombstone_seq,
+ range_del_iter->MaxCoveringTombstoneSeqnum(key.user_key()));
+ }
+
+ Slice user_key = key.user_key();
+ bool found_final_value = false;
+ bool merge_in_progress = s->IsMergeInProgress();
+ bool may_contain = true;
+ size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size();
+ if (bloom_filter_) {
+ // when both memtable_whole_key_filtering and prefix_extractor_ are set,
+ // only do whole key filtering for Get() to save CPU
+ if (moptions_.memtable_whole_key_filtering) {
+ may_contain =
+ bloom_filter_->MayContain(StripTimestampFromUserKey(user_key, ts_sz));
+ } else {
+ assert(prefix_extractor_);
+ may_contain =
+ !prefix_extractor_->InDomain(user_key) ||
+ bloom_filter_->MayContain(prefix_extractor_->Transform(user_key));
+ }
+ }
+
+ if (bloom_filter_ && !may_contain) {
+    // Bloom filter says the key does not exist in this memtable; skip lookup
+ PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
+ *seq = kMaxSequenceNumber;
+ } else {
+ if (bloom_filter_) {
+ PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
+ }
+ GetFromTable(key, *max_covering_tombstone_seq, do_merge, callback,
+ is_blob_index, value, s, merge_context, seq,
+ &found_final_value, &merge_in_progress);
+ }
+
+ // No change to value, since we have not yet found a Put/Delete
+ if (!found_final_value && merge_in_progress) {
+ *s = Status::MergeInProgress();
+ }
+ PERF_COUNTER_ADD(get_from_memtable_count, 1);
+ return found_final_value;
+}
+
+void MemTable::GetFromTable(const LookupKey& key,
+ SequenceNumber max_covering_tombstone_seq,
+ bool do_merge, ReadCallback* callback,
+ bool* is_blob_index, std::string* value, Status* s,
+ MergeContext* merge_context, SequenceNumber* seq,
+ bool* found_final_value, bool* merge_in_progress) {
+ Saver saver;
+ saver.status = s;
+ saver.found_final_value = found_final_value;
+ saver.merge_in_progress = merge_in_progress;
+ saver.key = &key;
+ saver.value = value;
+ saver.seq = kMaxSequenceNumber;
+ saver.mem = this;
+ saver.merge_context = merge_context;
+ saver.max_covering_tombstone_seq = max_covering_tombstone_seq;
+ saver.merge_operator = moptions_.merge_operator;
+ saver.logger = moptions_.info_log;
+ saver.inplace_update_support = moptions_.inplace_update_support;
+ saver.statistics = moptions_.statistics;
+ saver.env_ = env_;
+ saver.callback_ = callback;
+ saver.is_blob_index = is_blob_index;
+ saver.do_merge = do_merge;
+ table_->Get(key, &saver, SaveValue);
+ *seq = saver.seq;
+}
+
+void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
+ ReadCallback* callback, bool* is_blob) {
+ // The sequence number is updated synchronously in version_set.h
+ if (IsEmpty()) {
+    // Avoid recording stats for speed.
+ return;
+ }
+ PERF_TIMER_GUARD(get_from_memtable_time);
+
+ MultiGetRange temp_range(*range, range->begin(), range->end());
+ if (bloom_filter_) {
+ std::array<Slice*, MultiGetContext::MAX_BATCH_SIZE> keys;
+ std::array<bool, MultiGetContext::MAX_BATCH_SIZE> may_match = {{true}};
+ autovector<Slice, MultiGetContext::MAX_BATCH_SIZE> prefixes;
+ int num_keys = 0;
+ for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
+ if (!prefix_extractor_) {
+ keys[num_keys++] = &iter->ukey;
+ } else if (prefix_extractor_->InDomain(iter->ukey)) {
+ prefixes.emplace_back(prefix_extractor_->Transform(iter->ukey));
+ keys[num_keys++] = &prefixes.back();
+ }
+ }
+ bloom_filter_->MayContain(num_keys, &keys[0], &may_match[0]);
+ int idx = 0;
+ for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
+ if (prefix_extractor_ && !prefix_extractor_->InDomain(iter->ukey)) {
+ PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
+ continue;
+ }
+ if (!may_match[idx]) {
+ temp_range.SkipKey(iter);
+ PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
+ } else {
+ PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
+ }
+ idx++;
+ }
+ }
+ for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
+ SequenceNumber seq = kMaxSequenceNumber;
+ bool found_final_value{false};
+ bool merge_in_progress = iter->s->IsMergeInProgress();
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ NewRangeTombstoneIterator(
+ read_options, GetInternalKeySeqno(iter->lkey->internal_key())));
+ if (range_del_iter != nullptr) {
+ iter->max_covering_tombstone_seq = std::max(
+ iter->max_covering_tombstone_seq,
+ range_del_iter->MaxCoveringTombstoneSeqnum(iter->lkey->user_key()));
+ }
+ GetFromTable(*(iter->lkey), iter->max_covering_tombstone_seq, true,
+ callback, is_blob, iter->value->GetSelf(), iter->s,
+ &(iter->merge_context), &seq, &found_final_value,
+ &merge_in_progress);
+
+ if (!found_final_value && merge_in_progress) {
+ *(iter->s) = Status::MergeInProgress();
+ }
+
+ if (found_final_value) {
+ iter->value->PinSelf();
+ range->MarkKeyDone(iter);
+ RecordTick(moptions_.statistics, MEMTABLE_HIT);
+ }
+ }
+ PERF_COUNTER_ADD(get_from_memtable_count, 1);
+}
+
+void MemTable::Update(SequenceNumber seq,
+ const Slice& key,
+ const Slice& value) {
+ LookupKey lkey(key, seq);
+ Slice mem_key = lkey.memtable_key();
+
+ std::unique_ptr<MemTableRep::Iterator> iter(
+ table_->GetDynamicPrefixIterator());
+ iter->Seek(lkey.internal_key(), mem_key.data());
+
+ if (iter->Valid()) {
+ // entry format is:
+ // key_length varint32
+ // userkey char[klength-8]
+ // tag uint64
+ // vlength varint32
+ // value char[vlength]
+    // Check that it belongs to the same user key. We do not check the
+ // sequence number since the Seek() call above should have skipped
+ // all entries with overly large sequence numbers.
+ const char* entry = iter->key();
+ uint32_t key_length = 0;
+ const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+ if (comparator_.comparator.user_comparator()->Equal(
+ Slice(key_ptr, key_length - 8), lkey.user_key())) {
+ // Correct user key
+ const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+ ValueType type;
+ SequenceNumber existing_seq;
+ UnPackSequenceAndType(tag, &existing_seq, &type);
+ assert(existing_seq != seq);
+ if (type == kTypeValue) {
+ Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
+ uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
+ uint32_t new_size = static_cast<uint32_t>(value.size());
+
+ // Update value, if new value size <= previous value size
+ if (new_size <= prev_size) {
+ char* p =
+ EncodeVarint32(const_cast<char*>(key_ptr) + key_length, new_size);
+ WriteLock wl(GetLock(lkey.user_key()));
+ memcpy(p, value.data(), value.size());
+ assert((unsigned)((p + value.size()) - entry) ==
+ (unsigned)(VarintLength(key_length) + key_length +
+ VarintLength(value.size()) + value.size()));
+ RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED);
+ return;
+ }
+ }
+ }
+ }
+
+ // key doesn't exist
+ bool add_res __attribute__((__unused__));
+ add_res = Add(seq, kTypeValue, key, value);
+  // We already asserted existing_seq != seq above, so Add should not fail.
+ assert(add_res);
+}
+
+bool MemTable::UpdateCallback(SequenceNumber seq,
+ const Slice& key,
+ const Slice& delta) {
+ LookupKey lkey(key, seq);
+ Slice memkey = lkey.memtable_key();
+
+ std::unique_ptr<MemTableRep::Iterator> iter(
+ table_->GetDynamicPrefixIterator());
+ iter->Seek(lkey.internal_key(), memkey.data());
+
+ if (iter->Valid()) {
+ // entry format is:
+ // key_length varint32
+ // userkey char[klength-8]
+ // tag uint64
+ // vlength varint32
+ // value char[vlength]
+    // Check that it belongs to the same user key. We do not check the
+ // sequence number since the Seek() call above should have skipped
+ // all entries with overly large sequence numbers.
+ const char* entry = iter->key();
+ uint32_t key_length = 0;
+ const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+ if (comparator_.comparator.user_comparator()->Equal(
+ Slice(key_ptr, key_length - 8), lkey.user_key())) {
+ // Correct user key
+ const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+ ValueType type;
+ uint64_t unused;
+ UnPackSequenceAndType(tag, &unused, &type);
+ switch (type) {
+ case kTypeValue: {
+ Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
+ uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
+
+ char* prev_buffer = const_cast<char*>(prev_value.data());
+ uint32_t new_prev_size = prev_size;
+
+ std::string str_value;
+ WriteLock wl(GetLock(lkey.user_key()));
+ auto status = moptions_.inplace_callback(prev_buffer, &new_prev_size,
+ delta, &str_value);
+ if (status == UpdateStatus::UPDATED_INPLACE) {
+ // Value already updated by callback.
+ assert(new_prev_size <= prev_size);
+ if (new_prev_size < prev_size) {
+ // overwrite the new prev_size
+ char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
+ new_prev_size);
+ if (VarintLength(new_prev_size) < VarintLength(prev_size)) {
+ // shift the value buffer as well.
+ memcpy(p, prev_buffer, new_prev_size);
+ }
+ }
+ RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED);
+ UpdateFlushState();
+ return true;
+ } else if (status == UpdateStatus::UPDATED) {
+ Add(seq, kTypeValue, key, Slice(str_value));
+ RecordTick(moptions_.statistics, NUMBER_KEYS_WRITTEN);
+ UpdateFlushState();
+ return true;
+ } else if (status == UpdateStatus::UPDATE_FAILED) {
+ // No action required. Return.
+ UpdateFlushState();
+ return true;
+ }
+ }
+ default:
+ break;
+ }
+ }
+ }
+ // If the latest value is not kTypeValue
+ // or key doesn't exist
+ return false;
+}
+
+size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) {
+ Slice memkey = key.memtable_key();
+
+  // A totally ordered iterator is costly for some memtable reps (prefix-aware
+ // reps). By passing in the user key, we allow efficient iterator creation.
+ // The iterator only needs to be ordered within the same user key.
+ std::unique_ptr<MemTableRep::Iterator> iter(
+ table_->GetDynamicPrefixIterator());
+ iter->Seek(key.internal_key(), memkey.data());
+
+ size_t num_successive_merges = 0;
+
+ for (; iter->Valid(); iter->Next()) {
+ const char* entry = iter->key();
+ uint32_t key_length = 0;
+ const char* iter_key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+ if (!comparator_.comparator.user_comparator()->Equal(
+ Slice(iter_key_ptr, key_length - 8), key.user_key())) {
+ break;
+ }
+
+ const uint64_t tag = DecodeFixed64(iter_key_ptr + key_length - 8);
+ ValueType type;
+ uint64_t unused;
+ UnPackSequenceAndType(tag, &unused, &type);
+ if (type != kTypeMerge) {
+ break;
+ }
+
+ ++num_successive_merges;
+ }
+
+ return num_successive_merges;
+}
+
+void MemTableRep::Get(const LookupKey& k, void* callback_args,
+ bool (*callback_func)(void* arg, const char* entry)) {
+ auto iter = GetDynamicPrefixIterator();
+ for (iter->Seek(k.internal_key(), k.memtable_key().data());
+ iter->Valid() && callback_func(callback_args, iter->key());
+ iter->Next()) {
+ }
+}
+
+void MemTable::RefLogContainingPrepSection(uint64_t log) {
+ assert(log > 0);
+ auto cur = min_prep_log_referenced_.load();
+ while ((log < cur || cur == 0) &&
+ !min_prep_log_referenced_.compare_exchange_strong(cur, log)) {
+ cur = min_prep_log_referenced_.load();
+ }
+}
+
+uint64_t MemTable::GetMinLogContainingPrepSection() {
+ return min_prep_log_referenced_.load();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/memtable.h b/src/rocksdb/db/memtable.h
new file mode 100644
index 000000000..f4e4b98a9
--- /dev/null
+++ b/src/rocksdb/db/memtable.h
@@ -0,0 +1,542 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <deque>
+#include <functional>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "db/dbformat.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/read_callback.h"
+#include "db/version_edit.h"
+#include "memory/allocator.h"
+#include "memory/concurrent_arena.h"
+#include "monitoring/instrumented_mutex.h"
+#include "options/cf_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "table/multiget_context.h"
+#include "util/dynamic_bloom.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct FlushJobInfo;
+class Mutex;
+class MemTableIterator;
+class MergeContext;
+
+struct ImmutableMemTableOptions {
+ explicit ImmutableMemTableOptions(const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options);
+ size_t arena_block_size;
+ uint32_t memtable_prefix_bloom_bits;
+ size_t memtable_huge_page_size;
+ bool memtable_whole_key_filtering;
+ bool inplace_update_support;
+ size_t inplace_update_num_locks;
+ UpdateStatus (*inplace_callback)(char* existing_value,
+ uint32_t* existing_value_size,
+ Slice delta_value,
+ std::string* merged_value);
+ size_t max_successive_merges;
+ Statistics* statistics;
+ MergeOperator* merge_operator;
+ Logger* info_log;
+};
+
+// Batched counters to be updated when inserting keys in one write batch.
+// In post process of the write batch, these can be updated together.
+// Only used in concurrent memtable insert case.
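+// Typical concurrent-insert flow (sketch): each writer thread passes its own
+// MemTablePostProcessInfo to Add(..., allow_concurrent=true, &info, ...) and
+// calls MemTable::BatchPostProcess(info) once after the whole batch has been
+// inserted.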
+struct MemTablePostProcessInfo {
+ uint64_t data_size = 0;
+ uint64_t num_entries = 0;
+ uint64_t num_deletes = 0;
+};
+
+using MultiGetRange = MultiGetContext::Range;
+// Note: Many of the methods in this class have comments indicating that
+// external synchronization is required as these methods are not thread-safe.
+// It is up to higher layers of code to decide how to prevent concurrent
+// invocation of these methods. This is usually done by acquiring either
+// the db mutex or the single writer thread.
+//
+// Some of these methods are documented to only require external
+// synchronization if this memtable is immutable. Calling MarkImmutable() is
+// not sufficient to guarantee immutability. It is up to higher layers of
+// code to determine if this MemTable can still be modified by other threads.
+// Eg: The Superversion stores a pointer to the current MemTable (that can
+// be modified) and a separate list of the MemTables that can no longer be
+// written to (aka the 'immutable memtables').
+class MemTable {
+ public:
+ struct KeyComparator : public MemTableRep::KeyComparator {
+ const InternalKeyComparator comparator;
+ explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
+ virtual int operator()(const char* prefix_len_key1,
+ const char* prefix_len_key2) const override;
+ virtual int operator()(const char* prefix_len_key,
+ const DecodedType& key) const override;
+ };
+
+ // MemTables are reference counted. The initial reference count
+ // is zero and the caller must call Ref() at least once.
+ //
+ // earliest_seq should be the current SequenceNumber in the db such that any
+ // key inserted into this memtable will have an equal or larger seq number.
+ // (When a db is first created, the earliest sequence number will be 0).
+ // If the earliest sequence number is not known, kMaxSequenceNumber may be
+ // used, but this may prevent some transactions from succeeding until the
+ // first key is inserted into the memtable.
+ explicit MemTable(const InternalKeyComparator& comparator,
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ WriteBufferManager* write_buffer_manager,
+ SequenceNumber earliest_seq, uint32_t column_family_id);
+ // No copying allowed
+ MemTable(const MemTable&) = delete;
+ MemTable& operator=(const MemTable&) = delete;
+
+  // Do not delete this MemTable unless Unref() indicates it is not in use.
+ ~MemTable();
+
+ // Increase reference count.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ void Ref() { ++refs_; }
+
+ // Drop reference count.
+ // If the refcount goes to zero return this memtable, otherwise return null.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ MemTable* Unref() {
+ --refs_;
+ assert(refs_ >= 0);
+ if (refs_ <= 0) {
+ return this;
+ }
+ return nullptr;
+ }
+
+ // Returns an estimate of the number of bytes of data in use by this
+ // data structure.
+ //
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable (unless this Memtable is immutable).
+ size_t ApproximateMemoryUsage();
+
+  // As a cheap version of `ApproximateMemoryUsage()`, this function doesn't
+  // require external synchronization. The value may be less accurate though.
+ size_t ApproximateMemoryUsageFast() const {
+ return approximate_memory_usage_.load(std::memory_order_relaxed);
+ }
+
+ // This method heuristically determines if the memtable should continue to
+ // host more data.
+ bool ShouldScheduleFlush() const {
+ return flush_state_.load(std::memory_order_relaxed) == FLUSH_REQUESTED;
+ }
+
+ // Returns true if a flush should be scheduled and the caller should
+ // be the one to schedule it
+ bool MarkFlushScheduled() {
+ auto before = FLUSH_REQUESTED;
+ return flush_state_.compare_exchange_strong(before, FLUSH_SCHEDULED,
+ std::memory_order_relaxed,
+ std::memory_order_relaxed);
+ }
+
+ // Return an iterator that yields the contents of the memtable.
+ //
+ // The caller must ensure that the underlying MemTable remains live
+ // while the returned iterator is live. The keys returned by this
+ // iterator are internal keys encoded by AppendInternalKey in the
+ // db/dbformat.{h,cc} module.
+ //
+ // By default, it returns an iterator for prefix seek if prefix_extractor
+ // is configured in Options.
+ // arena: If not null, the arena needs to be used to allocate the Iterator.
+ // Calling ~Iterator of the iterator will destroy all the states but
+ // those allocated in arena.
+ InternalIterator* NewIterator(const ReadOptions& read_options, Arena* arena);
+
+ FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
+ const ReadOptions& read_options, SequenceNumber read_seq);
+
+ // Add an entry into memtable that maps key to value at the
+ // specified sequence number and with the specified type.
+ // Typically value will be empty if type==kTypeDeletion.
+ //
+ // REQUIRES: if allow_concurrent = false, external synchronization to prevent
+ // simultaneous operations on the same MemTable.
+ //
+ // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
+ // the <key, seq> already exists.
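+  //
+  // Example (sketch): a single-threaded writer inserting a Put would call
+  //   mem->Add(seq, kTypeValue, user_key, value);
+  // while concurrent writers pass allow_concurrent = true together with a
+  // per-thread MemTablePostProcessInfo.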
+ bool Add(SequenceNumber seq, ValueType type, const Slice& key,
+ const Slice& value, bool allow_concurrent = false,
+ MemTablePostProcessInfo* post_process_info = nullptr,
+ void** hint = nullptr);
+
+ // Used to Get value associated with key or Get Merge Operands associated
+ // with key.
+  // If do_merge = true, the default behavior (get the value for key) is
+  // executed; the expected behavior is described right below.
+ // If memtable contains a value for key, store it in *value and return true.
+ // If memtable contains a deletion for key, store a NotFound() error
+ // in *status and return true.
+ // If memtable contains Merge operation as the most recent entry for a key,
+ // and the merge process does not stop (not reaching a value or delete),
+  // prepend the current merge operand to *operands,
+  // store MergeInProgress in *s, and return false.
+ // Else, return false.
+ // If any operation was found, its most recent sequence number
+ // will be stored in *seq on success (regardless of whether true/false is
+ // returned). Otherwise, *seq will be set to kMaxSequenceNumber.
+ // On success, *s may be set to OK, NotFound, or MergeInProgress. Any other
+ // status returned indicates a corruption or other unexpected error.
+ // If do_merge = false then any Merge Operands encountered for key are simply
+ // stored in merge_context.operands_list and never actually merged to get a
+ // final value. The raw Merge Operands are eventually returned to the user.
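+  //
+  // Usage sketch (the caller-side names here are illustrative only):
+  //   std::string value;
+  //   Status s;
+  //   MergeContext merge_context;
+  //   SequenceNumber max_covering_tombstone_seq = 0;
+  //   bool found = mem->Get(LookupKey(user_key, snapshot_seq), &value, &s,
+  //                         &merge_context, &max_covering_tombstone_seq,
+  //                         ReadOptions());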
+ bool Get(const LookupKey& key, std::string* value, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
+ const ReadOptions& read_opts, ReadCallback* callback = nullptr,
+ bool* is_blob_index = nullptr, bool do_merge = true);
+
+ bool Get(const LookupKey& key, std::string* value, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ const ReadOptions& read_opts, ReadCallback* callback = nullptr,
+ bool* is_blob_index = nullptr, bool do_merge = true) {
+ SequenceNumber seq;
+ return Get(key, value, s, merge_context, max_covering_tombstone_seq, &seq,
+ read_opts, callback, is_blob_index, do_merge);
+ }
+
+ void MultiGet(const ReadOptions& read_options, MultiGetRange* range,
+ ReadCallback* callback, bool* is_blob);
+
+ // Attempts to update the new_value inplace, else does normal Add
+ // Pseudocode
+ // if key exists in current memtable && prev_value is of type kTypeValue
+  //  if sizeof(new_value) <= sizeof(prev_value)
+ // update inplace
+ // else add(key, new_value)
+ // else add(key, new_value)
+ //
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ void Update(SequenceNumber seq,
+ const Slice& key,
+ const Slice& value);
+
+  // If prev_value for key exists, attempts to update it inplace;
+  // otherwise returns false.
+ // Pseudocode
+ // if key exists in current memtable && prev_value is of type kTypeValue
+ // new_value = delta(prev_value)
+ // if sizeof(new_value) <= sizeof(prev_value)
+ // update inplace
+ // else add(key, new_value)
+ // else return false
+ //
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
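+  //
+  // delta is interpreted by moptions_.inplace_callback, which may rewrite the
+  // existing value in place (UpdateStatus::UPDATED_INPLACE), produce a new
+  // value that is added as a fresh entry (UpdateStatus::UPDATED), or report
+  // UpdateStatus::UPDATE_FAILED; in all three cases this method returns true.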
+ bool UpdateCallback(SequenceNumber seq,
+ const Slice& key,
+ const Slice& delta);
+
+ // Returns the number of successive merge entries starting from the newest
+ // entry for the key up to the last non-merge entry or last entry for the
+ // key in the memtable.
+ size_t CountSuccessiveMergeEntries(const LookupKey& key);
+
+ // Update counters and flush status after inserting a whole write batch
+ // Used in concurrent memtable inserts.
+ void BatchPostProcess(const MemTablePostProcessInfo& update_counters) {
+ num_entries_.fetch_add(update_counters.num_entries,
+ std::memory_order_relaxed);
+ data_size_.fetch_add(update_counters.data_size, std::memory_order_relaxed);
+ if (update_counters.num_deletes != 0) {
+ num_deletes_.fetch_add(update_counters.num_deletes,
+ std::memory_order_relaxed);
+ }
+ UpdateFlushState();
+ }
+
+ // Get total number of entries in the mem table.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable (unless this Memtable is immutable).
+ uint64_t num_entries() const {
+ return num_entries_.load(std::memory_order_relaxed);
+ }
+
+ // Get total number of deletes in the mem table.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable (unless this Memtable is immutable).
+ uint64_t num_deletes() const {
+ return num_deletes_.load(std::memory_order_relaxed);
+ }
+
+ uint64_t get_data_size() const {
+ return data_size_.load(std::memory_order_relaxed);
+ }
+
+ // Dynamically change the memtable's capacity. If set below the current usage,
+ // the next key added will trigger a flush. Can only increase size when
+ // memtable prefix bloom is disabled, since we can't easily allocate more
+ // space.
+ void UpdateWriteBufferSize(size_t new_write_buffer_size) {
+ if (bloom_filter_ == nullptr ||
+ new_write_buffer_size < write_buffer_size_) {
+ write_buffer_size_.store(new_write_buffer_size,
+ std::memory_order_relaxed);
+ }
+ }
+
+ // Returns the edits area that is needed for flushing the memtable
+ VersionEdit* GetEdits() { return &edit_; }
+
+  // Returns true if no entry has been inserted into the mem table.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable (unless this Memtable is immutable).
+ bool IsEmpty() const { return first_seqno_ == 0; }
+
+ // Returns the sequence number of the first element that was inserted
+ // into the memtable.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable (unless this Memtable is immutable).
+ SequenceNumber GetFirstSequenceNumber() {
+ return first_seqno_.load(std::memory_order_relaxed);
+ }
+
+ // Returns the sequence number that is guaranteed to be smaller than or equal
+ // to the sequence number of any key that could be inserted into this
+ // memtable. It can then be assumed that any write with a larger(or equal)
+ // sequence number will be present in this memtable or a later memtable.
+ //
+ // If the earliest sequence number could not be determined,
+ // kMaxSequenceNumber will be returned.
+ SequenceNumber GetEarliestSequenceNumber() {
+ return earliest_seqno_.load(std::memory_order_relaxed);
+ }
+
+ // DB's latest sequence ID when the memtable is created. This number
+ // may be updated to a more recent one before any key is inserted.
+ SequenceNumber GetCreationSeq() const { return creation_seq_; }
+
+ void SetCreationSeq(SequenceNumber sn) { creation_seq_ = sn; }
+
+ // Returns the next active logfile number when this memtable is about to
+ // be flushed to storage
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ uint64_t GetNextLogNumber() { return mem_next_logfile_number_; }
+
+ // Sets the next active logfile number when this memtable is about to
+ // be flushed to storage
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }
+
+ // if this memtable contains data from a committed
+ // two phase transaction we must take note of the
+ // log which contains that data so we can know
+  // when to release that log
+ void RefLogContainingPrepSection(uint64_t log);
+ uint64_t GetMinLogContainingPrepSection();
+
+ // Notify the underlying storage that no more items will be added.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ // After MarkImmutable() is called, you should not attempt to
+  // write anything to this MemTable (i.e. do not call Add() or Update()).
+ void MarkImmutable() {
+ table_->MarkReadOnly();
+ mem_tracker_.DoneAllocating();
+ }
+
+ // Notify the underlying storage that all data it contained has been
+ // persisted.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ void MarkFlushed() {
+ table_->MarkFlushed();
+ }
+
+ // return true if the current MemTableRep supports merge operator.
+ bool IsMergeOperatorSupported() const {
+ return table_->IsMergeOperatorSupported();
+ }
+
+ // return true if the current MemTableRep supports snapshots.
+  // inplace update prevents snapshots.
+ bool IsSnapshotSupported() const {
+ return table_->IsSnapshotSupported() && !moptions_.inplace_update_support;
+ }
+
+ struct MemTableStats {
+ uint64_t size;
+ uint64_t count;
+ };
+
+ MemTableStats ApproximateStats(const Slice& start_ikey,
+ const Slice& end_ikey);
+
+ // Get the lock associated for the key
+ port::RWMutex* GetLock(const Slice& key);
+
+ const InternalKeyComparator& GetInternalKeyComparator() const {
+ return comparator_.comparator;
+ }
+
+ const ImmutableMemTableOptions* GetImmutableMemTableOptions() const {
+ return &moptions_;
+ }
+
+ uint64_t ApproximateOldestKeyTime() const {
+ return oldest_key_time_.load(std::memory_order_relaxed);
+ }
+
+ // REQUIRES: db_mutex held.
+ void SetID(uint64_t id) { id_ = id; }
+
+ uint64_t GetID() const { return id_; }
+
+ void SetFlushCompleted(bool completed) { flush_completed_ = completed; }
+
+ uint64_t GetFileNumber() const { return file_number_; }
+
+ void SetFileNumber(uint64_t file_num) { file_number_ = file_num; }
+
+ void SetFlushInProgress(bool in_progress) {
+ flush_in_progress_ = in_progress;
+ }
+
+#ifndef ROCKSDB_LITE
+ void SetFlushJobInfo(std::unique_ptr<FlushJobInfo>&& info) {
+ flush_job_info_ = std::move(info);
+ }
+
+ std::unique_ptr<FlushJobInfo> ReleaseFlushJobInfo() {
+ return std::move(flush_job_info_);
+ }
+#endif // !ROCKSDB_LITE
+
+ private:
+ enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED };
+
+ friend class MemTableIterator;
+ friend class MemTableBackwardIterator;
+ friend class MemTableList;
+
+ KeyComparator comparator_;
+ const ImmutableMemTableOptions moptions_;
+ int refs_;
+ const size_t kArenaBlockSize;
+ AllocTracker mem_tracker_;
+ ConcurrentArena arena_;
+ std::unique_ptr<MemTableRep> table_;
+ std::unique_ptr<MemTableRep> range_del_table_;
+ std::atomic_bool is_range_del_table_empty_;
+
+ // Total data size of all data inserted
+ std::atomic<uint64_t> data_size_;
+ std::atomic<uint64_t> num_entries_;
+ std::atomic<uint64_t> num_deletes_;
+
+ // Dynamically changeable memtable option
+ std::atomic<size_t> write_buffer_size_;
+
+ // These are used to manage memtable flushes to storage
+ bool flush_in_progress_; // started the flush
+ bool flush_completed_; // finished the flush
+ uint64_t file_number_; // filled up after flush is complete
+
+ // The updates to be applied to the transaction log when this
+ // memtable is flushed to storage.
+ VersionEdit edit_;
+
+ // The sequence number of the kv that was inserted first
+ std::atomic<SequenceNumber> first_seqno_;
+
+ // The db sequence number at the time of creation or kMaxSequenceNumber
+ // if not set.
+ std::atomic<SequenceNumber> earliest_seqno_;
+
+ SequenceNumber creation_seq_;
+
+ // The log files earlier than this number can be deleted.
+ uint64_t mem_next_logfile_number_;
+
+ // the earliest log containing a prepared section
+ // which has been inserted into this memtable.
+ std::atomic<uint64_t> min_prep_log_referenced_;
+
+ // rw locks for inplace updates
+ std::vector<port::RWMutex> locks_;
+
+ const SliceTransform* const prefix_extractor_;
+ std::unique_ptr<DynamicBloom> bloom_filter_;
+
+ std::atomic<FlushStateEnum> flush_state_;
+
+ Env* env_;
+
+ // Extract sequential insert prefixes.
+ const SliceTransform* insert_with_hint_prefix_extractor_;
+
+ // Insert hints for each prefix.
+ std::unordered_map<Slice, void*, SliceHasher> insert_hints_;
+
+ // Timestamp of oldest key
+ std::atomic<uint64_t> oldest_key_time_;
+
+ // Memtable id to track flush.
+ uint64_t id_ = 0;
+
+ // Sequence number of the atomic flush that is responsible for this memtable.
+ // The sequence number of atomic flush is a seq, such that no writes with
+ // sequence numbers greater than or equal to seq are flushed, while all
+ // writes with sequence number smaller than seq are flushed.
+ SequenceNumber atomic_flush_seqno_;
+
+ // keep track of memory usage in table_, arena_, and range_del_table_.
+  // Gets refreshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow`
+ std::atomic<uint64_t> approximate_memory_usage_;
+
+#ifndef ROCKSDB_LITE
+ // Flush job info of the current memtable.
+ std::unique_ptr<FlushJobInfo> flush_job_info_;
+#endif // !ROCKSDB_LITE
+
+ // Returns a heuristic flush decision
+ bool ShouldFlushNow();
+
+ // Updates flush_state_ using ShouldFlushNow()
+ void UpdateFlushState();
+
+ void UpdateOldestKeyTime();
+
+ void GetFromTable(const LookupKey& key,
+ SequenceNumber max_covering_tombstone_seq, bool do_merge,
+ ReadCallback* callback, bool* is_blob_index,
+ std::string* value, Status* s, MergeContext* merge_context,
+ SequenceNumber* seq, bool* found_final_value,
+ bool* merge_in_progress);
+};
+
+extern const char* EncodeKey(std::string* scratch, const Slice& target);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/memtable_list.cc b/src/rocksdb/db/memtable_list.cc
new file mode 100644
index 000000000..a8b358fa6
--- /dev/null
+++ b/src/rocksdb/db/memtable_list.cc
@@ -0,0 +1,771 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/memtable_list.h"
+
+#include <cinttypes>
+#include <limits>
+#include <queue>
+#include <string>
+#include "db/db_impl/db_impl.h"
+#include "db/memtable.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/version_set.h"
+#include "logging/log_buffer.h"
+#include "monitoring/thread_status_util.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "table/merging_iterator.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class InternalKeyComparator;
+class Mutex;
+class VersionSet;
+
+void MemTableListVersion::AddMemTable(MemTable* m) {
+ memlist_.push_front(m);
+ *parent_memtable_list_memory_usage_ += m->ApproximateMemoryUsage();
+}
+
+void MemTableListVersion::UnrefMemTable(autovector<MemTable*>* to_delete,
+ MemTable* m) {
+ if (m->Unref()) {
+ to_delete->push_back(m);
+ assert(*parent_memtable_list_memory_usage_ >= m->ApproximateMemoryUsage());
+ *parent_memtable_list_memory_usage_ -= m->ApproximateMemoryUsage();
+ }
+}
+
+MemTableListVersion::MemTableListVersion(
+ size_t* parent_memtable_list_memory_usage, MemTableListVersion* old)
+ : max_write_buffer_number_to_maintain_(
+ old->max_write_buffer_number_to_maintain_),
+ max_write_buffer_size_to_maintain_(
+ old->max_write_buffer_size_to_maintain_),
+ parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) {
+ if (old != nullptr) {
+ memlist_ = old->memlist_;
+ for (auto& m : memlist_) {
+ m->Ref();
+ }
+
+ memlist_history_ = old->memlist_history_;
+ for (auto& m : memlist_history_) {
+ m->Ref();
+ }
+ }
+}
+
+MemTableListVersion::MemTableListVersion(
+ size_t* parent_memtable_list_memory_usage,
+ int max_write_buffer_number_to_maintain,
+ int64_t max_write_buffer_size_to_maintain)
+ : max_write_buffer_number_to_maintain_(max_write_buffer_number_to_maintain),
+ max_write_buffer_size_to_maintain_(max_write_buffer_size_to_maintain),
+ parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) {}
+
+void MemTableListVersion::Ref() { ++refs_; }
+
+// called by superversion::clean()
+void MemTableListVersion::Unref(autovector<MemTable*>* to_delete) {
+ assert(refs_ >= 1);
+ --refs_;
+ if (refs_ == 0) {
+ // if to_delete is equal to nullptr it means we're confident
+ // that refs_ will not be zero
+ assert(to_delete != nullptr);
+ for (const auto& m : memlist_) {
+ UnrefMemTable(to_delete, m);
+ }
+ for (const auto& m : memlist_history_) {
+ UnrefMemTable(to_delete, m);
+ }
+ delete this;
+ }
+}
+
+int MemTableList::NumNotFlushed() const {
+ int size = static_cast<int>(current_->memlist_.size());
+ assert(num_flush_not_started_ <= size);
+ return size;
+}
+
+int MemTableList::NumFlushed() const {
+ return static_cast<int>(current_->memlist_history_.size());
+}
+
+// Search all the memtables starting from the most recent one.
+// Return the most recent value found, if any.
+// Operands stores the list of merge operations to apply, so far.
+bool MemTableListVersion::Get(const LookupKey& key, std::string* value,
+ Status* s, MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ SequenceNumber* seq, const ReadOptions& read_opts,
+ ReadCallback* callback, bool* is_blob_index) {
+ return GetFromList(&memlist_, key, value, s, merge_context,
+ max_covering_tombstone_seq, seq, read_opts, callback,
+ is_blob_index);
+}
+
+void MemTableListVersion::MultiGet(const ReadOptions& read_options,
+ MultiGetRange* range, ReadCallback* callback,
+ bool* is_blob) {
+ for (auto memtable : memlist_) {
+ memtable->MultiGet(read_options, range, callback, is_blob);
+ if (range->empty()) {
+ return;
+ }
+ }
+}
+
+bool MemTableListVersion::GetMergeOperands(
+ const LookupKey& key, Status* s, MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq, const ReadOptions& read_opts) {
+ for (MemTable* memtable : memlist_) {
+ bool done = memtable->Get(key, nullptr, s, merge_context,
+ max_covering_tombstone_seq, read_opts, nullptr,
+ nullptr, false);
+ if (done) {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool MemTableListVersion::GetFromHistory(
+ const LookupKey& key, std::string* value, Status* s,
+ MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq,
+ SequenceNumber* seq, const ReadOptions& read_opts, bool* is_blob_index) {
+ return GetFromList(&memlist_history_, key, value, s, merge_context,
+ max_covering_tombstone_seq, seq, read_opts,
+ nullptr /*read_callback*/, is_blob_index);
+}
+
+bool MemTableListVersion::GetFromList(
+ std::list<MemTable*>* list, const LookupKey& key, std::string* value,
+ Status* s, MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
+ const ReadOptions& read_opts, ReadCallback* callback, bool* is_blob_index) {
+ *seq = kMaxSequenceNumber;
+
+ for (auto& memtable : *list) {
+ SequenceNumber current_seq = kMaxSequenceNumber;
+
+ bool done =
+ memtable->Get(key, value, s, merge_context, max_covering_tombstone_seq,
+ &current_seq, read_opts, callback, is_blob_index);
+ if (*seq == kMaxSequenceNumber) {
+ // Store the most recent sequence number of any operation on this key.
+ // Since we only care about the most recent change, we only need to
+ // return the first operation found when searching memtables in
+ // reverse-chronological order.
+ // current_seq would be equal to kMaxSequenceNumber if the value was to be
+ // skipped. This allows seq to be assigned again when the next value is
+ // read.
+ *seq = current_seq;
+ }
+
+ if (done) {
+ assert(*seq != kMaxSequenceNumber || s->IsNotFound());
+ return true;
+ }
+ if (!done && !s->ok() && !s->IsMergeInProgress() && !s->IsNotFound()) {
+ return false;
+ }
+ }
+ return false;
+}
+
+Status MemTableListVersion::AddRangeTombstoneIterators(
+ const ReadOptions& read_opts, Arena* /*arena*/,
+ RangeDelAggregator* range_del_agg) {
+ assert(range_del_agg != nullptr);
+ // Except for snapshot read, using kMaxSequenceNumber is OK because these
+ // are immutable memtables.
+ SequenceNumber read_seq = read_opts.snapshot != nullptr
+ ? read_opts.snapshot->GetSequenceNumber()
+ : kMaxSequenceNumber;
+ for (auto& m : memlist_) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ m->NewRangeTombstoneIterator(read_opts, read_seq));
+ range_del_agg->AddTombstones(std::move(range_del_iter));
+ }
+ return Status::OK();
+}
+
+void MemTableListVersion::AddIterators(
+ const ReadOptions& options, std::vector<InternalIterator*>* iterator_list,
+ Arena* arena) {
+ for (auto& m : memlist_) {
+ iterator_list->push_back(m->NewIterator(options, arena));
+ }
+}
+
+void MemTableListVersion::AddIterators(
+ const ReadOptions& options, MergeIteratorBuilder* merge_iter_builder) {
+ for (auto& m : memlist_) {
+ merge_iter_builder->AddIterator(
+ m->NewIterator(options, merge_iter_builder->GetArena()));
+ }
+}
+
+uint64_t MemTableListVersion::GetTotalNumEntries() const {
+ uint64_t total_num = 0;
+ for (auto& m : memlist_) {
+ total_num += m->num_entries();
+ }
+ return total_num;
+}
+
+MemTable::MemTableStats MemTableListVersion::ApproximateStats(
+ const Slice& start_ikey, const Slice& end_ikey) {
+ MemTable::MemTableStats total_stats = {0, 0};
+ for (auto& m : memlist_) {
+ auto mStats = m->ApproximateStats(start_ikey, end_ikey);
+ total_stats.size += mStats.size;
+ total_stats.count += mStats.count;
+ }
+ return total_stats;
+}
+
+uint64_t MemTableListVersion::GetTotalNumDeletes() const {
+ uint64_t total_num = 0;
+ for (auto& m : memlist_) {
+ total_num += m->num_deletes();
+ }
+ return total_num;
+}
+
+SequenceNumber MemTableListVersion::GetEarliestSequenceNumber(
+ bool include_history) const {
+ if (include_history && !memlist_history_.empty()) {
+ return memlist_history_.back()->GetEarliestSequenceNumber();
+ } else if (!memlist_.empty()) {
+ return memlist_.back()->GetEarliestSequenceNumber();
+ } else {
+ return kMaxSequenceNumber;
+ }
+}
+
+// caller is responsible for referencing m
+void MemTableListVersion::Add(MemTable* m, autovector<MemTable*>* to_delete) {
+ assert(refs_ == 1); // only when refs_ == 1 is MemTableListVersion mutable
+ AddMemTable(m);
+
+ TrimHistory(to_delete, m->ApproximateMemoryUsage());
+}
+
+// Removes m from list of memtables not flushed. Caller should NOT Unref m.
+void MemTableListVersion::Remove(MemTable* m,
+ autovector<MemTable*>* to_delete) {
+ assert(refs_ == 1); // only when refs_ == 1 is MemTableListVersion mutable
+ memlist_.remove(m);
+
+ m->MarkFlushed();
+ if (max_write_buffer_size_to_maintain_ > 0 ||
+ max_write_buffer_number_to_maintain_ > 0) {
+ memlist_history_.push_front(m);
+ // Unable to get size of mutable memtable at this point, pass 0 to
+ // TrimHistory as a best effort.
+ TrimHistory(to_delete, 0);
+ } else {
+ UnrefMemTable(to_delete, m);
+ }
+}
+
+// return the total memory usage assuming the oldest flushed memtable is dropped
+size_t MemTableListVersion::ApproximateMemoryUsageExcludingLast() const {
+ size_t total_memtable_size = 0;
+ for (auto& memtable : memlist_) {
+ total_memtable_size += memtable->ApproximateMemoryUsage();
+ }
+ for (auto& memtable : memlist_history_) {
+ total_memtable_size += memtable->ApproximateMemoryUsage();
+ }
+ if (!memlist_history_.empty()) {
+ total_memtable_size -= memlist_history_.back()->ApproximateMemoryUsage();
+ }
+ return total_memtable_size;
+}
+
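+// For example, with max_write_buffer_size_to_maintain_ = 64 MB, usage = 16 MB
+// and ApproximateMemoryUsageExcludingLast() = 56 MB, 56 + 16 >= 64 holds, so
+// TrimHistory() keeps dropping the oldest memtables from memlist_history_.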
+bool MemTableListVersion::MemtableLimitExceeded(size_t usage) {
+ if (max_write_buffer_size_to_maintain_ > 0) {
+ // calculate the total memory usage after dropping the oldest flushed
+ // memtable, compare with max_write_buffer_size_to_maintain_ to decide
+ // whether to trim history
+ return ApproximateMemoryUsageExcludingLast() + usage >=
+ static_cast<size_t>(max_write_buffer_size_to_maintain_);
+ } else if (max_write_buffer_number_to_maintain_ > 0) {
+ return memlist_.size() + memlist_history_.size() >
+ static_cast<size_t>(max_write_buffer_number_to_maintain_);
+ } else {
+ return false;
+ }
+}
+
+// Make sure we don't use up too much space in history
+void MemTableListVersion::TrimHistory(autovector<MemTable*>* to_delete,
+ size_t usage) {
+ while (MemtableLimitExceeded(usage) && !memlist_history_.empty()) {
+ MemTable* x = memlist_history_.back();
+ memlist_history_.pop_back();
+
+ UnrefMemTable(to_delete, x);
+ }
+}
+
+// Returns true if there is at least one memtable on which flush has
+// not yet started.
+bool MemTableList::IsFlushPending() const {
+ if ((flush_requested_ && num_flush_not_started_ > 0) ||
+ (num_flush_not_started_ >= min_write_buffer_number_to_merge_)) {
+ assert(imm_flush_needed.load(std::memory_order_relaxed));
+ return true;
+ }
+ return false;
+}
+
+// Returns the memtables that need to be flushed.
+void MemTableList::PickMemtablesToFlush(const uint64_t* max_memtable_id,
+ autovector<MemTable*>* ret) {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_PICK_MEMTABLES_TO_FLUSH);
+ const auto& memlist = current_->memlist_;
+ bool atomic_flush = false;
+ for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
+ MemTable* m = *it;
+ if (!atomic_flush && m->atomic_flush_seqno_ != kMaxSequenceNumber) {
+ atomic_flush = true;
+ }
+ if (max_memtable_id != nullptr && m->GetID() > *max_memtable_id) {
+ break;
+ }
+ if (!m->flush_in_progress_) {
+ assert(!m->flush_completed_);
+ num_flush_not_started_--;
+ if (num_flush_not_started_ == 0) {
+ imm_flush_needed.store(false, std::memory_order_release);
+ }
+ m->flush_in_progress_ = true; // flushing will start very soon
+ ret->push_back(m);
+ }
+ }
+ if (!atomic_flush || num_flush_not_started_ == 0) {
+ flush_requested_ = false; // start-flush request is complete
+ }
+}
+
+void MemTableList::RollbackMemtableFlush(const autovector<MemTable*>& mems,
+ uint64_t /*file_number*/) {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_MEMTABLE_ROLLBACK);
+ assert(!mems.empty());
+
+ // If the flush was not successful, then just reset state.
+ // Maybe a succeeding attempt to flush will be successful.
+ for (MemTable* m : mems) {
+ assert(m->flush_in_progress_);
+ assert(m->file_number_ == 0);
+
+ m->flush_in_progress_ = false;
+ m->flush_completed_ = false;
+ m->edit_.Clear();
+ num_flush_not_started_++;
+ }
+ imm_flush_needed.store(true, std::memory_order_release);
+}
+
+// Try to record a successful flush in the manifest file. It might just return
+// Status::OK, letting a concurrent flush do the actual recording.
+Status MemTableList::TryInstallMemtableFlushResults(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ const autovector<MemTable*>& mems, LogsWithPrepTracker* prep_tracker,
+ VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number,
+ autovector<MemTable*>* to_delete, Directory* db_directory,
+ LogBuffer* log_buffer,
+ std::list<std::unique_ptr<FlushJobInfo>>* committed_flush_jobs_info) {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS);
+ mu->AssertHeld();
+
+ // Flush was successful
+ // Record the status on the memtable object. Either this call or a call by a
+ // concurrent flush thread will read the status and write it to manifest.
+ for (size_t i = 0; i < mems.size(); ++i) {
+ // All the edits are associated with the first memtable of this batch.
+ assert(i == 0 || mems[i]->GetEdits()->NumEntries() == 0);
+
+ mems[i]->flush_completed_ = true;
+ mems[i]->file_number_ = file_number;
+ }
+
+ // if some other thread is already committing, then return
+ Status s;
+ if (commit_in_progress_) {
+ TEST_SYNC_POINT("MemTableList::TryInstallMemtableFlushResults:InProgress");
+ return s;
+ }
+
+ // Only a single thread can be executing this piece of code
+ commit_in_progress_ = true;
+
+  // Retry until all completed flushes are committed. New flushes can finish
+  // while the current thread is writing the manifest, during which the mutex
+  // is released.
+ while (s.ok()) {
+ auto& memlist = current_->memlist_;
+    // The back is the oldest; if flush_completed_ is not set on it, it means
+    // that we were assigned a more recent memtable. The memtables' flushes must
+    // be recorded in the manifest in order. The concurrent flush thread that is
+    // assigned to flush the oldest memtable will later wake up and do all
+    // the pending writes to the manifest, in order.
+ if (memlist.empty() || !memlist.back()->flush_completed_) {
+ break;
+ }
+ // scan all memtables from the earliest, and commit those
+ // (in that order) that have finished flushing. Memtables
+ // are always committed in the order that they were created.
+ uint64_t batch_file_number = 0;
+ size_t batch_count = 0;
+ autovector<VersionEdit*> edit_list;
+ autovector<MemTable*> memtables_to_flush;
+    // enumerate from the last (earliest) element to see how many batches have finished
+ for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
+ MemTable* m = *it;
+ if (!m->flush_completed_) {
+ break;
+ }
+ if (it == memlist.rbegin() || batch_file_number != m->file_number_) {
+ batch_file_number = m->file_number_;
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] Level-0 commit table #%" PRIu64 " started",
+ cfd->GetName().c_str(), m->file_number_);
+ edit_list.push_back(&m->edit_);
+ memtables_to_flush.push_back(m);
+#ifndef ROCKSDB_LITE
+ std::unique_ptr<FlushJobInfo> info = m->ReleaseFlushJobInfo();
+ if (info != nullptr) {
+ committed_flush_jobs_info->push_back(std::move(info));
+ }
+#else
+ (void)committed_flush_jobs_info;
+#endif // !ROCKSDB_LITE
+ }
+ batch_count++;
+ }
+
+ // TODO(myabandeh): Not sure how batch_count could be 0 here.
+ if (batch_count > 0) {
+ if (vset->db_options()->allow_2pc) {
+ assert(edit_list.size() > 0);
+ // We piggyback the information about the earliest log file to keep in the
+ // manifest entry for the last file flushed.
+ edit_list.back()->SetMinLogNumberToKeep(PrecomputeMinLogNumberToKeep(
+ vset, *cfd, edit_list, memtables_to_flush, prep_tracker));
+ }
+
+ // this can release and reacquire the mutex.
+ s = vset->LogAndApply(cfd, mutable_cf_options, edit_list, mu,
+ db_directory);
+
+ // We will be changing the version in the next code path, so we should
+ // create a new one, since versions are immutable.
+ InstallNewVersion();
+
+ // All the later memtables that have the same filenum
+ // are part of the same batch. They can be committed now.
+ uint64_t mem_id = 1; // how many memtables have been flushed.
+
+ // commit new state only if the column family is NOT dropped.
+ // The reason is as follows (refer to
+ // ColumnFamilyTest.FlushAndDropRaceCondition).
+ // If the column family is dropped, then according to LogAndApply, its
+ // corresponding flush operation is NOT written to the MANIFEST. This
+ // means the DB is not aware of the L0 files generated from the flush.
+ // By committing the new state, we would remove the memtable from the
+ // memtable list. An iterator created on this column family would then not
+ // be able to read full data, since the memtable is removed and the DB is
+ // not aware of the L0 files, leaving MergingIterator unable to build child
+ // iterators. The RocksDB contract requires that an iterator can be created
+ // on a dropped column family, and that we must be able to read full data
+ // as long as the column family handle is not deleted, even if the column
+ // family is dropped.
+ if (s.ok() && !cfd->IsDropped()) { // commit new state
+ while (batch_count-- > 0) {
+ MemTable* m = current_->memlist_.back();
+ ROCKS_LOG_BUFFER(log_buffer, "[%s] Level-0 commit table #%" PRIu64
+ ": memtable #%" PRIu64 " done",
+ cfd->GetName().c_str(), m->file_number_, mem_id);
+ assert(m->file_number_ > 0);
+ current_->Remove(m, to_delete);
+ UpdateCachedValuesFromMemTableListVersion();
+ ResetTrimHistoryNeeded();
+ ++mem_id;
+ }
+ } else {
+ for (auto it = current_->memlist_.rbegin(); batch_count-- > 0; ++it) {
+ MemTable* m = *it;
+ // commit failed. setup state so that we can flush again.
+ ROCKS_LOG_BUFFER(log_buffer, "Level-0 commit table #%" PRIu64
+ ": memtable #%" PRIu64 " failed",
+ m->file_number_, mem_id);
+ m->flush_completed_ = false;
+ m->flush_in_progress_ = false;
+ m->edit_.Clear();
+ num_flush_not_started_++;
+ m->file_number_ = 0;
+ imm_flush_needed.store(true, std::memory_order_release);
+ ++mem_id;
+ }
+ }
+ }
+ }
+ commit_in_progress_ = false;
+ return s;
+}
+
+// New memtables are inserted at the front of the list.
+void MemTableList::Add(MemTable* m, autovector<MemTable*>* to_delete) {
+ assert(static_cast<int>(current_->memlist_.size()) >= num_flush_not_started_);
+ InstallNewVersion();
+ // This method is used to move a mutable memtable into the immutable list.
+ // Since the mutable memtable is already refcounted by the DBImpl, and we do
+ // not unref it when moving it to the immutable list, we do not have to ref
+ // the memtable here; we just take over the reference from the DBImpl.
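+ // A typical caller pattern (mirroring memtable_list_test.cc): the caller
+ // already did m->Ref() while the memtable was mutable and simply hands that
+ // reference over to this list via Add().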
+ current_->Add(m, to_delete);
+ m->MarkImmutable();
+ num_flush_not_started_++;
+ if (num_flush_not_started_ == 1) {
+ imm_flush_needed.store(true, std::memory_order_release);
+ }
+ UpdateCachedValuesFromMemTableListVersion();
+ ResetTrimHistoryNeeded();
+}
+
+void MemTableList::TrimHistory(autovector<MemTable*>* to_delete, size_t usage) {
+ InstallNewVersion();
+ current_->TrimHistory(to_delete, usage);
+ UpdateCachedValuesFromMemTableListVersion();
+ ResetTrimHistoryNeeded();
+}
+
+// Returns an estimate of the number of bytes of data in use.
+size_t MemTableList::ApproximateUnflushedMemTablesMemoryUsage() {
+ size_t total_size = 0;
+ for (auto& memtable : current_->memlist_) {
+ total_size += memtable->ApproximateMemoryUsage();
+ }
+ return total_size;
+}
+
+size_t MemTableList::ApproximateMemoryUsage() { return current_memory_usage_; }
+
+size_t MemTableList::ApproximateMemoryUsageExcludingLast() const {
+ const size_t usage =
+ current_memory_usage_excluding_last_.load(std::memory_order_relaxed);
+ return usage;
+}
+
+bool MemTableList::HasHistory() const {
+ const bool has_history = current_has_history_.load(std::memory_order_relaxed);
+ return has_history;
+}
+
+void MemTableList::UpdateCachedValuesFromMemTableListVersion() {
+ const size_t total_memtable_size =
+ current_->ApproximateMemoryUsageExcludingLast();
+ current_memory_usage_excluding_last_.store(total_memtable_size,
+ std::memory_order_relaxed);
+
+ const bool has_history = current_->HasHistory();
+ current_has_history_.store(has_history, std::memory_order_relaxed);
+}
+
+uint64_t MemTableList::ApproximateOldestKeyTime() const {
+ if (!current_->memlist_.empty()) {
+ return current_->memlist_.back()->ApproximateOldestKeyTime();
+ }
+ return std::numeric_limits<uint64_t>::max();
+}
+
+void MemTableList::InstallNewVersion() {
+ if (current_->refs_ == 1) {
+ // we're the only one using the version, just keep using it
+ } else {
+ // somebody else holds the current version, we need to create new one
+ MemTableListVersion* version = current_;
+ current_ = new MemTableListVersion(&current_memory_usage_, current_);
+ current_->Ref();
+ version->Unref();
+ }
+}
+
+uint64_t MemTableList::PrecomputeMinLogContainingPrepSection(
+ const autovector<MemTable*>& memtables_to_flush) {
+ uint64_t min_log = 0;
+
+ for (auto& m : current_->memlist_) {
+ // Assume the list is very short, so we can live with O(m*n). We can
+ // optimize if performance becomes a problem.
+ bool should_skip = false;
+ for (MemTable* m_to_flush : memtables_to_flush) {
+ if (m == m_to_flush) {
+ should_skip = true;
+ break;
+ }
+ }
+ if (should_skip) {
+ continue;
+ }
+
+ auto log = m->GetMinLogContainingPrepSection();
+
+ if (log > 0 && (min_log == 0 || log < min_log)) {
+ min_log = log;
+ }
+ }
+
+ return min_log;
+}
+
+// Commit a successful atomic flush in the manifest file.
+Status InstallMemtableAtomicFlushResults(
+ const autovector<MemTableList*>* imm_lists,
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+ const autovector<const autovector<MemTable*>*>& mems_list, VersionSet* vset,
+ InstrumentedMutex* mu, const autovector<FileMetaData*>& file_metas,
+ autovector<MemTable*>* to_delete, Directory* db_directory,
+ LogBuffer* log_buffer) {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS);
+ mu->AssertHeld();
+
+ size_t num = mems_list.size();
+ assert(cfds.size() == num);
+ if (imm_lists != nullptr) {
+ assert(imm_lists->size() == num);
+ }
+ for (size_t k = 0; k != num; ++k) {
+#ifndef NDEBUG
+ const auto* imm =
+ (imm_lists == nullptr) ? cfds[k]->imm() : imm_lists->at(k);
+ if (!mems_list[k]->empty()) {
+ assert((*mems_list[k])[0]->GetID() == imm->GetEarliestMemTableID());
+ }
+#endif
+ assert(nullptr != file_metas[k]);
+ for (size_t i = 0; i != mems_list[k]->size(); ++i) {
+ assert(i == 0 || (*mems_list[k])[i]->GetEdits()->NumEntries() == 0);
+ (*mems_list[k])[i]->SetFlushCompleted(true);
+ (*mems_list[k])[i]->SetFileNumber(file_metas[k]->fd.GetNumber());
+ }
+ }
+
+ Status s;
+
+ autovector<autovector<VersionEdit*>> edit_lists;
+ uint32_t num_entries = 0;
+ for (const auto mems : mems_list) {
+ assert(mems != nullptr);
+ autovector<VersionEdit*> edits;
+ assert(!mems->empty());
+ edits.emplace_back((*mems)[0]->GetEdits());
+ ++num_entries;
+ edit_lists.emplace_back(edits);
+ }
+ // Mark the version edits as an atomic group if the number of version edits
+ // exceeds 1.
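+ // Illustrative example: with three column families, num_entries is 3 at
+ // this point and the loop below tags the edits with the remaining counts
+ // 2, 1 and 0, which is what the assert checks.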
+ if (cfds.size() > 1) {
+ for (auto& edits : edit_lists) {
+ assert(edits.size() == 1);
+ edits[0]->MarkAtomicGroup(--num_entries);
+ }
+ assert(0 == num_entries);
+ }
+
+ // this can release and reacquire the mutex.
+ s = vset->LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu,
+ db_directory);
+
+ for (size_t k = 0; k != cfds.size(); ++k) {
+ auto* imm = (imm_lists == nullptr) ? cfds[k]->imm() : imm_lists->at(k);
+ imm->InstallNewVersion();
+ }
+
+ if (s.ok() || s.IsColumnFamilyDropped()) {
+ for (size_t i = 0; i != cfds.size(); ++i) {
+ if (cfds[i]->IsDropped()) {
+ continue;
+ }
+ auto* imm = (imm_lists == nullptr) ? cfds[i]->imm() : imm_lists->at(i);
+ for (auto m : *mems_list[i]) {
+ assert(m->GetFileNumber() > 0);
+ uint64_t mem_id = m->GetID();
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] Level-0 commit table #%" PRIu64
+ ": memtable #%" PRIu64 " done",
+ cfds[i]->GetName().c_str(), m->GetFileNumber(),
+ mem_id);
+ imm->current_->Remove(m, to_delete);
+ imm->UpdateCachedValuesFromMemTableListVersion();
+ imm->ResetTrimHistoryNeeded();
+ }
+ }
+ } else {
+ for (size_t i = 0; i != cfds.size(); ++i) {
+ auto* imm = (imm_lists == nullptr) ? cfds[i]->imm() : imm_lists->at(i);
+ for (auto m : *mems_list[i]) {
+ uint64_t mem_id = m->GetID();
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] Level-0 commit table #%" PRIu64
+ ": memtable #%" PRIu64 " failed",
+ cfds[i]->GetName().c_str(), m->GetFileNumber(),
+ mem_id);
+ m->SetFlushCompleted(false);
+ m->SetFlushInProgress(false);
+ m->GetEdits()->Clear();
+ m->SetFileNumber(0);
+ imm->num_flush_not_started_++;
+ }
+ imm->imm_flush_needed.store(true, std::memory_order_release);
+ }
+ }
+
+ return s;
+}
+
+void MemTableList::RemoveOldMemTables(uint64_t log_number,
+ autovector<MemTable*>* to_delete) {
+ assert(to_delete != nullptr);
+ InstallNewVersion();
+ auto& memlist = current_->memlist_;
+ autovector<MemTable*> old_memtables;
+ for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
+ MemTable* mem = *it;
+ if (mem->GetNextLogNumber() > log_number) {
+ break;
+ }
+ old_memtables.push_back(mem);
+ }
+
+ for (auto it = old_memtables.begin(); it != old_memtables.end(); ++it) {
+ MemTable* mem = *it;
+ current_->Remove(mem, to_delete);
+ --num_flush_not_started_;
+ if (0 == num_flush_not_started_) {
+ imm_flush_needed.store(false, std::memory_order_release);
+ }
+ }
+
+ UpdateCachedValuesFromMemTableListVersion();
+ ResetTrimHistoryNeeded();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/memtable_list.h b/src/rocksdb/db/memtable_list.h
new file mode 100644
index 000000000..a6acf6a32
--- /dev/null
+++ b/src/rocksdb/db/memtable_list.h
@@ -0,0 +1,422 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <deque>
+#include <limits>
+#include <list>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/logs_with_prep_tracker.h"
+#include "db/memtable.h"
+#include "db/range_del_aggregator.h"
+#include "file/filename.h"
+#include "logging/log_buffer.h"
+#include "monitoring/instrumented_mutex.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyData;
+class InternalKeyComparator;
+class InstrumentedMutex;
+class MergeIteratorBuilder;
+class MemTableList;
+
+struct FlushJobInfo;
+
+// Keeps a list of immutable memtables. The list is immutable if the
+// refcount is bigger than one. It is used as state for the Get() and
+// Iterator code paths.
+//
+// This class is not thread-safe. External synchronization is required
+// (such as holding the db mutex or being on the write thread).
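+//
+// A minimal read-path sketch (illustrative only, mirroring
+// memtable_list_test.cc; `list` is a MemTableList and `snapshot_seq` a
+// placeholder sequence number):
+//
+//   std::string value;
+//   Status s;
+//   MergeContext merge_context;
+//   SequenceNumber max_covering_tombstone_seq = 0;
+//   bool found = list.current()->Get(LookupKey("key1", snapshot_seq), &value,
+//                                    &s, &merge_context,
+//                                    &max_covering_tombstone_seq,
+//                                    ReadOptions());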
+class MemTableListVersion {
+ public:
+ explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
+ MemTableListVersion* old = nullptr);
+ explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
+ int max_write_buffer_number_to_maintain,
+ int64_t max_write_buffer_size_to_maintain);
+
+ void Ref();
+ void Unref(autovector<MemTable*>* to_delete = nullptr);
+
+ // Search all the memtables starting from the most recent one.
+ // Return the most recent value found, if any.
+ //
+ // If any operation was found for this key, its most recent sequence number
+ // will be stored in *seq on success (regardless of whether true/false is
+ // returned). Otherwise, *seq will be set to kMaxSequenceNumber.
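+ //
+ // For example, if the key only has a delete marker at sequence number 10,
+ // Get() returns true with *s set to NotFound and *seq set to 10.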
+ bool Get(const LookupKey& key, std::string* value, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
+ const ReadOptions& read_opts, ReadCallback* callback = nullptr,
+ bool* is_blob_index = nullptr);
+
+ bool Get(const LookupKey& key, std::string* value, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ const ReadOptions& read_opts, ReadCallback* callback = nullptr,
+ bool* is_blob_index = nullptr) {
+ SequenceNumber seq;
+ return Get(key, value, s, merge_context, max_covering_tombstone_seq, &seq,
+ read_opts, callback, is_blob_index);
+ }
+
+ void MultiGet(const ReadOptions& read_options, MultiGetRange* range,
+ ReadCallback* callback, bool* is_blob);
+
+ // Returns all the merge operands corresponding to the key by searching all
+ // memtables starting from the most recent one.
+ bool GetMergeOperands(const LookupKey& key, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ const ReadOptions& read_opts);
+
+ // Similar to Get(), but searches the Memtable history of memtables that
+ // have already been flushed. Should only be used from in-memory only
+ // queries (such as Transaction validation) as the history may contain
+ // writes that are also present in the SST files.
+ bool GetFromHistory(const LookupKey& key, std::string* value, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ SequenceNumber* seq, const ReadOptions& read_opts,
+ bool* is_blob_index = nullptr);
+ bool GetFromHistory(const LookupKey& key, std::string* value, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ const ReadOptions& read_opts,
+ bool* is_blob_index = nullptr) {
+ SequenceNumber seq;
+ return GetFromHistory(key, value, s, merge_context,
+ max_covering_tombstone_seq, &seq, read_opts,
+ is_blob_index);
+ }
+
+ Status AddRangeTombstoneIterators(const ReadOptions& read_opts, Arena* arena,
+ RangeDelAggregator* range_del_agg);
+
+ void AddIterators(const ReadOptions& options,
+ std::vector<InternalIterator*>* iterator_list,
+ Arena* arena);
+
+ void AddIterators(const ReadOptions& options,
+ MergeIteratorBuilder* merge_iter_builder);
+
+ uint64_t GetTotalNumEntries() const;
+
+ uint64_t GetTotalNumDeletes() const;
+
+ MemTable::MemTableStats ApproximateStats(const Slice& start_ikey,
+ const Slice& end_ikey);
+
+ // Returns the value of MemTable::GetEarliestSequenceNumber() on the most
+ // recent MemTable in this list or kMaxSequenceNumber if the list is empty.
+ // If include_history=true, will also search Memtables in MemTableList
+ // History.
+ SequenceNumber GetEarliestSequenceNumber(bool include_history = false) const;
+
+ private:
+ friend class MemTableList;
+
+ friend Status InstallMemtableAtomicFlushResults(
+ const autovector<MemTableList*>* imm_lists,
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+ const autovector<const autovector<MemTable*>*>& mems_list,
+ VersionSet* vset, InstrumentedMutex* mu,
+ const autovector<FileMetaData*>& file_meta,
+ autovector<MemTable*>* to_delete, Directory* db_directory,
+ LogBuffer* log_buffer);
+
+ // REQUIRE: m is an immutable memtable
+ void Add(MemTable* m, autovector<MemTable*>* to_delete);
+ // REQUIRE: m is an immutable memtable
+ void Remove(MemTable* m, autovector<MemTable*>* to_delete);
+
+ void TrimHistory(autovector<MemTable*>* to_delete, size_t usage);
+
+ bool GetFromList(std::list<MemTable*>* list, const LookupKey& key,
+ std::string* value, Status* s, MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ SequenceNumber* seq, const ReadOptions& read_opts,
+ ReadCallback* callback = nullptr,
+ bool* is_blob_index = nullptr);
+
+ void AddMemTable(MemTable* m);
+
+ void UnrefMemTable(autovector<MemTable*>* to_delete, MemTable* m);
+
+ // Calculate the total amount of memory used by memlist_ and memlist_history_
+ // excluding the last MemTable in memlist_history_. The reason for excluding
+ // the last MemTable is to see if dropping the last MemTable will keep total
+ // memory usage above or equal to max_write_buffer_size_to_maintain_
+ size_t ApproximateMemoryUsageExcludingLast() const;
+
+ // Whether this version contains flushed memtables that are only kept around
+ // for transaction conflict checking.
+ bool HasHistory() const { return !memlist_history_.empty(); }
+
+ bool MemtableLimitExceeded(size_t usage);
+
+ // Immutable MemTables that have not yet been flushed.
+ std::list<MemTable*> memlist_;
+
+ // MemTables that have already been flushed
+ // (used during Transaction validation)
+ std::list<MemTable*> memlist_history_;
+
+ // Maximum number of MemTables to keep in memory (including both flushed
+ // and not-yet-flushed tables).
+ const int max_write_buffer_number_to_maintain_;
+ // Maximum size of MemTables to keep in memory (including both flushed
+ // and not-yet-flushed tables).
+ const int64_t max_write_buffer_size_to_maintain_;
+
+ int refs_ = 0;
+
+ size_t* parent_memtable_list_memory_usage_;
+};
+
+// This class stores references to all the immutable memtables.
+// The memtables are flushed to L0 as soon as possible and in
+// any order. If there is more than one immutable memtable, their
+// flushes can occur concurrently. However, they are 'committed'
+// to the manifest in FIFO order to maintain correctness and
+// recoverability from a crash.
+//
+// Other than imm_flush_needed and imm_trim_needed, this class is not
+// thread-safe and requires external synchronization (such as holding the db
+// mutex or being on the write thread.)
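+//
+// A rough lifecycle sketch (illustrative only; memtable_list_test.cc has
+// complete examples, and `imm` is a placeholder for a memtable the caller
+// already Ref()'d):
+//
+//   autovector<MemTable*> to_delete, to_flush;
+//   list.Add(imm, &to_delete);                      // hand over the memtable
+//   list.PickMemtablesToFlush(nullptr, &to_flush);  // a flush job picks work
+//   // ... the flush job writes the picked memtables to an L0 file, then
+//   // either TryInstallMemtableFlushResults() commits the result to the
+//   // MANIFEST or RollbackMemtableFlush() returns them to the pending state.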
+class MemTableList {
+ public:
+ // A list of memtables.
+ explicit MemTableList(int min_write_buffer_number_to_merge,
+ int max_write_buffer_number_to_maintain,
+ int64_t max_write_buffer_size_to_maintain)
+ : imm_flush_needed(false),
+ imm_trim_needed(false),
+ min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge),
+ current_(new MemTableListVersion(&current_memory_usage_,
+ max_write_buffer_number_to_maintain,
+ max_write_buffer_size_to_maintain)),
+ num_flush_not_started_(0),
+ commit_in_progress_(false),
+ flush_requested_(false),
+ current_memory_usage_(0),
+ current_memory_usage_excluding_last_(0),
+ current_has_history_(false) {
+ current_->Ref();
+ }
+
+ // Should not delete MemTableList without making sure MemTableList::current()
+ // is Unref()'d.
+ ~MemTableList() {}
+
+ MemTableListVersion* current() const { return current_; }
+
+ // So that background threads can detect whether there is anything more to
+ // start flushing.
+ std::atomic<bool> imm_flush_needed;
+
+ std::atomic<bool> imm_trim_needed;
+
+ // Returns the total number of memtables in the list that haven't yet
+ // been flushed and logged.
+ int NumNotFlushed() const;
+
+ // Returns total number of memtables in the list that have been
+ // completely flushed and logged.
+ int NumFlushed() const;
+
+ // Returns true if there is at least one memtable on which flush has
+ // not yet started.
+ bool IsFlushPending() const;
+
+ // Returns the earliest memtables that need to be flushed. The returned
+ // memtables are guaranteed to be in ascending order of creation time.
+ void PickMemtablesToFlush(const uint64_t* max_memtable_id,
+ autovector<MemTable*>* mems);
+
+ // Reset the status of the given memtables back to the pending state so
+ // that they can get picked up again in the next round of flush.
+ void RollbackMemtableFlush(const autovector<MemTable*>& mems,
+ uint64_t file_number);
+
+ // Try to commit a successful flush in the manifest file. It might just
+ // return Status::OK, letting a concurrent flush do the actual recording.
+ Status TryInstallMemtableFlushResults(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ const autovector<MemTable*>& m, LogsWithPrepTracker* prep_tracker,
+ VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number,
+ autovector<MemTable*>* to_delete, Directory* db_directory,
+ LogBuffer* log_buffer,
+ std::list<std::unique_ptr<FlushJobInfo>>* committed_flush_jobs_info);
+
+ // New memtables are inserted at the front of the list.
+ // Takes ownership of the reference held on *m by the caller of Add().
+ void Add(MemTable* m, autovector<MemTable*>* to_delete);
+
+ // Returns an estimate of the number of bytes of data in use.
+ size_t ApproximateMemoryUsage();
+
+ // Returns the cached current_memory_usage_excluding_last_ value.
+ size_t ApproximateMemoryUsageExcludingLast() const;
+
+ // Returns the cached current_has_history_ value.
+ bool HasHistory() const;
+
+ // Updates current_memory_usage_excluding_last_ and current_has_history_
+ // from MemTableListVersion. Must be called whenever InstallNewVersion is
+ // called.
+ void UpdateCachedValuesFromMemTableListVersion();
+
+ // `usage` is the current size of the mutable Memtable. When
+ // max_write_buffer_size_to_maintain is used, the total size of the mutable
+ // and immutable memtables is checked against it to decide whether to trim
+ // the memtable list.
+ void TrimHistory(autovector<MemTable*>* to_delete, size_t usage);
+
+ // Returns an estimate of the number of bytes of data used by
+ // the unflushed mem-tables.
+ size_t ApproximateUnflushedMemTablesMemoryUsage();
+
+ // Returns an estimate of the timestamp of the earliest key.
+ uint64_t ApproximateOldestKeyTime() const;
+
+ // Request a flush of all existing memtables to storage. This will
+ // cause future calls to IsFlushPending() to return true if this list is
+ // non-empty (regardless of the min_write_buffer_number_to_merge
+ // parameter). This flush request will persist until the next time
+ // PickMemtablesToFlush() is called.
+ void FlushRequested() { flush_requested_ = true; }
+
+ bool HasFlushRequested() { return flush_requested_; }
+
+ // Returns true if a history trim should be scheduled and the caller should
+ // be the one to schedule it.
+ bool MarkTrimHistoryNeeded() {
+ auto expected = false;
+ return imm_trim_needed.compare_exchange_strong(
+ expected, true, std::memory_order_relaxed, std::memory_order_relaxed);
+ }
+
+ void ResetTrimHistoryNeeded() {
+ auto expected = true;
+ imm_trim_needed.compare_exchange_strong(
+ expected, false, std::memory_order_relaxed, std::memory_order_relaxed);
+ }
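+ // Note: because both methods use compare_exchange_strong, only one of
+ // several concurrent MarkTrimHistoryNeeded() callers observes true and
+ // schedules the trim; ResetTrimHistoryNeeded() clears the flag so a later
+ // trim can be scheduled again.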
+
+ // Copying allowed
+ // MemTableList(const MemTableList&);
+ // void operator=(const MemTableList&);
+
+ size_t* current_memory_usage() { return &current_memory_usage_; }
+
+ // Returns the min log containing the prep section after the memtables
+ // listed in `memtables_to_flush` are flushed and their status is persisted
+ // in the manifest.
+ uint64_t PrecomputeMinLogContainingPrepSection(
+ const autovector<MemTable*>& memtables_to_flush);
+
+ uint64_t GetEarliestMemTableID() const {
+ auto& memlist = current_->memlist_;
+ if (memlist.empty()) {
+ return std::numeric_limits<uint64_t>::max();
+ }
+ return memlist.back()->GetID();
+ }
+
+ uint64_t GetLatestMemTableID() const {
+ auto& memlist = current_->memlist_;
+ if (memlist.empty()) {
+ return 0;
+ }
+ return memlist.front()->GetID();
+ }
+
+ void AssignAtomicFlushSeq(const SequenceNumber& seq) {
+ const auto& memlist = current_->memlist_;
+ // Scan the memtable list from new to old
+ for (auto it = memlist.begin(); it != memlist.end(); ++it) {
+ MemTable* mem = *it;
+ if (mem->atomic_flush_seqno_ == kMaxSequenceNumber) {
+ mem->atomic_flush_seqno_ = seq;
+ } else {
+ // Earlier memtables must have been assigned an atomic flush seq; no
+ // need to continue scanning.
+ break;
+ }
+ }
+ }
+
+ // Used only by DBImplSecondary during log replay.
+ // Remove memtables whose data were written before the WAL with log_number
+ // was created, i.e. mem->GetNextLogNumber() <= log_number. The memtables are
+ // not freed, but put into a vector for future deref and reclamation.
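+ // For example, with log_number == 5, every memtable in the list whose
+ // GetNextLogNumber() is <= 5 is removed.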
+ void RemoveOldMemTables(uint64_t log_number,
+ autovector<MemTable*>* to_delete);
+
+ private:
+ friend Status InstallMemtableAtomicFlushResults(
+ const autovector<MemTableList*>* imm_lists,
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+ const autovector<const autovector<MemTable*>*>& mems_list,
+ VersionSet* vset, InstrumentedMutex* mu,
+ const autovector<FileMetaData*>& file_meta,
+ autovector<MemTable*>* to_delete, Directory* db_directory,
+ LogBuffer* log_buffer);
+
+ // DB mutex held
+ void InstallNewVersion();
+
+ const int min_write_buffer_number_to_merge_;
+
+ MemTableListVersion* current_;
+
+ // the number of elements that still need flushing
+ int num_flush_not_started_;
+
+ // committing in progress
+ bool commit_in_progress_;
+
+ // Requested a flush of memtables to storage. It's possible to request that
+ // a subset of memtables be flushed.
+ bool flush_requested_;
+
+ // The current memory usage.
+ size_t current_memory_usage_;
+
+ // Cached value of current_->ApproximateMemoryUsageExcludingLast().
+ std::atomic<size_t> current_memory_usage_excluding_last_;
+
+ // Cached value of current_->HasHistory().
+ std::atomic<bool> current_has_history_;
+};
+
+// Installs memtable atomic flush results.
+// In most cases, imm_lists is nullptr, and the function simply uses the
+// immutable memtable lists associated with the cfds. Some unit tests install
+// flush results for external immutable memtable lists other than the
+// cfds' own immutable memtable lists, e.g. MemTableListTest. In that case,
+// the imm_lists parameter is not nullptr.
+extern Status InstallMemtableAtomicFlushResults(
+ const autovector<MemTableList*>* imm_lists,
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+ const autovector<const autovector<MemTable*>*>& mems_list, VersionSet* vset,
+ InstrumentedMutex* mu, const autovector<FileMetaData*>& file_meta,
+ autovector<MemTable*>* to_delete, Directory* db_directory,
+ LogBuffer* log_buffer);
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/memtable_list_test.cc b/src/rocksdb/db/memtable_list_test.cc
new file mode 100644
index 000000000..a92bc6c79
--- /dev/null
+++ b/src/rocksdb/db/memtable_list_test.cc
@@ -0,0 +1,922 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/memtable_list.h"
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "db/merge_context.h"
+#include "db/version_set.h"
+#include "db/write_controller.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MemTableListTest : public testing::Test {
+ public:
+ std::string dbname;
+ DB* db;
+ Options options;
+ std::vector<ColumnFamilyHandle*> handles;
+ std::atomic<uint64_t> file_number;
+
+ MemTableListTest() : db(nullptr), file_number(1) {
+ dbname = test::PerThreadDBPath("memtable_list_test");
+ options.create_if_missing = true;
+ DestroyDB(dbname, options);
+ }
+
+ // Create a test db if not yet created
+ void CreateDB() {
+ if (db == nullptr) {
+ options.create_if_missing = true;
+ DestroyDB(dbname, options);
+ // Open DB only with default column family
+ ColumnFamilyOptions cf_options;
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, cf_options);
+ Status s = DB::Open(options, dbname, cf_descs, &handles, &db);
+ EXPECT_OK(s);
+
+ ColumnFamilyOptions cf_opt1, cf_opt2;
+ cf_opt1.cf_paths.emplace_back(dbname + "_one_1",
+ std::numeric_limits<uint64_t>::max());
+ cf_opt2.cf_paths.emplace_back(dbname + "_two_1",
+ std::numeric_limits<uint64_t>::max());
+ int sz = static_cast<int>(handles.size());
+ handles.resize(sz + 2);
+ s = db->CreateColumnFamily(cf_opt1, "one", &handles[1]);
+ EXPECT_OK(s);
+ s = db->CreateColumnFamily(cf_opt2, "two", &handles[2]);
+ EXPECT_OK(s);
+
+ cf_descs.emplace_back("one", cf_options);
+ cf_descs.emplace_back("two", cf_options);
+ }
+ }
+
+ ~MemTableListTest() override {
+ if (db) {
+ std::vector<ColumnFamilyDescriptor> cf_descs(handles.size());
+ for (int i = 0; i != static_cast<int>(handles.size()); ++i) {
+ handles[i]->GetDescriptor(&cf_descs[i]);
+ }
+ for (auto h : handles) {
+ if (h) {
+ db->DestroyColumnFamilyHandle(h);
+ }
+ }
+ handles.clear();
+ delete db;
+ db = nullptr;
+ DestroyDB(dbname, options, cf_descs);
+ }
+ }
+
+ // Calls MemTableList::TryInstallMemtableFlushResults() and sets up all
+ // structures needed to call this function.
+ Status Mock_InstallMemtableFlushResults(
+ MemTableList* list, const MutableCFOptions& mutable_cf_options,
+ const autovector<MemTable*>& m, autovector<MemTable*>* to_delete) {
+ // Create a mock Logger
+ test::NullLogger logger;
+ LogBuffer log_buffer(DEBUG_LEVEL, &logger);
+
+ CreateDB();
+ // Create a mock VersionSet
+ DBOptions db_options;
+ db_options.file_system = FileSystem::Default();
+ ImmutableDBOptions immutable_db_options(db_options);
+ EnvOptions env_options;
+ std::shared_ptr<Cache> table_cache(NewLRUCache(50000, 16));
+ WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size);
+ WriteController write_controller(10000000u);
+
+ VersionSet versions(dbname, &immutable_db_options, env_options,
+ table_cache.get(), &write_buffer_manager,
+ &write_controller, /*block_cache_tracer=*/nullptr);
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions());
+ cf_descs.emplace_back("one", ColumnFamilyOptions());
+ cf_descs.emplace_back("two", ColumnFamilyOptions());
+
+ EXPECT_OK(versions.Recover(cf_descs, false));
+
+ // Create mock default ColumnFamilyData
+ auto column_family_set = versions.GetColumnFamilySet();
+ LogsWithPrepTracker dummy_prep_tracker;
+ auto cfd = column_family_set->GetDefault();
+ EXPECT_TRUE(nullptr != cfd);
+ uint64_t file_num = file_number.fetch_add(1);
+ // Create dummy mutex.
+ InstrumentedMutex mutex;
+ InstrumentedMutexLock l(&mutex);
+ std::list<std::unique_ptr<FlushJobInfo>> flush_jobs_info;
+ Status s = list->TryInstallMemtableFlushResults(
+ cfd, mutable_cf_options, m, &dummy_prep_tracker, &versions, &mutex,
+ file_num, to_delete, nullptr, &log_buffer, &flush_jobs_info);
+ return s;
+ }
+
+ // Calls InstallMemtableAtomicFlushResults() and sets up all
+ // structures needed to call this function.
+ Status Mock_InstallMemtableAtomicFlushResults(
+ autovector<MemTableList*>& lists, const autovector<uint32_t>& cf_ids,
+ const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+ const autovector<const autovector<MemTable*>*>& mems_list,
+ autovector<MemTable*>* to_delete) {
+ // Create a mock Logger
+ test::NullLogger logger;
+ LogBuffer log_buffer(DEBUG_LEVEL, &logger);
+
+ CreateDB();
+ // Create a mock VersionSet
+ DBOptions db_options;
+ db_options.file_system.reset(new LegacyFileSystemWrapper(db_options.env));
+
+ ImmutableDBOptions immutable_db_options(db_options);
+ EnvOptions env_options;
+ std::shared_ptr<Cache> table_cache(NewLRUCache(50000, 16));
+ WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size);
+ WriteController write_controller(10000000u);
+
+ VersionSet versions(dbname, &immutable_db_options, env_options,
+ table_cache.get(), &write_buffer_manager,
+ &write_controller, /*block_cache_tracer=*/nullptr);
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions());
+ cf_descs.emplace_back("one", ColumnFamilyOptions());
+ cf_descs.emplace_back("two", ColumnFamilyOptions());
+ EXPECT_OK(versions.Recover(cf_descs, false));
+
+ // Create mock default ColumnFamilyData
+
+ auto column_family_set = versions.GetColumnFamilySet();
+
+ LogsWithPrepTracker dummy_prep_tracker;
+ autovector<ColumnFamilyData*> cfds;
+ for (int i = 0; i != static_cast<int>(cf_ids.size()); ++i) {
+ cfds.emplace_back(column_family_set->GetColumnFamily(cf_ids[i]));
+ EXPECT_NE(nullptr, cfds[i]);
+ }
+ std::vector<FileMetaData> file_metas;
+ file_metas.reserve(cf_ids.size());
+ for (size_t i = 0; i != cf_ids.size(); ++i) {
+ FileMetaData meta;
+ uint64_t file_num = file_number.fetch_add(1);
+ meta.fd = FileDescriptor(file_num, 0, 0);
+ file_metas.emplace_back(meta);
+ }
+ autovector<FileMetaData*> file_meta_ptrs;
+ for (auto& meta : file_metas) {
+ file_meta_ptrs.push_back(&meta);
+ }
+ InstrumentedMutex mutex;
+ InstrumentedMutexLock l(&mutex);
+ return InstallMemtableAtomicFlushResults(
+ &lists, cfds, mutable_cf_options_list, mems_list, &versions, &mutex,
+ file_meta_ptrs, to_delete, nullptr, &log_buffer);
+ }
+};
+
+TEST_F(MemTableListTest, Empty) {
+ // Create an empty MemTableList and validate basic functions.
+ MemTableList list(1, 0, 0);
+
+ ASSERT_EQ(0, list.NumNotFlushed());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+ ASSERT_FALSE(list.IsFlushPending());
+
+ autovector<MemTable*> mems;
+ list.PickMemtablesToFlush(nullptr /* memtable_id */, &mems);
+ ASSERT_EQ(0, mems.size());
+
+ autovector<MemTable*> to_delete;
+ list.current()->Unref(&to_delete);
+ ASSERT_EQ(0, to_delete.size());
+}
+
+TEST_F(MemTableListTest, GetTest) {
+ // Create MemTableList
+ int min_write_buffer_number_to_merge = 2;
+ int max_write_buffer_number_to_maintain = 0;
+ int64_t max_write_buffer_size_to_maintain = 0;
+ MemTableList list(min_write_buffer_number_to_merge,
+ max_write_buffer_number_to_maintain,
+ max_write_buffer_size_to_maintain);
+
+ SequenceNumber seq = 1;
+ std::string value;
+ Status s;
+ MergeContext merge_context;
+ InternalKeyComparator ikey_cmp(options.comparator);
+ SequenceNumber max_covering_tombstone_seq = 0;
+ autovector<MemTable*> to_delete;
+
+ LookupKey lkey("key1", seq);
+ bool found = list.current()->Get(lkey, &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ // Create a MemTable
+ InternalKeyComparator cmp(BytewiseComparator());
+ auto factory = std::make_shared<SkipListFactory>();
+ options.memtable_factory = factory;
+ ImmutableCFOptions ioptions(options);
+
+ WriteBufferManager wb(options.db_write_buffer_size);
+ MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ mem->Ref();
+
+ // Write some keys to this memtable.
+ mem->Add(++seq, kTypeDeletion, "key1", "");
+ mem->Add(++seq, kTypeValue, "key2", "value2");
+ mem->Add(++seq, kTypeValue, "key1", "value1");
+ mem->Add(++seq, kTypeValue, "key2", "value2.2");
+
+ // Fetch the newly written keys
+ merge_context.Clear();
+ found = mem->Get(LookupKey("key1", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(s.ok() && found);
+ ASSERT_EQ(value, "value1");
+
+ merge_context.Clear();
+ found = mem->Get(LookupKey("key1", 2), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ // MemTable reports that this key was deleted, i.e. *not* found (at this sequence#)
+ ASSERT_TRUE(found && s.IsNotFound());
+
+ merge_context.Clear();
+ found = mem->Get(LookupKey("key2", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(s.ok() && found);
+ ASSERT_EQ(value, "value2.2");
+
+ ASSERT_EQ(4, mem->num_entries());
+ ASSERT_EQ(1, mem->num_deletes());
+
+ // Add memtable to list
+ list.Add(mem, &to_delete);
+
+ SequenceNumber saved_seq = seq;
+
+ // Create another memtable and write some keys to it
+ WriteBufferManager wb2(options.db_write_buffer_size);
+ MemTable* mem2 = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb2,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ mem2->Ref();
+
+ mem2->Add(++seq, kTypeDeletion, "key1", "");
+ mem2->Add(++seq, kTypeValue, "key2", "value2.3");
+
+ // Add second memtable to list
+ list.Add(mem2, &to_delete);
+
+ // Fetch keys via MemTableList
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key1", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(found && s.IsNotFound());
+
+ merge_context.Clear();
+ found = list.current()->Get(LookupKey("key1", saved_seq), &value, &s,
+ &merge_context, &max_covering_tombstone_seq,
+ ReadOptions());
+ ASSERT_TRUE(s.ok() && found);
+ ASSERT_EQ("value1", value);
+
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(s.ok() && found);
+ ASSERT_EQ(value, "value2.3");
+
+ merge_context.Clear();
+ found = list.current()->Get(LookupKey("key2", 1), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ ASSERT_EQ(2, list.NumNotFlushed());
+
+ list.current()->Unref(&to_delete);
+ for (MemTable* m : to_delete) {
+ delete m;
+ }
+}
+
+TEST_F(MemTableListTest, GetFromHistoryTest) {
+ // Create MemTableList
+ int min_write_buffer_number_to_merge = 2;
+ int max_write_buffer_number_to_maintain = 2;
+ int64_t max_write_buffer_size_to_maintain = 2000;
+ MemTableList list(min_write_buffer_number_to_merge,
+ max_write_buffer_number_to_maintain,
+ max_write_buffer_size_to_maintain);
+
+ SequenceNumber seq = 1;
+ std::string value;
+ Status s;
+ MergeContext merge_context;
+ InternalKeyComparator ikey_cmp(options.comparator);
+ SequenceNumber max_covering_tombstone_seq = 0;
+ autovector<MemTable*> to_delete;
+
+ LookupKey lkey("key1", seq);
+ bool found = list.current()->Get(lkey, &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ // Create a MemTable
+ InternalKeyComparator cmp(BytewiseComparator());
+ auto factory = std::make_shared<SkipListFactory>();
+ options.memtable_factory = factory;
+ ImmutableCFOptions ioptions(options);
+
+ WriteBufferManager wb(options.db_write_buffer_size);
+ MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ mem->Ref();
+
+ // Write some keys to this memtable.
+ mem->Add(++seq, kTypeDeletion, "key1", "");
+ mem->Add(++seq, kTypeValue, "key2", "value2");
+ mem->Add(++seq, kTypeValue, "key2", "value2.2");
+
+ // Fetch the newly written keys
+ merge_context.Clear();
+ found = mem->Get(LookupKey("key1", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ // MemTable reports that this key was deleted, i.e. *not* found (at this sequence#)
+ ASSERT_TRUE(found && s.IsNotFound());
+
+ merge_context.Clear();
+ found = mem->Get(LookupKey("key2", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(s.ok() && found);
+ ASSERT_EQ(value, "value2.2");
+
+ // Add memtable to list
+ list.Add(mem, &to_delete);
+ ASSERT_EQ(0, to_delete.size());
+
+ // Fetch keys via MemTableList
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key1", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(found && s.IsNotFound());
+
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(s.ok() && found);
+ ASSERT_EQ("value2.2", value);
+
+ // Flush this memtable from the list.
+ // (It will then be a part of the memtable history).
+ autovector<MemTable*> to_flush;
+ list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush);
+ ASSERT_EQ(1, to_flush.size());
+
+ MutableCFOptions mutable_cf_options(options);
+ s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush,
+ &to_delete);
+ ASSERT_OK(s);
+ ASSERT_EQ(0, list.NumNotFlushed());
+ ASSERT_EQ(1, list.NumFlushed());
+ ASSERT_EQ(0, to_delete.size());
+
+ // Verify keys are no longer in MemTableList
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key1", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ // Verify keys are present in history
+ merge_context.Clear();
+ found = list.current()->GetFromHistory(
+ LookupKey("key1", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(found && s.IsNotFound());
+
+ merge_context.Clear();
+ found = list.current()->GetFromHistory(
+ LookupKey("key2", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(found);
+ ASSERT_EQ("value2.2", value);
+
+ // Create another memtable and write some keys to it
+ WriteBufferManager wb2(options.db_write_buffer_size);
+ MemTable* mem2 = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb2,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ mem2->Ref();
+
+ mem2->Add(++seq, kTypeDeletion, "key1", "");
+ mem2->Add(++seq, kTypeValue, "key3", "value3");
+
+ // Add second memtable to list
+ list.Add(mem2, &to_delete);
+ ASSERT_EQ(0, to_delete.size());
+
+ to_flush.clear();
+ list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush);
+ ASSERT_EQ(1, to_flush.size());
+
+ // Flush second memtable
+ s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush,
+ &to_delete);
+ ASSERT_OK(s);
+ ASSERT_EQ(0, list.NumNotFlushed());
+ ASSERT_EQ(2, list.NumFlushed());
+ ASSERT_EQ(0, to_delete.size());
+
+ // Add a third memtable to push the first memtable out of the history
+ WriteBufferManager wb3(options.db_write_buffer_size);
+ MemTable* mem3 = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb3,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ mem3->Ref();
+ list.Add(mem3, &to_delete);
+ ASSERT_EQ(1, list.NumNotFlushed());
+ ASSERT_EQ(1, list.NumFlushed());
+ ASSERT_EQ(1, to_delete.size());
+
+ // Verify keys are no longer in MemTableList
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key1", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key3", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ // Verify that the second memtable's keys are in the history
+ merge_context.Clear();
+ found = list.current()->GetFromHistory(
+ LookupKey("key1", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(found && s.IsNotFound());
+
+ merge_context.Clear();
+ found = list.current()->GetFromHistory(
+ LookupKey("key3", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(found);
+ ASSERT_EQ("value3", value);
+
+ // Verify that key2 from the first memtable is no longer in the history
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ // Cleanup
+ list.current()->Unref(&to_delete);
+ ASSERT_EQ(3, to_delete.size());
+ for (MemTable* m : to_delete) {
+ delete m;
+ }
+}
+
+TEST_F(MemTableListTest, FlushPendingTest) {
+ const int num_tables = 6;
+ SequenceNumber seq = 1;
+ Status s;
+
+ auto factory = std::make_shared<SkipListFactory>();
+ options.memtable_factory = factory;
+ ImmutableCFOptions ioptions(options);
+ InternalKeyComparator cmp(BytewiseComparator());
+ WriteBufferManager wb(options.db_write_buffer_size);
+ autovector<MemTable*> to_delete;
+
+ // Create MemTableList
+ int min_write_buffer_number_to_merge = 3;
+ int max_write_buffer_number_to_maintain = 7;
+ int64_t max_write_buffer_size_to_maintain =
+ 7 * static_cast<int>(options.write_buffer_size);
+ MemTableList list(min_write_buffer_number_to_merge,
+ max_write_buffer_number_to_maintain,
+ max_write_buffer_size_to_maintain);
+
+ // Create some MemTables
+ uint64_t memtable_id = 0;
+ std::vector<MemTable*> tables;
+ MutableCFOptions mutable_cf_options(options);
+ for (int i = 0; i < num_tables; i++) {
+ MemTable* mem = new MemTable(cmp, ioptions, mutable_cf_options, &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ mem->SetID(memtable_id++);
+ mem->Ref();
+
+ std::string value;
+ MergeContext merge_context;
+
+ mem->Add(++seq, kTypeValue, "key1", ToString(i));
+ mem->Add(++seq, kTypeValue, "keyN" + ToString(i), "valueN");
+ mem->Add(++seq, kTypeValue, "keyX" + ToString(i), "value");
+ mem->Add(++seq, kTypeValue, "keyM" + ToString(i), "valueM");
+ mem->Add(++seq, kTypeDeletion, "keyX" + ToString(i), "");
+
+ tables.push_back(mem);
+ }
+
+ // Nothing to flush
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+ autovector<MemTable*> to_flush;
+ list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush);
+ ASSERT_EQ(0, to_flush.size());
+
+ // Request a flush even though there is nothing to flush
+ list.FlushRequested();
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Attempt to 'flush' to clear request for flush
+ list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush);
+ ASSERT_EQ(0, to_flush.size());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Request a flush again
+ list.FlushRequested();
+ // No flush pending since the list is empty.
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Add 2 tables
+ list.Add(tables[0], &to_delete);
+ list.Add(tables[1], &to_delete);
+ ASSERT_EQ(2, list.NumNotFlushed());
+ ASSERT_EQ(0, to_delete.size());
+
+ // Even though we have less than the minimum to flush, a flush is
+ // pending since we had previously requested a flush and never called
+ // PickMemtablesToFlush() to clear the flush.
+ ASSERT_TRUE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Pick tables to flush
+ list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush);
+ ASSERT_EQ(2, to_flush.size());
+ ASSERT_EQ(2, list.NumNotFlushed());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Revert flush
+ list.RollbackMemtableFlush(to_flush, 0);
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+ to_flush.clear();
+
+ // Add another table
+ list.Add(tables[2], &to_delete);
+ // We now have the minimum to flush regardless of whether FlushRequested()
+ // was called.
+ ASSERT_TRUE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+ ASSERT_EQ(0, to_delete.size());
+
+ // Pick tables to flush
+ list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush);
+ ASSERT_EQ(3, to_flush.size());
+ ASSERT_EQ(3, list.NumNotFlushed());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Pick tables to flush again
+ autovector<MemTable*> to_flush2;
+ list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush2);
+ ASSERT_EQ(0, to_flush2.size());
+ ASSERT_EQ(3, list.NumNotFlushed());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Add another table
+ list.Add(tables[3], &to_delete);
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+ ASSERT_EQ(0, to_delete.size());
+
+ // Request a flush again
+ list.FlushRequested();
+ ASSERT_TRUE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Pick tables to flush again
+ list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush2);
+ ASSERT_EQ(1, to_flush2.size());
+ ASSERT_EQ(4, list.NumNotFlushed());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Rollback first pick of tables
+ list.RollbackMemtableFlush(to_flush, 0);
+ ASSERT_TRUE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+ to_flush.clear();
+
+ // Add another table
+ list.Add(tables[4], &to_delete);
+ ASSERT_EQ(5, list.NumNotFlushed());
+ // We now have the minimum to flush regardless of whether FlushRequested()
+ // was called.
+ ASSERT_TRUE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+ ASSERT_EQ(0, to_delete.size());
+
+ // Pick tables to flush
+ list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush);
+ // Should pick 4 of 5 since 1 table has been picked in to_flush2
+ ASSERT_EQ(4, to_flush.size());
+ ASSERT_EQ(5, list.NumNotFlushed());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Pick tables to flush again
+ autovector<MemTable*> to_flush3;
+ list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush3);
+ ASSERT_EQ(0, to_flush3.size()); // nothing left that is not already being flushed
+ ASSERT_EQ(5, list.NumNotFlushed());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Flush the 4 memtables that were picked in to_flush
+ s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush,
+ &to_delete);
+ ASSERT_OK(s);
+
+ // Note: now to_flush contains tables[0,1,2,4]. to_flush2 contains
+ // tables[3].
+ // Current implementation will only commit memtables in the order they were
+ // created. So TryInstallMemtableFlushResults will install the first 3 tables
+ // in to_flush and stop when it encounters a table not yet flushed.
+ ASSERT_EQ(2, list.NumNotFlushed());
+ int num_in_history =
+ std::min(3, static_cast<int>(max_write_buffer_size_to_maintain) /
+ static_cast<int>(options.write_buffer_size));
+ ASSERT_EQ(num_in_history, list.NumFlushed());
+ ASSERT_EQ(5 - list.NumNotFlushed() - num_in_history, to_delete.size());
+
+ // Request a flush again. Should be nothing to flush
+ list.FlushRequested();
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Flush the 1 memtable that was picked in to_flush2
+ s = MemTableListTest::Mock_InstallMemtableFlushResults(
+ &list, mutable_cf_options, to_flush2, &to_delete);
+ ASSERT_OK(s);
+
+ // This will actually install 2 tables. The 1 we told it to flush, and also
+ // tables[4] which has been waiting for tables[3] to commit.
+ ASSERT_EQ(0, list.NumNotFlushed());
+ num_in_history =
+ std::min(5, static_cast<int>(max_write_buffer_size_to_maintain) /
+ static_cast<int>(options.write_buffer_size));
+ ASSERT_EQ(num_in_history, list.NumFlushed());
+ ASSERT_EQ(5 - list.NumNotFlushed() - num_in_history, to_delete.size());
+
+ for (const auto& m : to_delete) {
+ // Refcount should be 0 after calling TryInstallMemtableFlushResults.
+ // Verify this, by Ref'ing then UnRef'ing:
+ m->Ref();
+ ASSERT_EQ(m, m->Unref());
+ delete m;
+ }
+ to_delete.clear();
+
+ // Add another table
+ list.Add(tables[5], &to_delete);
+ ASSERT_EQ(1, list.NumNotFlushed());
+ ASSERT_EQ(5, list.GetLatestMemTableID());
+ memtable_id = 4;
+ // Pick tables to flush. The tables to pick must have ID smaller than or
+ // equal to 4. Therefore, no table will be selected in this case.
+ autovector<MemTable*> to_flush4;
+ list.FlushRequested();
+ ASSERT_TRUE(list.HasFlushRequested());
+ list.PickMemtablesToFlush(&memtable_id, &to_flush4);
+ ASSERT_TRUE(to_flush4.empty());
+ ASSERT_EQ(1, list.NumNotFlushed());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.HasFlushRequested());
+
+ // Pick tables to flush. The tables to pick must have ID smaller than or
+ // equal to 5. Therefore, only tables[5] will be selected.
+ memtable_id = 5;
+ list.FlushRequested();
+ list.PickMemtablesToFlush(&memtable_id, &to_flush4);
+ ASSERT_EQ(1, static_cast<int>(to_flush4.size()));
+ ASSERT_EQ(1, list.NumNotFlushed());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+ ASSERT_FALSE(list.IsFlushPending());
+ to_delete.clear();
+
+ list.current()->Unref(&to_delete);
+ int to_delete_size =
+ std::min(num_tables, static_cast<int>(max_write_buffer_size_to_maintain) /
+ static_cast<int>(options.write_buffer_size));
+ ASSERT_EQ(to_delete_size, to_delete.size());
+
+ for (const auto& m : to_delete) {
+ // Refcount should be 0 after calling TryInstallMemtableFlushResults.
+ // Verify this, by Ref'ing then UnRef'ing:
+ m->Ref();
+ ASSERT_EQ(m, m->Unref());
+ delete m;
+ }
+ to_delete.clear();
+}
+
+TEST_F(MemTableListTest, EmptyAtomicFlusTest) {
+ autovector<MemTableList*> lists;
+ autovector<uint32_t> cf_ids;
+ autovector<const MutableCFOptions*> options_list;
+ autovector<const autovector<MemTable*>*> to_flush;
+ autovector<MemTable*> to_delete;
+ Status s = Mock_InstallMemtableAtomicFlushResults(lists, cf_ids, options_list,
+ to_flush, &to_delete);
+ ASSERT_OK(s);
+ ASSERT_TRUE(to_delete.empty());
+}
+
+TEST_F(MemTableListTest, AtomicFlusTest) {
+ const int num_cfs = 3;
+ const int num_tables_per_cf = 2;
+ SequenceNumber seq = 1;
+
+ auto factory = std::make_shared<SkipListFactory>();
+ options.memtable_factory = factory;
+ ImmutableCFOptions ioptions(options);
+ InternalKeyComparator cmp(BytewiseComparator());
+ WriteBufferManager wb(options.db_write_buffer_size);
+
+ // Create MemTableLists
+ int min_write_buffer_number_to_merge = 3;
+ int max_write_buffer_number_to_maintain = 7;
+ int64_t max_write_buffer_size_to_maintain =
+ 7 * static_cast<int64_t>(options.write_buffer_size);
+ autovector<MemTableList*> lists;
+ for (int i = 0; i != num_cfs; ++i) {
+ lists.emplace_back(new MemTableList(min_write_buffer_number_to_merge,
+ max_write_buffer_number_to_maintain,
+ max_write_buffer_size_to_maintain));
+ }
+
+ autovector<uint32_t> cf_ids;
+ std::vector<std::vector<MemTable*>> tables(num_cfs);
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ uint32_t cf_id = 0;
+ for (auto& elem : tables) {
+ mutable_cf_options_list.emplace_back(new MutableCFOptions(options));
+ uint64_t memtable_id = 0;
+ for (int i = 0; i != num_tables_per_cf; ++i) {
+ MemTable* mem =
+ new MemTable(cmp, ioptions, *(mutable_cf_options_list.back()), &wb,
+ kMaxSequenceNumber, cf_id);
+ mem->SetID(memtable_id++);
+ mem->Ref();
+
+ std::string value;
+
+ mem->Add(++seq, kTypeValue, "key1", ToString(i));
+ mem->Add(++seq, kTypeValue, "keyN" + ToString(i), "valueN");
+ mem->Add(++seq, kTypeValue, "keyX" + ToString(i), "value");
+ mem->Add(++seq, kTypeValue, "keyM" + ToString(i), "valueM");
+ mem->Add(++seq, kTypeDeletion, "keyX" + ToString(i), "");
+
+ elem.push_back(mem);
+ }
+ cf_ids.push_back(cf_id++);
+ }
+
+ std::vector<autovector<MemTable*>> flush_candidates(num_cfs);
+
+ // Nothing to flush
+ for (auto i = 0; i != num_cfs; ++i) {
+ auto* list = lists[i];
+ ASSERT_FALSE(list->IsFlushPending());
+ ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire));
+ list->PickMemtablesToFlush(nullptr /* memtable_id */, &flush_candidates[i]);
+ ASSERT_EQ(0, flush_candidates[i].size());
+ }
+ // Request flush even though there is nothing to flush
+ for (auto i = 0; i != num_cfs; ++i) {
+ auto* list = lists[i];
+ list->FlushRequested();
+ ASSERT_FALSE(list->IsFlushPending());
+ ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire));
+ }
+ autovector<MemTable*> to_delete;
+ // Add tables to the immutable memtable lists associated with column families
+ for (auto i = 0; i != num_cfs; ++i) {
+ for (auto j = 0; j != num_tables_per_cf; ++j) {
+ lists[i]->Add(tables[i][j], &to_delete);
+ }
+ ASSERT_EQ(num_tables_per_cf, lists[i]->NumNotFlushed());
+ ASSERT_TRUE(lists[i]->IsFlushPending());
+ ASSERT_TRUE(lists[i]->imm_flush_needed.load(std::memory_order_acquire));
+ }
+ std::vector<uint64_t> flush_memtable_ids = {1, 1, 0};
+ // +----+
+ // list[0]: |0 1|
+ // list[1]: |0 1|
+ // | +--+
+ // list[2]: |0| 1
+ // +-+
+ // Pick memtables to flush
+ for (auto i = 0; i != num_cfs; ++i) {
+ flush_candidates[i].clear();
+ lists[i]->PickMemtablesToFlush(&flush_memtable_ids[i],
+ &flush_candidates[i]);
+ ASSERT_EQ(flush_memtable_ids[i] - 0 + 1,
+ static_cast<uint64_t>(flush_candidates[i].size()));
+ }
+ autovector<MemTableList*> tmp_lists;
+ autovector<uint32_t> tmp_cf_ids;
+ autovector<const MutableCFOptions*> tmp_options_list;
+ autovector<const autovector<MemTable*>*> to_flush;
+ for (auto i = 0; i != num_cfs; ++i) {
+ if (!flush_candidates[i].empty()) {
+ to_flush.push_back(&flush_candidates[i]);
+ tmp_lists.push_back(lists[i]);
+ tmp_cf_ids.push_back(i);
+ tmp_options_list.push_back(mutable_cf_options_list[i]);
+ }
+ }
+ Status s = Mock_InstallMemtableAtomicFlushResults(
+ tmp_lists, tmp_cf_ids, tmp_options_list, to_flush, &to_delete);
+ ASSERT_OK(s);
+
+ for (auto i = 0; i != num_cfs; ++i) {
+ for (auto j = 0; j != num_tables_per_cf; ++j) {
+ if (static_cast<uint64_t>(j) <= flush_memtable_ids[i]) {
+ ASSERT_LT(0, tables[i][j]->GetFileNumber());
+ }
+ }
+ ASSERT_EQ(
+ static_cast<size_t>(num_tables_per_cf) - flush_candidates[i].size(),
+ lists[i]->NumNotFlushed());
+ }
+
+ to_delete.clear();
+ for (auto list : lists) {
+ list->current()->Unref(&to_delete);
+ delete list;
+ }
+ for (auto& mutable_cf_options : mutable_cf_options_list) {
+ if (mutable_cf_options != nullptr) {
+ delete mutable_cf_options;
+ mutable_cf_options = nullptr;
+ }
+ }
+  // All memtables in the tables array must have been flushed and are thus
+  // ready to be deleted.
+ ASSERT_EQ(to_delete.size(), tables.size() * tables.front().size());
+ for (const auto& m : to_delete) {
+ // Refcount should be 0 after calling InstallMemtableFlushResults.
+ // Verify this by Ref'ing and then Unref'ing.
+ m->Ref();
+ ASSERT_EQ(m, m->Unref());
+ delete m;
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
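
The atomic-flush test above drives MemTableList through a pick-then-install cycle, capping the memtables picked per column family by ID. Below is a small illustrative fragment of that call sequence, reusing the identifiers from the test; it is a sketch, not an additional test case.

    // Illustrative fragment: per column family, pick every immutable memtable
    // whose ID is <= the cap; the picks are then installed atomically together
    // with the other families' picks via Mock_InstallMemtableAtomicFlushResults().
    autovector<MemTable*> picked;
    uint64_t max_memtable_id = 1;  // the cap used for lists[0] and lists[1]
    lists[0]->PickMemtablesToFlush(&max_memtable_id, &picked);
    // With a cap of 0, as for lists[2], only memtable 0 is picked and
    // memtable 1 stays unflushed, matching the diagram in the test.
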
diff --git a/src/rocksdb/db/merge_context.h b/src/rocksdb/db/merge_context.h
new file mode 100644
index 000000000..e1869a341
--- /dev/null
+++ b/src/rocksdb/db/merge_context.h
@@ -0,0 +1,134 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const std::vector<Slice> empty_operand_list;
+
+// The merge context for merging a user key.
+// When doing a Get(), the DB creates such an object and passes it down when
+// issuing the Get() to the memtables and version_set. The operands
+// are fetched from the context when issuing a partial or full merge.
+class MergeContext {
+ public:
+ // Clear all the operands
+ void Clear() {
+ if (operand_list_) {
+ operand_list_->clear();
+ copied_operands_->clear();
+ }
+ }
+
+ // Push a merge operand
+ void PushOperand(const Slice& operand_slice, bool operand_pinned = false) {
+ Initialize();
+ SetDirectionBackward();
+
+ if (operand_pinned) {
+ operand_list_->push_back(operand_slice);
+ } else {
+ // We need to have our own copy of the operand since it's not pinned
+ copied_operands_->emplace_back(
+ new std::string(operand_slice.data(), operand_slice.size()));
+ operand_list_->push_back(*copied_operands_->back());
+ }
+ }
+
+ // Push back a merge operand
+ void PushOperandBack(const Slice& operand_slice,
+ bool operand_pinned = false) {
+ Initialize();
+ SetDirectionForward();
+
+ if (operand_pinned) {
+ operand_list_->push_back(operand_slice);
+ } else {
+ // We need to have our own copy of the operand since it's not pinned
+ copied_operands_->emplace_back(
+ new std::string(operand_slice.data(), operand_slice.size()));
+ operand_list_->push_back(*copied_operands_->back());
+ }
+ }
+
+ // return total number of operands in the list
+ size_t GetNumOperands() const {
+ if (!operand_list_) {
+ return 0;
+ }
+ return operand_list_->size();
+ }
+
+ // Get the operand at the index.
+ Slice GetOperand(int index) {
+ assert(operand_list_);
+
+ SetDirectionForward();
+ return (*operand_list_)[index];
+ }
+
+ // Same as GetOperandsDirectionForward
+ const std::vector<Slice>& GetOperands() {
+ return GetOperandsDirectionForward();
+ }
+
+ // Return all the operands in the order as they were merged (passed to
+ // FullMerge or FullMergeV2)
+ const std::vector<Slice>& GetOperandsDirectionForward() {
+ if (!operand_list_) {
+ return empty_operand_list;
+ }
+
+ SetDirectionForward();
+ return *operand_list_;
+ }
+
+ // Return all the operands in the reversed order relative to how they were
+ // merged (passed to FullMerge or FullMergeV2)
+ const std::vector<Slice>& GetOperandsDirectionBackward() {
+ if (!operand_list_) {
+ return empty_operand_list;
+ }
+
+ SetDirectionBackward();
+ return *operand_list_;
+ }
+
+ private:
+ void Initialize() {
+ if (!operand_list_) {
+ operand_list_.reset(new std::vector<Slice>());
+ copied_operands_.reset(new std::vector<std::unique_ptr<std::string>>());
+ }
+ }
+
+ void SetDirectionForward() {
+ if (operands_reversed_ == true) {
+ std::reverse(operand_list_->begin(), operand_list_->end());
+ operands_reversed_ = false;
+ }
+ }
+
+ void SetDirectionBackward() {
+ if (operands_reversed_ == false) {
+ std::reverse(operand_list_->begin(), operand_list_->end());
+ operands_reversed_ = true;
+ }
+ }
+
+ // List of operands
+ std::unique_ptr<std::vector<Slice>> operand_list_;
+ // Copy of operands that are not pinned.
+ std::unique_ptr<std::vector<std::unique_ptr<std::string>>> copied_operands_;
+ bool operands_reversed_ = true;
+};
+
+} // namespace ROCKSDB_NAMESPACE
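
MergeContext collects operands while a lookup walks entries from newest to oldest and later hands them to the merge operator in chronological order. A minimal usage sketch against the header above; the function name and operand values are hypothetical.

    #include <cassert>

    #include "db/merge_context.h"

    void MergeContextSketch() {
      ROCKSDB_NAMESPACE::MergeContext ctx;
      // Operands arrive newest-first during a backward scan; unpinned slices
      // are copied so they outlive the source buffer.
      ctx.PushOperand(ROCKSDB_NAMESPACE::Slice("newest"));
      ctx.PushOperand(ROCKSDB_NAMESPACE::Slice("oldest"));
      // GetOperands() flips the list to forward (oldest-first) order:
      // {"oldest", "newest"}.
      const std::vector<ROCKSDB_NAMESPACE::Slice>& ops = ctx.GetOperands();
      assert(ops.size() == ctx.GetNumOperands());
      (void)ops;
    }
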
diff --git a/src/rocksdb/db/merge_helper.cc b/src/rocksdb/db/merge_helper.cc
new file mode 100644
index 000000000..96fe79251
--- /dev/null
+++ b/src/rocksdb/db/merge_helper.cc
@@ -0,0 +1,417 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/merge_helper.h"
+
+#include <string>
+
+#include "db/dbformat.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/statistics.h"
+#include "port/likely.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/merge_operator.h"
+#include "table/format.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+MergeHelper::MergeHelper(Env* env, const Comparator* user_comparator,
+ const MergeOperator* user_merge_operator,
+ const CompactionFilter* compaction_filter,
+ Logger* logger, bool assert_valid_internal_key,
+ SequenceNumber latest_snapshot,
+ const SnapshotChecker* snapshot_checker, int level,
+ Statistics* stats,
+ const std::atomic<bool>* shutting_down)
+ : env_(env),
+ user_comparator_(user_comparator),
+ user_merge_operator_(user_merge_operator),
+ compaction_filter_(compaction_filter),
+ shutting_down_(shutting_down),
+ logger_(logger),
+ assert_valid_internal_key_(assert_valid_internal_key),
+ allow_single_operand_(false),
+ latest_snapshot_(latest_snapshot),
+ snapshot_checker_(snapshot_checker),
+ level_(level),
+ keys_(),
+ filter_timer_(env_),
+ total_filter_time_(0U),
+ stats_(stats) {
+ assert(user_comparator_ != nullptr);
+ if (user_merge_operator_) {
+ allow_single_operand_ = user_merge_operator_->AllowSingleOperand();
+ }
+}
+
+Status MergeHelper::TimedFullMerge(const MergeOperator* merge_operator,
+ const Slice& key, const Slice* value,
+ const std::vector<Slice>& operands,
+ std::string* result, Logger* logger,
+ Statistics* statistics, Env* env,
+ Slice* result_operand,
+ bool update_num_ops_stats) {
+ assert(merge_operator != nullptr);
+
+ if (operands.size() == 0) {
+ assert(value != nullptr && result != nullptr);
+ result->assign(value->data(), value->size());
+ return Status::OK();
+ }
+
+ if (update_num_ops_stats) {
+ RecordInHistogram(statistics, READ_NUM_MERGE_OPERANDS,
+ static_cast<uint64_t>(operands.size()));
+ }
+
+ bool success;
+ Slice tmp_result_operand(nullptr, 0);
+ const MergeOperator::MergeOperationInput merge_in(key, value, operands,
+ logger);
+ MergeOperator::MergeOperationOutput merge_out(*result, tmp_result_operand);
+ {
+ // Setup to time the merge
+ StopWatchNano timer(env, statistics != nullptr);
+ PERF_TIMER_GUARD(merge_operator_time_nanos);
+
+ // Do the merge
+ success = merge_operator->FullMergeV2(merge_in, &merge_out);
+
+ if (tmp_result_operand.data()) {
+ // FullMergeV2 result is an existing operand
+ if (result_operand != nullptr) {
+ *result_operand = tmp_result_operand;
+ } else {
+ result->assign(tmp_result_operand.data(), tmp_result_operand.size());
+ }
+ } else if (result_operand) {
+ *result_operand = Slice(nullptr, 0);
+ }
+
+ RecordTick(statistics, MERGE_OPERATION_TOTAL_TIME,
+ statistics ? timer.ElapsedNanos() : 0);
+ }
+
+ if (!success) {
+ RecordTick(statistics, NUMBER_MERGE_FAILURES);
+ return Status::Corruption("Error: Could not perform merge.");
+ }
+
+ return Status::OK();
+}
+
+// PRE: iter points to the first merge type entry
+// POST: iter points to the first entry beyond the merge process (or the end)
+// keys_, operands_ are updated to reflect the merge result.
+// keys_ stores the list of keys encountered while merging.
+// operands_ stores the list of merge operands encountered while merging.
+// keys_[i] corresponds to operands_[i] for each i.
+//
+// TODO: Avoid the snapshot stripe map lookup in CompactionRangeDelAggregator
+// and just pass the StripeRep corresponding to the stripe being merged.
+Status MergeHelper::MergeUntil(InternalIterator* iter,
+ CompactionRangeDelAggregator* range_del_agg,
+ const SequenceNumber stop_before,
+ const bool at_bottom) {
+ // Get a copy of the internal key, before it's invalidated by iter->Next()
+ // Also maintain the list of merge operands seen.
+ assert(HasOperator());
+ keys_.clear();
+ merge_context_.Clear();
+ has_compaction_filter_skip_until_ = false;
+ assert(user_merge_operator_);
+ bool first_key = true;
+
+ // We need to parse the internal key again as the parsed key is
+ // backed by the internal key!
+ // Assume no internal key corruption as it has been successfully parsed
+ // by the caller.
+ // original_key_is_iter variable is just caching the information:
+ // original_key_is_iter == (iter->key().ToString() == original_key)
+ bool original_key_is_iter = true;
+ std::string original_key = iter->key().ToString();
+ // Important:
+ // orig_ikey is backed by original_key if keys_.empty()
+ // orig_ikey is backed by keys_.back() if !keys_.empty()
+ ParsedInternalKey orig_ikey;
+ bool succ = ParseInternalKey(original_key, &orig_ikey);
+ assert(succ);
+ if (!succ) {
+ return Status::Corruption("Cannot parse key in MergeUntil");
+ }
+
+ Status s;
+ bool hit_the_next_user_key = false;
+ for (; iter->Valid(); iter->Next(), original_key_is_iter = false) {
+ if (IsShuttingDown()) {
+ return Status::ShutdownInProgress();
+ }
+
+ ParsedInternalKey ikey;
+ assert(keys_.size() == merge_context_.GetNumOperands());
+
+ if (!ParseInternalKey(iter->key(), &ikey)) {
+ // stop at corrupted key
+ if (assert_valid_internal_key_) {
+ assert(!"Corrupted internal key not expected.");
+ return Status::Corruption("Corrupted internal key not expected.");
+ }
+ break;
+ } else if (first_key) {
+ assert(user_comparator_->Equal(ikey.user_key, orig_ikey.user_key));
+ first_key = false;
+ } else if (!user_comparator_->Equal(ikey.user_key, orig_ikey.user_key)) {
+ // hit a different user key, stop right here
+ hit_the_next_user_key = true;
+ break;
+ } else if (stop_before > 0 && ikey.sequence <= stop_before &&
+ LIKELY(snapshot_checker_ == nullptr ||
+ snapshot_checker_->CheckInSnapshot(ikey.sequence,
+ stop_before) !=
+ SnapshotCheckerResult::kNotInSnapshot)) {
+      // hit an entry that's possibly visible to the previous snapshot; we
+      // can't touch it
+ break;
+ }
+
+ // At this point we are guaranteed that we need to process this key.
+
+ assert(IsValueType(ikey.type));
+ if (ikey.type != kTypeMerge) {
+
+ // hit a put/delete/single delete
+ // => merge the put value or a nullptr with operands_
+ // => store result in operands_.back() (and update keys_.back())
+ // => change the entry type to kTypeValue for keys_.back()
+ // We are done! Success!
+
+ // If there are no operands, just return the Status::OK(). That will cause
+ // the compaction iterator to write out the key we're currently at, which
+ // is the put/delete we just encountered.
+ if (keys_.empty()) {
+ return Status::OK();
+ }
+
+ // TODO(noetzli) If the merge operator returns false, we are currently
+ // (almost) silently dropping the put/delete. That's probably not what we
+ // want. Also if we're in compaction and it's a put, it would be nice to
+ // run compaction filter on it.
+ const Slice val = iter->value();
+ const Slice* val_ptr;
+ if (kTypeValue == ikey.type &&
+ (range_del_agg == nullptr ||
+ !range_del_agg->ShouldDelete(
+ ikey, RangeDelPositioningMode::kForwardTraversal))) {
+ val_ptr = &val;
+ } else {
+ val_ptr = nullptr;
+ }
+ std::string merge_result;
+ s = TimedFullMerge(user_merge_operator_, ikey.user_key, val_ptr,
+ merge_context_.GetOperands(), &merge_result, logger_,
+ stats_, env_);
+
+ // We store the result in keys_.back() and operands_.back()
+ // if nothing went wrong (i.e.: no operand corruption on disk)
+ if (s.ok()) {
+ // The original key encountered
+ original_key = std::move(keys_.back());
+ orig_ikey.type = kTypeValue;
+ UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type);
+ keys_.clear();
+ merge_context_.Clear();
+ keys_.emplace_front(std::move(original_key));
+ merge_context_.PushOperand(merge_result);
+ }
+
+ // move iter to the next entry
+ iter->Next();
+ return s;
+ } else {
+ // hit a merge
+ // => if there is a compaction filter, apply it.
+ // => check for range tombstones covering the operand
+ // => merge the operand into the front of the operands_ list
+ // if not filtered
+ // => then continue because we haven't yet seen a Put/Delete.
+ //
+ // Keep queuing keys and operands until we either meet a put / delete
+      // request or later do a partial merge.
+
+ Slice value_slice = iter->value();
+ // add an operand to the list if:
+ // 1) it's included in one of the snapshots. in that case we *must* write
+ // it out, no matter what compaction filter says
+ // 2) it's not filtered by a compaction filter
+ CompactionFilter::Decision filter =
+ ikey.sequence <= latest_snapshot_
+ ? CompactionFilter::Decision::kKeep
+ : FilterMerge(orig_ikey.user_key, value_slice);
+ if (filter != CompactionFilter::Decision::kRemoveAndSkipUntil &&
+ range_del_agg != nullptr &&
+ range_del_agg->ShouldDelete(
+ iter->key(), RangeDelPositioningMode::kForwardTraversal)) {
+ filter = CompactionFilter::Decision::kRemove;
+ }
+ if (filter == CompactionFilter::Decision::kKeep ||
+ filter == CompactionFilter::Decision::kChangeValue) {
+ if (original_key_is_iter) {
+ // this is just an optimization that saves us one memcpy
+ keys_.push_front(std::move(original_key));
+ } else {
+ keys_.push_front(iter->key().ToString());
+ }
+ if (keys_.size() == 1) {
+ // we need to re-anchor the orig_ikey because it was anchored by
+ // original_key before
+ ParseInternalKey(keys_.back(), &orig_ikey);
+ }
+ if (filter == CompactionFilter::Decision::kKeep) {
+ merge_context_.PushOperand(
+ value_slice, iter->IsValuePinned() /* operand_pinned */);
+ } else { // kChangeValue
+ // Compaction filter asked us to change the operand from value_slice
+ // to compaction_filter_value_.
+ merge_context_.PushOperand(compaction_filter_value_, false);
+ }
+ } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) {
+ // Compaction filter asked us to remove this key altogether
+ // (not just this operand), along with some keys following it.
+ keys_.clear();
+ merge_context_.Clear();
+ has_compaction_filter_skip_until_ = true;
+ return Status::OK();
+ }
+ }
+ }
+
+ if (merge_context_.GetNumOperands() == 0) {
+ // we filtered out all the merge operands
+ return Status::OK();
+ }
+
+ // We are sure we have seen this key's entire history if:
+ // at_bottom == true (this does not necessarily mean it is the bottommost
+ // layer, but rather that we are confident the key does not appear on any of
+  // the lower layers; at_bottom == false doesn't mean it does appear, just
+  // that we can't be sure; see Compaction::IsBottommostLevel for details)
+ // AND
+ // we have either encountered another key or end of key history on this
+ // layer.
+ //
+ // When these conditions are true we are able to merge all the keys
+ // using full merge.
+ //
+  // For the cases we are not sure about, we simply miss the opportunity
+ // to combine the keys. Since VersionSet::SetupOtherInputs() always makes
+ // sure that all merge-operands on the same level get compacted together,
+ // this will simply lead to these merge operands moving to the next level.
+ bool surely_seen_the_beginning =
+ (hit_the_next_user_key || !iter->Valid()) && at_bottom;
+ if (surely_seen_the_beginning) {
+ // do a final merge with nullptr as the existing value and say
+ // bye to the merge type (it's now converted to a Put)
+ assert(kTypeMerge == orig_ikey.type);
+ assert(merge_context_.GetNumOperands() >= 1);
+ assert(merge_context_.GetNumOperands() == keys_.size());
+ std::string merge_result;
+ s = TimedFullMerge(user_merge_operator_, orig_ikey.user_key, nullptr,
+ merge_context_.GetOperands(), &merge_result, logger_,
+ stats_, env_);
+ if (s.ok()) {
+ // The original key encountered
+ // We are certain that keys_ is not empty here (see assertions couple of
+ // lines before).
+ original_key = std::move(keys_.back());
+ orig_ikey.type = kTypeValue;
+ UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type);
+ keys_.clear();
+ merge_context_.Clear();
+ keys_.emplace_front(std::move(original_key));
+ merge_context_.PushOperand(merge_result);
+ }
+ } else {
+ // We haven't seen the beginning of the key nor a Put/Delete.
+ // Attempt to use the user's associative merge function to
+ // merge the stacked merge operands into a single operand.
+ s = Status::MergeInProgress();
+ if (merge_context_.GetNumOperands() >= 2 ||
+ (allow_single_operand_ && merge_context_.GetNumOperands() == 1)) {
+ bool merge_success = false;
+ std::string merge_result;
+ {
+ StopWatchNano timer(env_, stats_ != nullptr);
+ PERF_TIMER_GUARD(merge_operator_time_nanos);
+ merge_success = user_merge_operator_->PartialMergeMulti(
+ orig_ikey.user_key,
+ std::deque<Slice>(merge_context_.GetOperands().begin(),
+ merge_context_.GetOperands().end()),
+ &merge_result, logger_);
+ RecordTick(stats_, MERGE_OPERATION_TOTAL_TIME,
+ stats_ ? timer.ElapsedNanosSafe() : 0);
+ }
+ if (merge_success) {
+ // Merging of operands (associative merge) was successful.
+ // Replace operands with the merge result
+ merge_context_.Clear();
+ merge_context_.PushOperand(merge_result);
+ keys_.erase(keys_.begin(), keys_.end() - 1);
+ }
+ }
+ }
+
+ return s;
+}
+
+MergeOutputIterator::MergeOutputIterator(const MergeHelper* merge_helper)
+ : merge_helper_(merge_helper) {
+ it_keys_ = merge_helper_->keys().rend();
+ it_values_ = merge_helper_->values().rend();
+}
+
+void MergeOutputIterator::SeekToFirst() {
+ const auto& keys = merge_helper_->keys();
+ const auto& values = merge_helper_->values();
+ assert(keys.size() == values.size());
+ it_keys_ = keys.rbegin();
+ it_values_ = values.rbegin();
+}
+
+void MergeOutputIterator::Next() {
+ ++it_keys_;
+ ++it_values_;
+}
+
+CompactionFilter::Decision MergeHelper::FilterMerge(const Slice& user_key,
+ const Slice& value_slice) {
+ if (compaction_filter_ == nullptr) {
+ return CompactionFilter::Decision::kKeep;
+ }
+ if (stats_ != nullptr && ShouldReportDetailedTime(env_, stats_)) {
+ filter_timer_.Start();
+ }
+ compaction_filter_value_.clear();
+ compaction_filter_skip_until_.Clear();
+ auto ret = compaction_filter_->FilterV2(
+ level_, user_key, CompactionFilter::ValueType::kMergeOperand, value_slice,
+ &compaction_filter_value_, compaction_filter_skip_until_.rep());
+ if (ret == CompactionFilter::Decision::kRemoveAndSkipUntil) {
+ if (user_comparator_->Compare(*compaction_filter_skip_until_.rep(),
+ user_key) <= 0) {
+ // Invalid skip_until returned from compaction filter.
+ // Keep the key as per FilterV2 documentation.
+ ret = CompactionFilter::Decision::kKeep;
+ } else {
+ compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber,
+ kValueTypeForSeek);
+ }
+ }
+ total_filter_time_ += filter_timer_.ElapsedNanosSafe();
+ return ret;
+}
+
+} // namespace ROCKSDB_NAMESPACE
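
TimedFullMerge() above is the single entry point both point lookups and compaction use to combine an optional base value with the accumulated operands. A hedged sketch of a call site follows; the wrapper function and its parameters are hypothetical, only the TimedFullMerge() signature comes from the code above.

    #include <string>

    #include "db/merge_helper.h"

    // Hypothetical helper: resolve the final value for `user_key` from an
    // optional base value plus the operands gathered in a MergeContext.
    ROCKSDB_NAMESPACE::Status ResolveValueSketch(
        const ROCKSDB_NAMESPACE::MergeOperator* merge_operator,
        const ROCKSDB_NAMESPACE::Slice& user_key,
        const ROCKSDB_NAMESPACE::Slice* base_value,  // nullptr if no Put seen
        ROCKSDB_NAMESPACE::MergeContext& context,
        ROCKSDB_NAMESPACE::Logger* logger,
        ROCKSDB_NAMESPACE::Statistics* stats, ROCKSDB_NAMESPACE::Env* env,
        std::string* merged) {
      // Returns Corruption if the operator reports a failed merge, OK otherwise.
      return ROCKSDB_NAMESPACE::MergeHelper::TimedFullMerge(
          merge_operator, user_key, base_value, context.GetOperands(), merged,
          logger, stats, env);
    }
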
diff --git a/src/rocksdb/db/merge_helper.h b/src/rocksdb/db/merge_helper.h
new file mode 100644
index 000000000..c0534f08b
--- /dev/null
+++ b/src/rocksdb/db/merge_helper.h
@@ -0,0 +1,194 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <deque>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/merge_context.h"
+#include "db/range_del_aggregator.h"
+#include "db/snapshot_checker.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Comparator;
+class Iterator;
+class Logger;
+class MergeOperator;
+class Statistics;
+
+class MergeHelper {
+ public:
+ MergeHelper(Env* env, const Comparator* user_comparator,
+ const MergeOperator* user_merge_operator,
+ const CompactionFilter* compaction_filter, Logger* logger,
+ bool assert_valid_internal_key, SequenceNumber latest_snapshot,
+ const SnapshotChecker* snapshot_checker = nullptr, int level = 0,
+ Statistics* stats = nullptr,
+ const std::atomic<bool>* shutting_down = nullptr);
+
+ // Wrapper around MergeOperator::FullMergeV2() that records perf statistics.
+ // Result of merge will be written to result if status returned is OK.
+ // If operands is empty, the value will simply be copied to result.
+  // Set `update_num_ops_stats` to true if the merge is on behalf of a user
+  // read, so that the number of merge operands is recorded in the read
+  // statistics.
+ // Returns one of the following statuses:
+ // - OK: Entries were successfully merged.
+ // - Corruption: Merge operator reported unsuccessful merge.
+ static Status TimedFullMerge(const MergeOperator* merge_operator,
+ const Slice& key, const Slice* value,
+ const std::vector<Slice>& operands,
+ std::string* result, Logger* logger,
+ Statistics* statistics, Env* env,
+ Slice* result_operand = nullptr,
+ bool update_num_ops_stats = false);
+
+ // Merge entries until we hit
+ // - a corrupted key
+ // - a Put/Delete,
+ // - a different user key,
+ // - a specific sequence number (snapshot boundary),
+ // - REMOVE_AND_SKIP_UNTIL returned from compaction filter,
+ // or - the end of iteration
+ // iter: (IN) points to the first merge type entry
+ // (OUT) points to the first entry not included in the merge process
+ // range_del_agg: (IN) filters merge operands covered by range tombstones.
+ // stop_before: (IN) a sequence number that merge should not cross.
+ // 0 means no restriction
+  // at_bottom: (IN) true if the iterator covers the bottom level, which means
+ // we could reach the start of the history of this user key.
+ //
+ // Returns one of the following statuses:
+ // - OK: Entries were successfully merged.
+ // - MergeInProgress: Put/Delete not encountered, and didn't reach the start
+ // of key's history. Output consists of merge operands only.
+ // - Corruption: Merge operator reported unsuccessful merge or a corrupted
+ // key has been encountered and not expected (applies only when compiling
+ // with asserts removed).
+ // - ShutdownInProgress: interrupted by shutdown (*shutting_down == true).
+ //
+ // REQUIRED: The first key in the input is not corrupted.
+ Status MergeUntil(InternalIterator* iter,
+ CompactionRangeDelAggregator* range_del_agg = nullptr,
+ const SequenceNumber stop_before = 0,
+ const bool at_bottom = false);
+
+ // Filters a merge operand using the compaction filter specified
+ // in the constructor. Returns the decision that the filter made.
+ // Uses compaction_filter_value_ and compaction_filter_skip_until_ for the
+ // optional outputs of compaction filter.
+ CompactionFilter::Decision FilterMerge(const Slice& user_key,
+ const Slice& value_slice);
+
+ // Query the merge result
+ // These are valid until the next MergeUntil call
+ // If the merging was successful:
+ // - keys() contains a single element with the latest sequence number of
+ // the merges. The type will be Put or Merge. See IMPORTANT 1 note, below.
+ // - values() contains a single element with the result of merging all the
+ // operands together
+ //
+ // IMPORTANT 1: the key type could change after the MergeUntil call.
+ // Put/Delete + Merge + ... + Merge => Put
+ // Merge + ... + Merge => Merge
+ //
+ // If the merge operator is not associative, and if a Put/Delete is not found
+ // then the merging will be unsuccessful. In this case:
+ // - keys() contains the list of internal keys seen in order of iteration.
+ // - values() contains the list of values (merges) seen in the same order.
+ // values() is parallel to keys() so that the first entry in
+ // keys() is the key associated with the first entry in values()
+ // and so on. These lists will be the same length.
+ // All of these pairs will be merges over the same user key.
+ // See IMPORTANT 2 note below.
+ //
+ // IMPORTANT 2: The entries were traversed in order from BACK to FRONT.
+ // So keys().back() was the first key seen by iterator.
+ // TODO: Re-style this comment to be like the first one
+ const std::deque<std::string>& keys() const { return keys_; }
+ const std::vector<Slice>& values() const {
+ return merge_context_.GetOperands();
+ }
+ uint64_t TotalFilterTime() const { return total_filter_time_; }
+ bool HasOperator() const { return user_merge_operator_ != nullptr; }
+
+ // If compaction filter returned REMOVE_AND_SKIP_UNTIL, this method will
+ // return true and fill *until with the key to which we should skip.
+ // If true, keys() and values() are empty.
+ bool FilteredUntil(Slice* skip_until) const {
+ if (!has_compaction_filter_skip_until_) {
+ return false;
+ }
+ assert(compaction_filter_ != nullptr);
+ assert(skip_until != nullptr);
+ assert(compaction_filter_skip_until_.Valid());
+ *skip_until = compaction_filter_skip_until_.Encode();
+ return true;
+ }
+
+ private:
+ Env* env_;
+ const Comparator* user_comparator_;
+ const MergeOperator* user_merge_operator_;
+ const CompactionFilter* compaction_filter_;
+ const std::atomic<bool>* shutting_down_;
+ Logger* logger_;
+ bool assert_valid_internal_key_; // enforce no internal key corruption?
+ bool allow_single_operand_;
+ SequenceNumber latest_snapshot_;
+ const SnapshotChecker* const snapshot_checker_;
+ int level_;
+
+ // the scratch area that holds the result of MergeUntil
+ // valid up to the next MergeUntil call
+
+ // Keeps track of the sequence of keys seen
+ std::deque<std::string> keys_;
+ // Parallel with keys_; stores the operands
+ mutable MergeContext merge_context_;
+
+ StopWatchNano filter_timer_;
+ uint64_t total_filter_time_;
+ Statistics* stats_;
+
+ bool has_compaction_filter_skip_until_ = false;
+ std::string compaction_filter_value_;
+ InternalKey compaction_filter_skip_until_;
+
+ bool IsShuttingDown() {
+ // This is a best-effort facility, so memory_order_relaxed is sufficient.
+ return shutting_down_ && shutting_down_->load(std::memory_order_relaxed);
+ }
+};
+
+// MergeOutputIterator can be used to iterate over the result of a merge.
+class MergeOutputIterator {
+ public:
+ // The MergeOutputIterator is bound to a MergeHelper instance.
+ explicit MergeOutputIterator(const MergeHelper* merge_helper);
+
+ // Seeks to the first record in the output.
+ void SeekToFirst();
+ // Advances to the next record in the output.
+ void Next();
+
+ Slice key() { return Slice(*it_keys_); }
+ Slice value() { return Slice(*it_values_); }
+ bool Valid() { return it_keys_ != merge_helper_->keys().rend(); }
+
+ private:
+ const MergeHelper* merge_helper_;
+ std::deque<std::string>::const_reverse_iterator it_keys_;
+ std::vector<Slice>::const_reverse_iterator it_values_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
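
MergeOutputIterator replays the keys()/values() produced by the most recent MergeUntil() in the order the input iterator originally saw the entries (newest first), since the deque is filled back-to-front as the IMPORTANT 2 note above describes. A sketch of the consumption pattern, mirroring the tests that follow; `helper` is a hypothetical MergeHelper on which MergeUntil() has already returned.

    // Iterate over the merge output; keys are internal keys, values are the
    // corresponding merge results or remaining operands.
    ROCKSDB_NAMESPACE::MergeOutputIterator out(&helper);
    for (out.SeekToFirst(); out.Valid(); out.Next()) {
      ROCKSDB_NAMESPACE::Slice key = out.key();
      ROCKSDB_NAMESPACE::Slice value = out.value();
      (void)key;
      (void)value;
    }
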
diff --git a/src/rocksdb/db/merge_helper_test.cc b/src/rocksdb/db/merge_helper_test.cc
new file mode 100644
index 000000000..117916c8f
--- /dev/null
+++ b/src/rocksdb/db/merge_helper_test.cc
@@ -0,0 +1,290 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "db/merge_helper.h"
+#include "rocksdb/comparator.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MergeHelperTest : public testing::Test {
+ public:
+ MergeHelperTest() { env_ = Env::Default(); }
+
+ ~MergeHelperTest() override = default;
+
+ Status Run(SequenceNumber stop_before, bool at_bottom,
+ SequenceNumber latest_snapshot = 0) {
+ iter_.reset(new test::VectorIterator(ks_, vs_));
+ iter_->SeekToFirst();
+ merge_helper_.reset(new MergeHelper(env_, BytewiseComparator(),
+ merge_op_.get(), filter_.get(), nullptr,
+ false, latest_snapshot));
+ return merge_helper_->MergeUntil(iter_.get(), nullptr /* range_del_agg */,
+ stop_before, at_bottom);
+ }
+
+ void AddKeyVal(const std::string& user_key, const SequenceNumber& seq,
+ const ValueType& t, const std::string& val,
+ bool corrupt = false) {
+ InternalKey ikey(user_key, seq, t);
+ if (corrupt) {
+ test::CorruptKeyType(&ikey);
+ }
+ ks_.push_back(ikey.Encode().ToString());
+ vs_.push_back(val);
+ }
+
+ Env* env_;
+ std::unique_ptr<test::VectorIterator> iter_;
+ std::shared_ptr<MergeOperator> merge_op_;
+ std::unique_ptr<MergeHelper> merge_helper_;
+ std::vector<std::string> ks_;
+ std::vector<std::string> vs_;
+ std::unique_ptr<test::FilterNumber> filter_;
+};
+
+// If MergeHelper encounters a new key on the last level, we know that
+// the key has no more history and it can merge keys.
+TEST_F(MergeHelperTest, MergeAtBottomSuccess) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+ AddKeyVal("a", 20, kTypeMerge, test::EncodeInt(1U));
+ AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("b", 10, kTypeMerge, test::EncodeInt(4U)); // <- iter_ after merge
+
+ ASSERT_TRUE(Run(0, true).ok());
+ ASSERT_EQ(ks_[2], iter_->key());
+ ASSERT_EQ(test::KeyStr("a", 20, kTypeValue), merge_helper_->keys()[0]);
+ ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]);
+ ASSERT_EQ(1U, merge_helper_->keys().size());
+ ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// Merging with a value results in a successful merge.
+TEST_F(MergeHelperTest, MergeValue) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+ AddKeyVal("a", 40, kTypeMerge, test::EncodeInt(1U));
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("a", 20, kTypeValue, test::EncodeInt(4U)); // <- iter_ after merge
+ AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(1U));
+
+ ASSERT_TRUE(Run(0, false).ok());
+ ASSERT_EQ(ks_[3], iter_->key());
+ ASSERT_EQ(test::KeyStr("a", 40, kTypeValue), merge_helper_->keys()[0]);
+ ASSERT_EQ(test::EncodeInt(8U), merge_helper_->values()[0]);
+ ASSERT_EQ(1U, merge_helper_->keys().size());
+ ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// Merging stops before a snapshot.
+TEST_F(MergeHelperTest, SnapshotBeforeValue) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+ AddKeyVal("a", 50, kTypeMerge, test::EncodeInt(1U));
+ AddKeyVal("a", 40, kTypeMerge, test::EncodeInt(3U)); // <- iter_ after merge
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(1U));
+ AddKeyVal("a", 20, kTypeValue, test::EncodeInt(4U));
+ AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(1U));
+
+ ASSERT_TRUE(Run(31, true).IsMergeInProgress());
+ ASSERT_EQ(ks_[2], iter_->key());
+ ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), merge_helper_->keys()[0]);
+ ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]);
+ ASSERT_EQ(1U, merge_helper_->keys().size());
+ ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// MergeHelper preserves the operand stack for merge operators that
+// cannot do a partial merge.
+TEST_F(MergeHelperTest, NoPartialMerge) {
+ merge_op_ = MergeOperators::CreateStringAppendTESTOperator();
+
+ AddKeyVal("a", 50, kTypeMerge, "v2");
+ AddKeyVal("a", 40, kTypeMerge, "v"); // <- iter_ after merge
+ AddKeyVal("a", 30, kTypeMerge, "v");
+
+ ASSERT_TRUE(Run(31, true).IsMergeInProgress());
+ ASSERT_EQ(ks_[2], iter_->key());
+ ASSERT_EQ(test::KeyStr("a", 40, kTypeMerge), merge_helper_->keys()[0]);
+ ASSERT_EQ("v", merge_helper_->values()[0]);
+ ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), merge_helper_->keys()[1]);
+ ASSERT_EQ("v2", merge_helper_->values()[1]);
+ ASSERT_EQ(2U, merge_helper_->keys().size());
+ ASSERT_EQ(2U, merge_helper_->values().size());
+}
+
+// A single operand cannot be merged.
+TEST_F(MergeHelperTest, SingleOperand) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+ AddKeyVal("a", 50, kTypeMerge, test::EncodeInt(1U));
+
+ ASSERT_TRUE(Run(31, false).IsMergeInProgress());
+ ASSERT_FALSE(iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), merge_helper_->keys()[0]);
+ ASSERT_EQ(test::EncodeInt(1U), merge_helper_->values()[0]);
+ ASSERT_EQ(1U, merge_helper_->keys().size());
+ ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// Merging with a deletion turns the deletion into a value
+TEST_F(MergeHelperTest, MergeDeletion) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("a", 20, kTypeDeletion, "");
+
+ ASSERT_TRUE(Run(15, false).ok());
+ ASSERT_FALSE(iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 30, kTypeValue), merge_helper_->keys()[0]);
+ ASSERT_EQ(test::EncodeInt(3U), merge_helper_->values()[0]);
+ ASSERT_EQ(1U, merge_helper_->keys().size());
+ ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// The merge helper stops upon encountering a corrupt key
+TEST_F(MergeHelperTest, CorruptKey) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(1U));
+ // Corrupt key
+ AddKeyVal("a", 20, kTypeDeletion, "", true); // <- iter_ after merge
+
+ ASSERT_TRUE(Run(15, false).IsMergeInProgress());
+ ASSERT_EQ(ks_[2], iter_->key());
+ ASSERT_EQ(test::KeyStr("a", 30, kTypeMerge), merge_helper_->keys()[0]);
+ ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]);
+ ASSERT_EQ(1U, merge_helper_->keys().size());
+ ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// The compaction filter is called on every merge operand
+TEST_F(MergeHelperTest, FilterMergeOperands) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ filter_.reset(new test::FilterNumber(5U));
+
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(5U)); // Filtered
+ AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(1U));
+ AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U)); // Filtered
+ AddKeyVal("a", 25, kTypeValue, test::EncodeInt(1U));
+
+ ASSERT_TRUE(Run(15, false).ok());
+ ASSERT_FALSE(iter_->Valid());
+ MergeOutputIterator merge_output_iter(merge_helper_.get());
+ merge_output_iter.SeekToFirst();
+ ASSERT_EQ(test::KeyStr("a", 30, kTypeValue),
+ merge_output_iter.key().ToString());
+ ASSERT_EQ(test::EncodeInt(8U), merge_output_iter.value().ToString());
+ merge_output_iter.Next();
+ ASSERT_FALSE(merge_output_iter.Valid());
+}
+
+TEST_F(MergeHelperTest, FilterAllMergeOperands) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ filter_.reset(new test::FilterNumber(5U));
+
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U));
+
+ // filtered out all
+ ASSERT_TRUE(Run(15, false).ok());
+ ASSERT_FALSE(iter_->Valid());
+ MergeOutputIterator merge_output_iter(merge_helper_.get());
+ merge_output_iter.SeekToFirst();
+ ASSERT_FALSE(merge_output_iter.Valid());
+
+ // we have one operand that will survive because it's a delete
+ AddKeyVal("a", 24, kTypeDeletion, test::EncodeInt(5U));
+ AddKeyVal("b", 23, kTypeValue, test::EncodeInt(5U));
+ ASSERT_TRUE(Run(15, true).ok());
+ merge_output_iter = MergeOutputIterator(merge_helper_.get());
+ ASSERT_TRUE(iter_->Valid());
+ merge_output_iter.SeekToFirst();
+ ASSERT_FALSE(merge_output_iter.Valid());
+
+ // when all merge operands are filtered out, we leave the iterator pointing to
+ // the Put/Delete that survived
+ ASSERT_EQ(test::KeyStr("a", 24, kTypeDeletion), iter_->key().ToString());
+ ASSERT_EQ(test::EncodeInt(5U), iter_->value().ToString());
+}
+
+// Make sure that merge operands are filtered at the beginning
+TEST_F(MergeHelperTest, FilterFirstMergeOperand) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ filter_.reset(new test::FilterNumber(5U));
+
+ AddKeyVal("a", 31, kTypeMerge, test::EncodeInt(5U)); // Filtered
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U)); // Filtered
+ AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(2U));
+ AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(1U));
+ AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U)); // Filtered
+ AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U)); // Filtered
+ AddKeyVal("b", 24, kTypeValue, test::EncodeInt(5U)); // next user key
+
+ ASSERT_OK(Run(15, true));
+ ASSERT_TRUE(iter_->Valid());
+ MergeOutputIterator merge_output_iter(merge_helper_.get());
+ merge_output_iter.SeekToFirst();
+  // sequence number is 29 here, because the first two merge operands got
+  // filtered out
+ ASSERT_EQ(test::KeyStr("a", 29, kTypeValue),
+ merge_output_iter.key().ToString());
+ ASSERT_EQ(test::EncodeInt(6U), merge_output_iter.value().ToString());
+ merge_output_iter.Next();
+ ASSERT_FALSE(merge_output_iter.Valid());
+
+ // make sure that we're passing user keys into the filter
+ ASSERT_EQ("a", filter_->last_merge_operand_key());
+}
+
+// Make sure that merge operands are not filtered out if there's a snapshot
+// pointing at them
+TEST_F(MergeHelperTest, DontFilterMergeOperandsBeforeSnapshotTest) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ filter_.reset(new test::FilterNumber(5U));
+
+ AddKeyVal("a", 31, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(2U));
+ AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(1U));
+ AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("b", 24, kTypeValue, test::EncodeInt(5U));
+
+ ASSERT_OK(Run(15, true, 32));
+ ASSERT_TRUE(iter_->Valid());
+ MergeOutputIterator merge_output_iter(merge_helper_.get());
+ merge_output_iter.SeekToFirst();
+ ASSERT_EQ(test::KeyStr("a", 31, kTypeValue),
+ merge_output_iter.key().ToString());
+ ASSERT_EQ(test::EncodeInt(26U), merge_output_iter.value().ToString());
+ merge_output_iter.Next();
+ ASSERT_FALSE(merge_output_iter.Valid());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/merge_operator.cc b/src/rocksdb/db/merge_operator.cc
new file mode 100644
index 000000000..75dea432c
--- /dev/null
+++ b/src/rocksdb/db/merge_operator.cc
@@ -0,0 +1,86 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+/**
+ * Back-end implementation details specific to the Merge Operator.
+ */
+
+#include "rocksdb/merge_operator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool MergeOperator::FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const {
+ // If FullMergeV2 is not implemented, we convert the operand_list to
+ // std::deque<std::string> and pass it to FullMerge
+ std::deque<std::string> operand_list_str;
+ for (auto& op : merge_in.operand_list) {
+ operand_list_str.emplace_back(op.data(), op.size());
+ }
+ return FullMerge(merge_in.key, merge_in.existing_value, operand_list_str,
+ &merge_out->new_value, merge_in.logger);
+}
+
+// The default implementation of PartialMergeMulti, which invokes
+// PartialMerge multiple times internally and merges two operands at
+// a time.
+bool MergeOperator::PartialMergeMulti(const Slice& key,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value,
+ Logger* logger) const {
+ assert(operand_list.size() >= 2);
+ // Simply loop through the operands
+ Slice temp_slice(operand_list[0]);
+
+ for (size_t i = 1; i < operand_list.size(); ++i) {
+ auto& operand = operand_list[i];
+ std::string temp_value;
+ if (!PartialMerge(key, temp_slice, operand, &temp_value, logger)) {
+ return false;
+ }
+ swap(temp_value, *new_value);
+ temp_slice = Slice(*new_value);
+ }
+
+ // The result will be in *new_value. All merges succeeded.
+ return true;
+}
+
+// Given a "real" merge from the library, call the user's
+// associative merge function one-by-one on each of the operands.
+// NOTE: It is assumed that the client's merge-operator will handle any errors.
+bool AssociativeMergeOperator::FullMergeV2(
+ const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const {
+ // Simply loop through the operands
+ Slice temp_existing;
+ const Slice* existing_value = merge_in.existing_value;
+ for (const auto& operand : merge_in.operand_list) {
+ std::string temp_value;
+ if (!Merge(merge_in.key, existing_value, operand, &temp_value,
+ merge_in.logger)) {
+ return false;
+ }
+ swap(temp_value, merge_out->new_value);
+ temp_existing = Slice(merge_out->new_value);
+ existing_value = &temp_existing;
+ }
+
+ // The result will be in *new_value. All merges succeeded.
+ return true;
+}
+
+// Call the user defined simple merge on the operands;
+// NOTE: It is assumed that the client's merge-operator will handle any errors.
+bool AssociativeMergeOperator::PartialMerge(
+ const Slice& key,
+ const Slice& left_operand,
+ const Slice& right_operand,
+ std::string* new_value,
+ Logger* logger) const {
+ return Merge(key, &left_operand, right_operand, new_value, logger);
+}
+
+} // namespace ROCKSDB_NAMESPACE
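
AssociativeMergeOperator above funnels both full and partial merges into a single user-supplied Merge() callback. A minimal hedged sketch of such an operator follows: a fixed64 add in the spirit of the CreateUInt64AddOperator used by the tests below. The class is illustrative, not the library's implementation.

    #include <string>

    #include "rocksdb/merge_operator.h"
    #include "util/coding.h"  // EncodeFixed64 / DecodeFixed64

    namespace ROCKSDB_NAMESPACE {

    // Treats values as little-endian fixed64 counters and adds them.
    class Uint64AddOperatorSketch : public AssociativeMergeOperator {
     public:
      bool Merge(const Slice& /*key*/, const Slice* existing_value,
                 const Slice& value, std::string* new_value,
                 Logger* /*logger*/) const override {
        uint64_t base = 0;
        if (existing_value != nullptr &&
            existing_value->size() == sizeof(uint64_t)) {
          base = DecodeFixed64(existing_value->data());
        }
        uint64_t operand =
            value.size() == sizeof(uint64_t) ? DecodeFixed64(value.data()) : 0;
        char buf[sizeof(uint64_t)];
        EncodeFixed64(buf, base + operand);
        new_value->assign(buf, sizeof(buf));
        return true;  // a failed merge would surface as Status::Corruption
      }
      const char* Name() const override { return "Uint64AddOperatorSketch"; }
    };

    }  // namespace ROCKSDB_NAMESPACE
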
diff --git a/src/rocksdb/db/merge_test.cc b/src/rocksdb/db/merge_test.cc
new file mode 100644
index 000000000..3f85f6464
--- /dev/null
+++ b/src/rocksdb/db/merge_test.cc
@@ -0,0 +1,504 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include <assert.h>
+#include <memory>
+#include <iostream>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/write_batch_internal.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "test_util/testharness.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool use_compression;
+
+class MergeTest : public testing::Test {};
+
+size_t num_merge_operator_calls;
+void resetNumMergeOperatorCalls() { num_merge_operator_calls = 0; }
+
+size_t num_partial_merge_calls;
+void resetNumPartialMergeCalls() { num_partial_merge_calls = 0; }
+
+class CountMergeOperator : public AssociativeMergeOperator {
+ public:
+ CountMergeOperator() {
+ mergeOperator_ = MergeOperators::CreateUInt64AddOperator();
+ }
+
+ bool Merge(const Slice& key, const Slice* existing_value, const Slice& value,
+ std::string* new_value, Logger* logger) const override {
+ assert(new_value->empty());
+ ++num_merge_operator_calls;
+ if (existing_value == nullptr) {
+ new_value->assign(value.data(), value.size());
+ return true;
+ }
+
+ return mergeOperator_->PartialMerge(
+ key,
+ *existing_value,
+ value,
+ new_value,
+ logger);
+ }
+
+ bool PartialMergeMulti(const Slice& key,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value,
+ Logger* logger) const override {
+ assert(new_value->empty());
+ ++num_partial_merge_calls;
+ return mergeOperator_->PartialMergeMulti(key, operand_list, new_value,
+ logger);
+ }
+
+ const char* Name() const override { return "UInt64AddOperator"; }
+
+ private:
+ std::shared_ptr<MergeOperator> mergeOperator_;
+};
+
+std::shared_ptr<DB> OpenDb(const std::string& dbname, const bool ttl = false,
+ const size_t max_successive_merges = 0) {
+ DB* db;
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator = std::make_shared<CountMergeOperator>();
+ options.max_successive_merges = max_successive_merges;
+ Status s;
+ DestroyDB(dbname, Options());
+// DBWithTTL is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+ if (ttl) {
+ DBWithTTL* db_with_ttl;
+ s = DBWithTTL::Open(options, dbname, &db_with_ttl);
+ db = db_with_ttl;
+ } else {
+ s = DB::Open(options, dbname, &db);
+ }
+#else
+ assert(!ttl);
+ s = DB::Open(options, dbname, &db);
+#endif // !ROCKSDB_LITE
+ if (!s.ok()) {
+ std::cerr << s.ToString() << std::endl;
+ assert(false);
+ }
+ return std::shared_ptr<DB>(db);
+}
+
+// Imagine we are maintaining a set of uint64 counters.
+// Each counter has a distinct name, and we would like
+// to support four high-level operations:
+// set, add, get and remove.
+// This is a quick implementation without a Merge operation.
+class Counters {
+
+ protected:
+ std::shared_ptr<DB> db_;
+
+ WriteOptions put_option_;
+ ReadOptions get_option_;
+ WriteOptions delete_option_;
+
+ uint64_t default_;
+
+ public:
+ explicit Counters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
+ : db_(db),
+ put_option_(),
+ get_option_(),
+ delete_option_(),
+ default_(defaultCount) {
+ assert(db_);
+ }
+
+ virtual ~Counters() {}
+
+ // public interface of Counters.
+ // All four functions return false
+ // if the underlying level db operation failed.
+
+  // mapped to a rocksdb Put
+ bool set(const std::string& key, uint64_t value) {
+ // just treat the internal rep of int64 as the string
+ char buf[sizeof(value)];
+ EncodeFixed64(buf, value);
+ Slice slice(buf, sizeof(value));
+ auto s = db_->Put(put_option_, key, slice);
+
+ if (s.ok()) {
+ return true;
+ } else {
+ std::cerr << s.ToString() << std::endl;
+ return false;
+ }
+ }
+
+ // mapped to a rocksdb Delete
+ bool remove(const std::string& key) {
+ auto s = db_->Delete(delete_option_, key);
+
+ if (s.ok()) {
+ return true;
+ } else {
+ std::cerr << s.ToString() << std::endl;
+ return false;
+ }
+ }
+
+ // mapped to a rocksdb Get
+ bool get(const std::string& key, uint64_t* value) {
+ std::string str;
+ auto s = db_->Get(get_option_, key, &str);
+
+ if (s.IsNotFound()) {
+ // return default value if not found;
+ *value = default_;
+ return true;
+ } else if (s.ok()) {
+ // deserialization
+ if (str.size() != sizeof(uint64_t)) {
+ std::cerr << "value corruption\n";
+ return false;
+ }
+ *value = DecodeFixed64(&str[0]);
+ return true;
+ } else {
+ std::cerr << s.ToString() << std::endl;
+ return false;
+ }
+ }
+
+ // 'add' is implemented as get -> modify -> set
+ // An alternative is a single merge operation, see MergeBasedCounters
+ virtual bool add(const std::string& key, uint64_t value) {
+ uint64_t base = default_;
+ return get(key, &base) && set(key, base + value);
+ }
+
+
+ // convenience functions for testing
+ void assert_set(const std::string& key, uint64_t value) {
+ assert(set(key, value));
+ }
+
+ void assert_remove(const std::string& key) { assert(remove(key)); }
+
+ uint64_t assert_get(const std::string& key) {
+ uint64_t value = default_;
+ int result = get(key, &value);
+ assert(result);
+ if (result == 0) exit(1); // Disable unused variable warning.
+ return value;
+ }
+
+ void assert_add(const std::string& key, uint64_t value) {
+ int result = add(key, value);
+ assert(result);
+ if (result == 0) exit(1); // Disable unused variable warning.
+ }
+};
+
+// Implement 'add' directly with the new Merge operation
+class MergeBasedCounters : public Counters {
+ private:
+ WriteOptions merge_option_; // for merge
+
+ public:
+ explicit MergeBasedCounters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
+ : Counters(db, defaultCount),
+ merge_option_() {
+ }
+
+ // mapped to a rocksdb Merge operation
+ bool add(const std::string& key, uint64_t value) override {
+ char encoded[sizeof(uint64_t)];
+ EncodeFixed64(encoded, value);
+ Slice slice(encoded, sizeof(uint64_t));
+ auto s = db_->Merge(merge_option_, key, slice);
+
+ if (s.ok()) {
+ return true;
+ } else {
+ std::cerr << s.ToString() << std::endl;
+ return false;
+ }
+ }
+};
+
+void dumpDb(DB* db) {
+ auto it = std::unique_ptr<Iterator>(db->NewIterator(ReadOptions()));
+ for (it->SeekToFirst(); it->Valid(); it->Next()) {
+ //uint64_t value = DecodeFixed64(it->value().data());
+ //std::cout << it->key().ToString() << ": " << value << std::endl;
+ }
+ assert(it->status().ok()); // Check for any errors found during the scan
+}
+
+void testCounters(Counters& counters, DB* db, bool test_compaction) {
+
+ FlushOptions o;
+ o.wait = true;
+
+ counters.assert_set("a", 1);
+
+ if (test_compaction) db->Flush(o);
+
+ assert(counters.assert_get("a") == 1);
+
+ counters.assert_remove("b");
+
+  // default value is 0 if non-existent
+ assert(counters.assert_get("b") == 0);
+
+ counters.assert_add("a", 2);
+
+ if (test_compaction) db->Flush(o);
+
+ // 1+2 = 3
+ assert(counters.assert_get("a")== 3);
+
+ dumpDb(db);
+
+ // 1+...+49 = ?
+ uint64_t sum = 0;
+ for (int i = 1; i < 50; i++) {
+ counters.assert_add("b", i);
+ sum += i;
+ }
+ assert(counters.assert_get("b") == sum);
+
+ dumpDb(db);
+
+ if (test_compaction) {
+ db->Flush(o);
+
+ db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ dumpDb(db);
+
+ assert(counters.assert_get("a")== 3);
+ assert(counters.assert_get("b") == sum);
+ }
+}
+
+void testSuccessiveMerge(Counters& counters, size_t max_num_merges,
+ size_t num_merges) {
+
+ counters.assert_remove("z");
+ uint64_t sum = 0;
+
+ for (size_t i = 1; i <= num_merges; ++i) {
+ resetNumMergeOperatorCalls();
+ counters.assert_add("z", i);
+ sum += i;
+
+ if (i % (max_num_merges + 1) == 0) {
+ assert(num_merge_operator_calls == max_num_merges + 1);
+ } else {
+ assert(num_merge_operator_calls == 0);
+ }
+
+ resetNumMergeOperatorCalls();
+ assert(counters.assert_get("z") == sum);
+ assert(num_merge_operator_calls == i % (max_num_merges + 1));
+ }
+}
+
+void testPartialMerge(Counters* counters, DB* db, size_t max_merge,
+ size_t min_merge, size_t count) {
+ FlushOptions o;
+ o.wait = true;
+
+ // Test case 1: partial merge should be called when the number of merge
+ // operands exceeds the threshold.
+ uint64_t tmp_sum = 0;
+ resetNumPartialMergeCalls();
+ for (size_t i = 1; i <= count; i++) {
+ counters->assert_add("b", i);
+ tmp_sum += i;
+ }
+ db->Flush(o);
+ db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(tmp_sum, counters->assert_get("b"));
+ if (count > max_merge) {
+ // in this case, FullMerge should be called instead.
+ ASSERT_EQ(num_partial_merge_calls, 0U);
+ } else {
+ // if count >= min_merge, then partial merge should be called once.
+ ASSERT_EQ((count >= min_merge), (num_partial_merge_calls == 1));
+ }
+
+ // Test case 2: partial merge should not be called when a put is found.
+ resetNumPartialMergeCalls();
+ tmp_sum = 0;
+ db->Put(ROCKSDB_NAMESPACE::WriteOptions(), "c", "10");
+ for (size_t i = 1; i <= count; i++) {
+ counters->assert_add("c", i);
+ tmp_sum += i;
+ }
+ db->Flush(o);
+ db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(tmp_sum, counters->assert_get("c"));
+ ASSERT_EQ(num_partial_merge_calls, 0U);
+}
+
+void testSingleBatchSuccessiveMerge(DB* db, size_t max_num_merges,
+ size_t num_merges) {
+ assert(num_merges > max_num_merges);
+
+ Slice key("BatchSuccessiveMerge");
+ uint64_t merge_value = 1;
+ char buf[sizeof(merge_value)];
+ EncodeFixed64(buf, merge_value);
+ Slice merge_value_slice(buf, sizeof(merge_value));
+
+ // Create the batch
+ WriteBatch batch;
+ for (size_t i = 0; i < num_merges; ++i) {
+ batch.Merge(key, merge_value_slice);
+ }
+
+ // Apply to memtable and count the number of merges
+ resetNumMergeOperatorCalls();
+ {
+ Status s = db->Write(WriteOptions(), &batch);
+ assert(s.ok());
+ }
+ ASSERT_EQ(
+ num_merge_operator_calls,
+ static_cast<size_t>(num_merges - (num_merges % (max_num_merges + 1))));
+
+ // Get the value
+ resetNumMergeOperatorCalls();
+ std::string get_value_str;
+ {
+ Status s = db->Get(ReadOptions(), key, &get_value_str);
+ assert(s.ok());
+ }
+ assert(get_value_str.size() == sizeof(uint64_t));
+ uint64_t get_value = DecodeFixed64(&get_value_str[0]);
+ ASSERT_EQ(get_value, num_merges * merge_value);
+ ASSERT_EQ(num_merge_operator_calls,
+ static_cast<size_t>((num_merges % (max_num_merges + 1))));
+}
+
+void runTest(const std::string& dbname, const bool use_ttl = false) {
+
+ {
+ auto db = OpenDb(dbname, use_ttl);
+
+ {
+ Counters counters(db, 0);
+ testCounters(counters, db.get(), true);
+ }
+
+ {
+ MergeBasedCounters counters(db, 0);
+ testCounters(counters, db.get(), use_compression);
+ }
+ }
+
+ DestroyDB(dbname, Options());
+
+ {
+ size_t max_merge = 5;
+ auto db = OpenDb(dbname, use_ttl, max_merge);
+ MergeBasedCounters counters(db, 0);
+ testCounters(counters, db.get(), use_compression);
+ testSuccessiveMerge(counters, max_merge, max_merge * 2);
+ testSingleBatchSuccessiveMerge(db.get(), 5, 7);
+ DestroyDB(dbname, Options());
+ }
+
+ {
+ size_t max_merge = 100;
+ // Min merge is hard-coded to 2.
+ uint32_t min_merge = 2;
+ for (uint32_t count = min_merge - 1; count <= min_merge + 1; count++) {
+ auto db = OpenDb(dbname, use_ttl, max_merge);
+ MergeBasedCounters counters(db, 0);
+ testPartialMerge(&counters, db.get(), max_merge, min_merge, count);
+ DestroyDB(dbname, Options());
+ }
+ {
+ auto db = OpenDb(dbname, use_ttl, max_merge);
+ MergeBasedCounters counters(db, 0);
+ testPartialMerge(&counters, db.get(), max_merge, min_merge,
+ min_merge * 10);
+ DestroyDB(dbname, Options());
+ }
+ }
+
+ {
+ {
+ auto db = OpenDb(dbname);
+ MergeBasedCounters counters(db, 0);
+ counters.add("test-key", 1);
+ counters.add("test-key", 1);
+ counters.add("test-key", 1);
+ db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ }
+
+ DB* reopen_db;
+ ASSERT_OK(DB::Open(Options(), dbname, &reopen_db));
+ std::string value;
+ ASSERT_TRUE(!(reopen_db->Get(ReadOptions(), "test-key", &value).ok()));
+ delete reopen_db;
+ DestroyDB(dbname, Options());
+ }
+
+ /* Temporary remove this test
+ {
+ std::cout << "Test merge-operator not set after reopen (recovery case)\n";
+ {
+ auto db = OpenDb(dbname);
+ MergeBasedCounters counters(db, 0);
+ counters.add("test-key", 1);
+ counters.add("test-key", 1);
+ counters.add("test-key", 1);
+ }
+
+ DB* reopen_db;
+ ASSERT_TRUE(DB::Open(Options(), dbname, &reopen_db).IsInvalidArgument());
+ }
+ */
+}
+
+TEST_F(MergeTest, MergeDbTest) {
+ runTest(test::PerThreadDBPath("merge_testdb"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(MergeTest, MergeDbTtlTest) {
+ runTest(test::PerThreadDBPath("merge_testdbttl"),
+ true); // Run test on TTL database
+}
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::use_compression = false;
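+  // Any command-line argument enables compression for the test run.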
+ if (argc > 1) {
+ ROCKSDB_NAMESPACE::use_compression = true;
+ }
+
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/obsolete_files_test.cc b/src/rocksdb/db/obsolete_files_test.cc
new file mode 100644
index 000000000..bf018a0e3
--- /dev/null
+++ b/src/rocksdb/db/obsolete_files_test.cc
@@ -0,0 +1,222 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include <stdlib.h>
+#include <map>
+#include <string>
+#include <vector>
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/transaction_log.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+using std::cerr;
+using std::cout;
+using std::endl;
+using std::flush;
+
+namespace ROCKSDB_NAMESPACE {
+
+class ObsoleteFilesTest : public DBTestBase {
+ public:
+ ObsoleteFilesTest()
+ : DBTestBase("/obsolete_files_test"), wal_dir_(dbname_ + "/wal_files") {}
+
+ void AddKeys(int numkeys, int startkey) {
+ WriteOptions options;
+ options.sync = false;
+    for (int i = startkey; i < (numkeys + startkey); i++) {
+ std::string temp = ToString(i);
+ Slice key(temp);
+ Slice value(temp);
+ ASSERT_OK(db_->Put(options, key, value));
+ }
+ }
+
+ void createLevel0Files(int numFiles, int numKeysPerFile) {
+ int startKey = 0;
+ for (int i = 0; i < numFiles; i++) {
+ AddKeys(numKeysPerFile, startKey);
+ startKey += numKeysPerFile;
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ }
+
+ void CheckFileTypeCounts(const std::string& dir, int required_log,
+ int required_sst, int required_manifest) {
+ std::vector<std::string> filenames;
+ env_->GetChildren(dir, &filenames);
+
+ int log_cnt = 0;
+ int sst_cnt = 0;
+ int manifest_cnt = 0;
+ for (auto file : filenames) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(file, &number, &type)) {
+ log_cnt += (type == kLogFile);
+ sst_cnt += (type == kTableFile);
+ manifest_cnt += (type == kDescriptorFile);
+ }
+ }
+ ASSERT_EQ(required_log, log_cnt);
+ ASSERT_EQ(required_sst, sst_cnt);
+ ASSERT_EQ(required_manifest, manifest_cnt);
+ }
+
+ void ReopenDB() {
+ Options options = CurrentOptions();
+ // Trigger compaction when the number of level 0 files reaches 2.
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.disable_auto_compactions = false;
+ options.delete_obsolete_files_period_micros = 0; // always do full purge
+ options.enable_thread_tracking = true;
+ options.write_buffer_size = 1024 * 1024 * 1000;
+ options.target_file_size_base = 1024 * 1024 * 1000;
+ options.max_bytes_for_level_base = 1024 * 1024 * 1000;
+ options.WAL_ttl_seconds = 300; // Used to test log files
+ options.WAL_size_limit_MB = 1024; // Used to test log files
+ options.wal_dir = wal_dir_;
+ Destroy(options);
+ Reopen(options);
+ }
+
+ const std::string wal_dir_;
+};
+
+TEST_F(ObsoleteFilesTest, RaceForObsoleteFileDeletion) {
+ ReopenDB();
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::BackgroundCallCompaction:FoundObsoleteFiles",
+ "ObsoleteFilesTest::RaceForObsoleteFileDeletion:1"},
+ {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
+ "ObsoleteFilesTest::RaceForObsoleteFileDeletion:2"},
+ });
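+  // Each dependency above makes the test sync point wait for the listed
+  // DBImpl sync point, so the user thread below runs FindObsoleteFiles() only
+  // after the background compaction has found obsolete files, and
+  // PurgeObsoleteFiles() only after the background purge has finished,
+  // exercising the race between the two cleanup paths.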
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DeleteObsoleteFileImpl:AfterDeletion", [&](void* arg) {
+ Status* p_status = reinterpret_cast<Status*>(arg);
+ ASSERT_OK(*p_status);
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::CloseHelper:PendingPurgeFinished", [&](void* arg) {
+ std::unordered_set<uint64_t>* files_grabbed_for_purge_ptr =
+ reinterpret_cast<std::unordered_set<uint64_t>*>(arg);
+ ASSERT_TRUE(files_grabbed_for_purge_ptr->empty());
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ createLevel0Files(2, 50000);
+ CheckFileTypeCounts(wal_dir_, 1, 0, 0);
+
+ port::Thread user_thread([this]() {
+ JobContext jobCxt(0);
+ TEST_SYNC_POINT("ObsoleteFilesTest::RaceForObsoleteFileDeletion:1");
+ dbfull()->TEST_LockMutex();
+ dbfull()->FindObsoleteFiles(&jobCxt, true /* force=true */,
+ false /* no_full_scan=false */);
+ dbfull()->TEST_UnlockMutex();
+ TEST_SYNC_POINT("ObsoleteFilesTest::RaceForObsoleteFileDeletion:2");
+ dbfull()->PurgeObsoleteFiles(jobCxt);
+ jobCxt.Clean();
+ });
+
+ user_thread.join();
+}
+
+TEST_F(ObsoleteFilesTest, DeleteObsoleteOptionsFile) {
+ ReopenDB();
+ SyncPoint::GetInstance()->DisableProcessing();
+ std::vector<uint64_t> optsfiles_nums;
+ std::vector<bool> optsfiles_keep;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::PurgeObsoleteFiles:CheckOptionsFiles:1", [&](void* arg) {
+ optsfiles_nums.push_back(*reinterpret_cast<uint64_t*>(arg));
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::PurgeObsoleteFiles:CheckOptionsFiles:2", [&](void* arg) {
+ optsfiles_keep.push_back(*reinterpret_cast<bool*>(arg));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ createLevel0Files(2, 50000);
+ CheckFileTypeCounts(wal_dir_, 1, 0, 0);
+
+ ASSERT_OK(dbfull()->DisableFileDeletions());
+ for (int i = 0; i != 4; ++i) {
+ if (i % 2) {
+ ASSERT_OK(dbfull()->SetOptions(dbfull()->DefaultColumnFamily(),
+ {{"paranoid_file_checks", "false"}}));
+ } else {
+ ASSERT_OK(dbfull()->SetOptions(dbfull()->DefaultColumnFamily(),
+ {{"paranoid_file_checks", "true"}}));
+ }
+ }
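+  // Each SetOptions() call above persists a new OPTIONS file. With file
+  // deletions disabled they all stay on disk; re-enabling deletions below is
+  // expected to purge the stale ones (the final check asserts only two
+  // remain).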
+ ASSERT_OK(dbfull()->EnableFileDeletions(true /* force */));
+ ASSERT_EQ(optsfiles_nums.size(), optsfiles_keep.size());
+
+ Close();
+
+ std::vector<std::string> files;
+ int opts_file_count = 0;
+ ASSERT_OK(env_->GetChildren(dbname_, &files));
+ for (const auto& file : files) {
+ uint64_t file_num;
+ Slice dummy_info_log_name_prefix;
+ FileType type;
+ WalFileType log_type;
+ if (ParseFileName(file, &file_num, dummy_info_log_name_prefix, &type,
+ &log_type) &&
+ type == kOptionsFile) {
+ opts_file_count++;
+ }
+ }
+ ASSERT_EQ(2, opts_file_count);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+extern "C" {
+void RegisterCustomObjects(int argc, char** argv);
+}
+#else
+void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {}
+#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as DBImpl::DeleteFile is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/options_file_test.cc b/src/rocksdb/db/options_file_test.cc
new file mode 100644
index 000000000..00427de8a
--- /dev/null
+++ b/src/rocksdb/db/options_file_test.cc
@@ -0,0 +1,119 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include <string>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+class OptionsFileTest : public testing::Test {
+ public:
+ OptionsFileTest() : dbname_(test::PerThreadDBPath("options_file_test")) {}
+
+ std::string dbname_;
+};
+
+namespace {
+void UpdateOptionsFiles(DB* db,
+ std::unordered_set<std::string>* filename_history,
+ int* options_files_count) {
+ std::vector<std::string> filenames;
+ db->GetEnv()->GetChildren(db->GetName(), &filenames);
+ uint64_t number;
+ FileType type;
+ *options_files_count = 0;
+ for (auto filename : filenames) {
+ if (ParseFileName(filename, &number, &type) && type == kOptionsFile) {
+ filename_history->insert(filename);
+ (*options_files_count)++;
+ }
+ }
+}
+
+// Verify whether the current Options Files are the latest ones.
+void VerifyOptionsFileName(
+ DB* db, const std::unordered_set<std::string>& past_filenames) {
+ std::vector<std::string> filenames;
+ std::unordered_set<std::string> current_filenames;
+ db->GetEnv()->GetChildren(db->GetName(), &filenames);
+ uint64_t number;
+ FileType type;
+ for (auto filename : filenames) {
+ if (ParseFileName(filename, &number, &type) && type == kOptionsFile) {
+ current_filenames.insert(filename);
+ }
+ }
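+  // Any previously seen options file that is no longer present must be older
+  // than (i.e. sort before) every options file currently on disk.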
+ for (auto past_filename : past_filenames) {
+ if (current_filenames.find(past_filename) != current_filenames.end()) {
+ continue;
+ }
+ for (auto filename : current_filenames) {
+ ASSERT_GT(filename, past_filename);
+ }
+ }
+}
+} // namespace
+
+TEST_F(OptionsFileTest, NumberOfOptionsFiles) {
+ const int kReopenCount = 20;
+ Options opt;
+ opt.create_if_missing = true;
+ DestroyDB(dbname_, opt);
+ std::unordered_set<std::string> filename_history;
+ DB* db;
+ for (int i = 0; i < kReopenCount; ++i) {
+ ASSERT_OK(DB::Open(opt, dbname_, &db));
+ int num_options_files = 0;
+ UpdateOptionsFiles(db, &filename_history, &num_options_files);
+ ASSERT_GT(num_options_files, 0);
+ ASSERT_LE(num_options_files, 2);
+ // Make sure we always keep the latest option files.
+ VerifyOptionsFileName(db, filename_history);
+ delete db;
+ }
+}
+
+TEST_F(OptionsFileTest, OptionsFileName) {
+ const uint64_t kOptionsFileNum = 12345;
+ uint64_t number;
+ FileType type;
+
+ auto options_file_name = OptionsFileName("", kOptionsFileNum);
+ ASSERT_TRUE(ParseFileName(options_file_name, &number, &type, nullptr));
+ ASSERT_EQ(type, kOptionsFile);
+ ASSERT_EQ(number, kOptionsFileNum);
+
+ const uint64_t kTempOptionsFileNum = 54352;
+ auto temp_options_file_name = TempOptionsFileName("", kTempOptionsFileNum);
+ ASSERT_TRUE(ParseFileName(temp_options_file_name, &number, &type, nullptr));
+ ASSERT_NE(temp_options_file_name.find(kTempFileNameSuffix),
+ std::string::npos);
+ ASSERT_EQ(type, kTempFile);
+ ASSERT_EQ(number, kTempOptionsFileNum);
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+#else
+ return 0;
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+}
+#else
+
+#include <cstdio>
+
+int main(int /*argc*/, char** /*argv*/) {
+ printf("Skipped as Options file is not supported in RocksDBLite.\n");
+ return 0;
+}
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/perf_context_test.cc b/src/rocksdb/db/perf_context_test.cc
new file mode 100644
index 000000000..86f2db7b6
--- /dev/null
+++ b/src/rocksdb/db/perf_context_test.cc
@@ -0,0 +1,981 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include <algorithm>
+#include <iostream>
+#include <thread>
+#include <vector>
+
+#include "monitoring/histogram.h"
+#include "monitoring/instrumented_mutex.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice_transform.h"
+#include "test_util/testharness.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+bool FLAGS_random_key = false;
+bool FLAGS_use_set_based_memetable = false;
+int FLAGS_total_keys = 100;
+int FLAGS_write_buffer_size = 1000000000;
+int FLAGS_max_write_buffer_number = 8;
+int FLAGS_min_write_buffer_number_to_merge = 7;
+bool FLAGS_verbose = false;
+
+// Path to the database on file system
+const std::string kDbName =
+ ROCKSDB_NAMESPACE::test::PerThreadDBPath("perf_context_test");
+
+namespace ROCKSDB_NAMESPACE {
+
+std::shared_ptr<DB> OpenDb(bool read_only = false) {
+ DB* db;
+ Options options;
+ options.create_if_missing = true;
+ options.max_open_files = -1;
+ options.write_buffer_size = FLAGS_write_buffer_size;
+ options.max_write_buffer_number = FLAGS_max_write_buffer_number;
+ options.min_write_buffer_number_to_merge =
+ FLAGS_min_write_buffer_number_to_merge;
+
+ if (FLAGS_use_set_based_memetable) {
+#ifndef ROCKSDB_LITE
+ options.prefix_extractor.reset(
+ ROCKSDB_NAMESPACE::NewFixedPrefixTransform(0));
+ options.memtable_factory.reset(NewHashSkipListRepFactory());
+#endif // ROCKSDB_LITE
+ }
+
+ Status s;
+ if (!read_only) {
+ s = DB::Open(options, kDbName, &db);
+ } else {
+ s = DB::OpenForReadOnly(options, kDbName, &db);
+ }
+ EXPECT_OK(s);
+ return std::shared_ptr<DB>(db);
+}
+
+class PerfContextTest : public testing::Test {};
+
+TEST_F(PerfContextTest, SeekIntoDeletion) {
+ DestroyDB(kDbName, Options());
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+
+ for (int i = 0; i < FLAGS_total_keys; ++i) {
+ std::string key = "k" + ToString(i);
+ std::string value = "v" + ToString(i);
+
+ db->Put(write_options, key, value);
+ }
+
+ for (int i = 0; i < FLAGS_total_keys -1 ; ++i) {
+ std::string key = "k" + ToString(i);
+ db->Delete(write_options, key);
+ }
+
+ HistogramImpl hist_get;
+ HistogramImpl hist_get_time;
+ for (int i = 0; i < FLAGS_total_keys - 1; ++i) {
+ std::string key = "k" + ToString(i);
+ std::string value;
+
+ get_perf_context()->Reset();
+ StopWatchNano timer(Env::Default());
+ timer.Start();
+ auto status = db->Get(read_options, key, &value);
+ auto elapsed_nanos = timer.ElapsedNanos();
+ ASSERT_TRUE(status.IsNotFound());
+ hist_get.Add(get_perf_context()->user_key_comparison_count);
+ hist_get_time.Add(elapsed_nanos);
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << "Get user key comparison: \n" << hist_get.ToString()
+ << "Get time: \n" << hist_get_time.ToString();
+ }
+
+ {
+ HistogramImpl hist_seek_to_first;
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+ get_perf_context()->Reset();
+ StopWatchNano timer(Env::Default(), true);
+ iter->SeekToFirst();
+ hist_seek_to_first.Add(get_perf_context()->user_key_comparison_count);
+ auto elapsed_nanos = timer.ElapsedNanos();
+
+ if (FLAGS_verbose) {
+ std::cout << "SeekToFirst uesr key comparison: \n"
+ << hist_seek_to_first.ToString()
+ << "ikey skipped: " << get_perf_context()->internal_key_skipped_count
+ << "\n"
+ << "idelete skipped: "
+ << get_perf_context()->internal_delete_skipped_count << "\n"
+ << "elapsed: " << elapsed_nanos << "\n";
+ }
+ }
+
+ HistogramImpl hist_seek;
+ for (int i = 0; i < FLAGS_total_keys; ++i) {
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ std::string key = "k" + ToString(i);
+
+ get_perf_context()->Reset();
+ StopWatchNano timer(Env::Default(), true);
+ iter->Seek(key);
+ auto elapsed_nanos = timer.ElapsedNanos();
+ hist_seek.Add(get_perf_context()->user_key_comparison_count);
+ if (FLAGS_verbose) {
+ std::cout << "seek cmp: " << get_perf_context()->user_key_comparison_count
+ << " ikey skipped " << get_perf_context()->internal_key_skipped_count
+ << " idelete skipped "
+ << get_perf_context()->internal_delete_skipped_count
+ << " elapsed: " << elapsed_nanos << "ns\n";
+ }
+
+ get_perf_context()->Reset();
+ ASSERT_TRUE(iter->Valid());
+ StopWatchNano timer2(Env::Default(), true);
+ iter->Next();
+ auto elapsed_nanos2 = timer2.ElapsedNanos();
+ if (FLAGS_verbose) {
+ std::cout << "next cmp: " << get_perf_context()->user_key_comparison_count
+ << "elapsed: " << elapsed_nanos2 << "ns\n";
+ }
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << "Seek uesr key comparison: \n" << hist_seek.ToString();
+ }
+}
+
+TEST_F(PerfContextTest, StopWatchNanoOverhead) {
+ // profile the timer cost by itself!
+ const int kTotalIterations = 1000000;
+ std::vector<uint64_t> timings(kTotalIterations);
+
+ StopWatchNano timer(Env::Default(), true);
+ for (auto& timing : timings) {
+ timing = timer.ElapsedNanos(true /* reset */);
+ }
+
+ HistogramImpl histogram;
+ for (const auto timing : timings) {
+ histogram.Add(timing);
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << histogram.ToString();
+ }
+}
+
+TEST_F(PerfContextTest, StopWatchOverhead) {
+ // profile the timer cost by itself!
+ const int kTotalIterations = 1000000;
+ uint64_t elapsed = 0;
+ std::vector<uint64_t> timings(kTotalIterations);
+
+ StopWatch timer(Env::Default(), nullptr, 0, &elapsed);
+ for (auto& timing : timings) {
+ timing = elapsed;
+ }
+
+ HistogramImpl histogram;
+ uint64_t prev_timing = 0;
+ for (const auto timing : timings) {
+ histogram.Add(timing - prev_timing);
+ prev_timing = timing;
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << histogram.ToString();
+ }
+}
+
+void ProfileQueries(bool enabled_time = false) {
+ DestroyDB(kDbName, Options()); // Start this test with a fresh DB
+
+ auto db = OpenDb();
+
+ WriteOptions write_options;
+ ReadOptions read_options;
+
+ HistogramImpl hist_put;
+
+ HistogramImpl hist_get;
+ HistogramImpl hist_get_snapshot;
+ HistogramImpl hist_get_memtable;
+ HistogramImpl hist_get_files;
+ HistogramImpl hist_get_post_process;
+ HistogramImpl hist_num_memtable_checked;
+
+ HistogramImpl hist_mget;
+ HistogramImpl hist_mget_snapshot;
+ HistogramImpl hist_mget_memtable;
+ HistogramImpl hist_mget_files;
+ HistogramImpl hist_mget_post_process;
+ HistogramImpl hist_mget_num_memtable_checked;
+
+ HistogramImpl hist_write_pre_post;
+ HistogramImpl hist_write_wal_time;
+ HistogramImpl hist_write_memtable_time;
+ HistogramImpl hist_write_delay_time;
+ HistogramImpl hist_write_thread_wait_nanos;
+ HistogramImpl hist_write_scheduling_time;
+
+ uint64_t total_db_mutex_nanos = 0;
+
+ if (FLAGS_verbose) {
+ std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
+ }
+
+ std::vector<int> keys;
+ const int kFlushFlag = -1;
+ for (int i = 0; i < FLAGS_total_keys; ++i) {
+ keys.push_back(i);
+ if (i == FLAGS_total_keys / 2) {
+ // Issuing a flush in the middle.
+ keys.push_back(kFlushFlag);
+ }
+ }
+
+ if (FLAGS_random_key) {
+ std::random_shuffle(keys.begin(), keys.end());
+ }
+#ifndef NDEBUG
+ ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 1U);
+#endif
+ int num_mutex_waited = 0;
+ for (const int i : keys) {
+ if (i == kFlushFlag) {
+ FlushOptions fo;
+ db->Flush(fo);
+ continue;
+ }
+
+ std::string key = "k" + ToString(i);
+ std::string value = "v" + ToString(i);
+
+ std::vector<std::string> values;
+
+ get_perf_context()->Reset();
+ db->Put(write_options, key, value);
+ if (++num_mutex_waited > 3) {
+#ifndef NDEBUG
+ ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U);
+#endif
+ }
+ hist_write_pre_post.Add(
+ get_perf_context()->write_pre_and_post_process_time);
+ hist_write_wal_time.Add(get_perf_context()->write_wal_time);
+ hist_write_memtable_time.Add(get_perf_context()->write_memtable_time);
+ hist_write_delay_time.Add(get_perf_context()->write_delay_time);
+ hist_write_thread_wait_nanos.Add(
+ get_perf_context()->write_thread_wait_nanos);
+ hist_write_scheduling_time.Add(
+ get_perf_context()->write_scheduling_flushes_compactions_time);
+ hist_put.Add(get_perf_context()->user_key_comparison_count);
+ total_db_mutex_nanos += get_perf_context()->db_mutex_lock_nanos;
+ }
+#ifndef NDEBUG
+ ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U);
+#endif
+
+ for (const int i : keys) {
+ if (i == kFlushFlag) {
+ continue;
+ }
+ std::string key = "k" + ToString(i);
+ std::string expected_value = "v" + ToString(i);
+ std::string value;
+
+ std::vector<Slice> multiget_keys = {Slice(key)};
+ std::vector<std::string> values;
+
+ get_perf_context()->Reset();
+ ASSERT_OK(db->Get(read_options, key, &value));
+ ASSERT_EQ(expected_value, value);
+ hist_get_snapshot.Add(get_perf_context()->get_snapshot_time);
+ hist_get_memtable.Add(get_perf_context()->get_from_memtable_time);
+ hist_get_files.Add(get_perf_context()->get_from_output_files_time);
+ hist_num_memtable_checked.Add(get_perf_context()->get_from_memtable_count);
+ hist_get_post_process.Add(get_perf_context()->get_post_process_time);
+ hist_get.Add(get_perf_context()->user_key_comparison_count);
+
+ get_perf_context()->Reset();
+ db->MultiGet(read_options, multiget_keys, &values);
+ hist_mget_snapshot.Add(get_perf_context()->get_snapshot_time);
+ hist_mget_memtable.Add(get_perf_context()->get_from_memtable_time);
+ hist_mget_files.Add(get_perf_context()->get_from_output_files_time);
+ hist_mget_num_memtable_checked.Add(get_perf_context()->get_from_memtable_count);
+ hist_mget_post_process.Add(get_perf_context()->get_post_process_time);
+ hist_mget.Add(get_perf_context()->user_key_comparison_count);
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << "Put uesr key comparison: \n" << hist_put.ToString()
+ << "Get uesr key comparison: \n" << hist_get.ToString()
+ << "MultiGet uesr key comparison: \n" << hist_get.ToString();
+ std::cout << "Put(): Pre and Post Process Time: \n"
+ << hist_write_pre_post.ToString() << " Writing WAL time: \n"
+ << hist_write_wal_time.ToString() << "\n"
+ << " Writing Mem Table time: \n"
+ << hist_write_memtable_time.ToString() << "\n"
+ << " Write Delay: \n" << hist_write_delay_time.ToString() << "\n"
+ << " Waiting for Batch time: \n"
+ << hist_write_thread_wait_nanos.ToString() << "\n"
+ << " Scheduling Flushes and Compactions Time: \n"
+ << hist_write_scheduling_time.ToString() << "\n"
+ << " Total DB mutex nanos: \n" << total_db_mutex_nanos << "\n";
+
+ std::cout << "Get(): Time to get snapshot: \n"
+ << hist_get_snapshot.ToString()
+ << " Time to get value from memtables: \n"
+ << hist_get_memtable.ToString() << "\n"
+ << " Time to get value from output files: \n"
+ << hist_get_files.ToString() << "\n"
+ << " Number of memtables checked: \n"
+ << hist_num_memtable_checked.ToString() << "\n"
+ << " Time to post process: \n" << hist_get_post_process.ToString()
+ << "\n";
+
+ std::cout << "MultiGet(): Time to get snapshot: \n"
+ << hist_mget_snapshot.ToString()
+ << " Time to get value from memtables: \n"
+ << hist_mget_memtable.ToString() << "\n"
+ << " Time to get value from output files: \n"
+ << hist_mget_files.ToString() << "\n"
+ << " Number of memtables checked: \n"
+ << hist_mget_num_memtable_checked.ToString() << "\n"
+ << " Time to post process: \n"
+ << hist_mget_post_process.ToString() << "\n";
+ }
+
+ if (enabled_time) {
+ ASSERT_GT(hist_get.Average(), 0);
+ ASSERT_GT(hist_get_snapshot.Average(), 0);
+ ASSERT_GT(hist_get_memtable.Average(), 0);
+ ASSERT_GT(hist_get_files.Average(), 0);
+ ASSERT_GT(hist_get_post_process.Average(), 0);
+ ASSERT_GT(hist_num_memtable_checked.Average(), 0);
+
+ ASSERT_GT(hist_mget.Average(), 0);
+ ASSERT_GT(hist_mget_snapshot.Average(), 0);
+ ASSERT_GT(hist_mget_memtable.Average(), 0);
+ ASSERT_GT(hist_mget_files.Average(), 0);
+ ASSERT_GT(hist_mget_post_process.Average(), 0);
+ ASSERT_GT(hist_mget_num_memtable_checked.Average(), 0);
+
+ EXPECT_GT(hist_write_pre_post.Average(), 0);
+ EXPECT_GT(hist_write_wal_time.Average(), 0);
+ EXPECT_GT(hist_write_memtable_time.Average(), 0);
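+    // With single-threaded writes into a large memtable there should be no
+    // write stalls and no waiting behind another write-group leader, so these
+    // two timers are expected to stay at zero.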
+ EXPECT_EQ(hist_write_delay_time.Average(), 0);
+ EXPECT_EQ(hist_write_thread_wait_nanos.Average(), 0);
+ EXPECT_GT(hist_write_scheduling_time.Average(), 0);
+
+#ifndef NDEBUG
+ ASSERT_GT(total_db_mutex_nanos, 2000U);
+#endif
+ }
+
+ db.reset();
+ db = OpenDb(true);
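+  // Reopen read-only and repeat the same Get()/MultiGet() measurements; in
+  // this mode Get() needs no super-version post-processing, which the
+  // assertions at the end rely on.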
+
+ hist_get.Clear();
+ hist_get_snapshot.Clear();
+ hist_get_memtable.Clear();
+ hist_get_files.Clear();
+ hist_get_post_process.Clear();
+ hist_num_memtable_checked.Clear();
+
+ hist_mget.Clear();
+ hist_mget_snapshot.Clear();
+ hist_mget_memtable.Clear();
+ hist_mget_files.Clear();
+ hist_mget_post_process.Clear();
+ hist_mget_num_memtable_checked.Clear();
+
+ for (const int i : keys) {
+ if (i == kFlushFlag) {
+ continue;
+ }
+ std::string key = "k" + ToString(i);
+ std::string expected_value = "v" + ToString(i);
+ std::string value;
+
+ std::vector<Slice> multiget_keys = {Slice(key)};
+ std::vector<std::string> values;
+
+ get_perf_context()->Reset();
+ ASSERT_OK(db->Get(read_options, key, &value));
+ ASSERT_EQ(expected_value, value);
+ hist_get_snapshot.Add(get_perf_context()->get_snapshot_time);
+ hist_get_memtable.Add(get_perf_context()->get_from_memtable_time);
+ hist_get_files.Add(get_perf_context()->get_from_output_files_time);
+ hist_num_memtable_checked.Add(get_perf_context()->get_from_memtable_count);
+ hist_get_post_process.Add(get_perf_context()->get_post_process_time);
+ hist_get.Add(get_perf_context()->user_key_comparison_count);
+
+ get_perf_context()->Reset();
+ db->MultiGet(read_options, multiget_keys, &values);
+ hist_mget_snapshot.Add(get_perf_context()->get_snapshot_time);
+ hist_mget_memtable.Add(get_perf_context()->get_from_memtable_time);
+ hist_mget_files.Add(get_perf_context()->get_from_output_files_time);
+ hist_mget_num_memtable_checked.Add(get_perf_context()->get_from_memtable_count);
+ hist_mget_post_process.Add(get_perf_context()->get_post_process_time);
+ hist_mget.Add(get_perf_context()->user_key_comparison_count);
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << "ReadOnly Get uesr key comparison: \n" << hist_get.ToString()
+ << "ReadOnly MultiGet uesr key comparison: \n"
+ << hist_mget.ToString();
+
+ std::cout << "ReadOnly Get(): Time to get snapshot: \n"
+ << hist_get_snapshot.ToString()
+ << " Time to get value from memtables: \n"
+ << hist_get_memtable.ToString() << "\n"
+ << " Time to get value from output files: \n"
+ << hist_get_files.ToString() << "\n"
+ << " Number of memtables checked: \n"
+ << hist_num_memtable_checked.ToString() << "\n"
+ << " Time to post process: \n" << hist_get_post_process.ToString()
+ << "\n";
+
+ std::cout << "ReadOnly MultiGet(): Time to get snapshot: \n"
+ << hist_mget_snapshot.ToString()
+ << " Time to get value from memtables: \n"
+ << hist_mget_memtable.ToString() << "\n"
+ << " Time to get value from output files: \n"
+ << hist_mget_files.ToString() << "\n"
+ << " Number of memtables checked: \n"
+ << hist_mget_num_memtable_checked.ToString() << "\n"
+ << " Time to post process: \n"
+ << hist_mget_post_process.ToString() << "\n";
+ }
+
+ if (enabled_time) {
+ ASSERT_GT(hist_get.Average(), 0);
+ ASSERT_GT(hist_get_memtable.Average(), 0);
+ ASSERT_GT(hist_get_files.Average(), 0);
+ ASSERT_GT(hist_num_memtable_checked.Average(), 0);
+ // In read-only mode Get(), no super version operation is needed
+ ASSERT_EQ(hist_get_post_process.Average(), 0);
+ ASSERT_GT(hist_get_snapshot.Average(), 0);
+
+ ASSERT_GT(hist_mget.Average(), 0);
+ ASSERT_GT(hist_mget_snapshot.Average(), 0);
+ ASSERT_GT(hist_mget_memtable.Average(), 0);
+ ASSERT_GT(hist_mget_files.Average(), 0);
+ ASSERT_GT(hist_mget_post_process.Average(), 0);
+ ASSERT_GT(hist_mget_num_memtable_checked.Average(), 0);
+ }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(PerfContextTest, KeyComparisonCount) {
+ SetPerfLevel(kEnableCount);
+ ProfileQueries();
+
+ SetPerfLevel(kDisable);
+ ProfileQueries();
+
+ SetPerfLevel(kEnableTime);
+ ProfileQueries(true);
+}
+#endif // ROCKSDB_LITE
+
+// make perf_context_test
+// export ROCKSDB_TESTS=PerfContextTest.SeekKeyComparison
+// For one memtable:
+// ./perf_context_test --write_buffer_size=500000 --total_keys=10000
+// For two memtables:
+// ./perf_context_test --write_buffer_size=250000 --total_keys=10000
+// Specify --random_key=1 to shuffle the key before insertion
+// Results show that, for sequential insertion, the worst-case Seek key
+// comparison count is close to the total number of keys (linear) when there
+// is only one memtable. With two memtables, even the average Seek key
+// comparison count becomes linear in the input size.
+
+TEST_F(PerfContextTest, SeekKeyComparison) {
+ DestroyDB(kDbName, Options());
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+
+ if (FLAGS_verbose) {
+ std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
+ }
+
+ std::vector<int> keys;
+ for (int i = 0; i < FLAGS_total_keys; ++i) {
+ keys.push_back(i);
+ }
+
+ if (FLAGS_random_key) {
+ std::random_shuffle(keys.begin(), keys.end());
+ }
+
+ HistogramImpl hist_put_time;
+ HistogramImpl hist_wal_time;
+ HistogramImpl hist_time_diff;
+
+ SetPerfLevel(kEnableTime);
+ StopWatchNano timer(Env::Default());
+ for (const int i : keys) {
+ std::string key = "k" + ToString(i);
+ std::string value = "v" + ToString(i);
+
+ get_perf_context()->Reset();
+ timer.Start();
+ db->Put(write_options, key, value);
+ auto put_time = timer.ElapsedNanos();
+ hist_put_time.Add(put_time);
+ hist_wal_time.Add(get_perf_context()->write_wal_time);
+ hist_time_diff.Add(put_time - get_perf_context()->write_wal_time);
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << "Put time:\n" << hist_put_time.ToString() << "WAL time:\n"
+ << hist_wal_time.ToString() << "time diff:\n"
+ << hist_time_diff.ToString();
+ }
+
+ HistogramImpl hist_seek;
+ HistogramImpl hist_next;
+
+ for (int i = 0; i < FLAGS_total_keys; ++i) {
+ std::string key = "k" + ToString(i);
+ std::string value = "v" + ToString(i);
+
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ get_perf_context()->Reset();
+ iter->Seek(key);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->value().ToString(), value);
+ hist_seek.Add(get_perf_context()->user_key_comparison_count);
+ }
+
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ for (iter->SeekToFirst(); iter->Valid();) {
+ get_perf_context()->Reset();
+ iter->Next();
+ hist_next.Add(get_perf_context()->user_key_comparison_count);
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << "Seek:\n" << hist_seek.ToString() << "Next:\n"
+ << hist_next.ToString();
+ }
+}
+
+TEST_F(PerfContextTest, DBMutexLockCounter) {
+ int stats_code[] = {0, static_cast<int>(DB_MUTEX_WAIT_MICROS)};
+ for (PerfLevel perf_level_test :
+ {PerfLevel::kEnableTimeExceptForMutex, PerfLevel::kEnableTime}) {
+ for (int c = 0; c < 2; ++c) {
+ InstrumentedMutex mutex(nullptr, Env::Default(), stats_code[c]);
+ mutex.Lock();
+ ROCKSDB_NAMESPACE::port::Thread child_thread([&] {
+ SetPerfLevel(perf_level_test);
+ get_perf_context()->Reset();
+ ASSERT_EQ(get_perf_context()->db_mutex_lock_nanos, 0);
+ mutex.Lock();
+ mutex.Unlock();
+ if (perf_level_test == PerfLevel::kEnableTimeExceptForMutex ||
+ stats_code[c] != DB_MUTEX_WAIT_MICROS) {
+ ASSERT_EQ(get_perf_context()->db_mutex_lock_nanos, 0);
+ } else {
+ // increment the counter only when it's a DB Mutex
+ ASSERT_GT(get_perf_context()->db_mutex_lock_nanos, 0);
+ }
+ });
+ Env::Default()->SleepForMicroseconds(100);
+ mutex.Unlock();
+ child_thread.join();
+ }
+ }
+}
+
+TEST_F(PerfContextTest, FalseDBMutexWait) {
+ SetPerfLevel(kEnableTime);
+ int stats_code[] = {0, static_cast<int>(DB_MUTEX_WAIT_MICROS)};
+ for (int c = 0; c < 2; ++c) {
+ InstrumentedMutex mutex(nullptr, Env::Default(), stats_code[c]);
+ InstrumentedCondVar lock(&mutex);
+ get_perf_context()->Reset();
+ mutex.Lock();
+ lock.TimedWait(100);
+ mutex.Unlock();
+ if (stats_code[c] == static_cast<int>(DB_MUTEX_WAIT_MICROS)) {
+ // increment the counter only when it's a DB Mutex
+ ASSERT_GT(get_perf_context()->db_condition_wait_nanos, 0);
+ } else {
+ ASSERT_EQ(get_perf_context()->db_condition_wait_nanos, 0);
+ }
+ }
+}
+
+TEST_F(PerfContextTest, ToString) {
+ get_perf_context()->Reset();
+ get_perf_context()->block_read_count = 12345;
+
+ std::string zero_included = get_perf_context()->ToString();
+ ASSERT_NE(std::string::npos, zero_included.find("= 0"));
+ ASSERT_NE(std::string::npos, zero_included.find("= 12345"));
+
+ std::string zero_excluded = get_perf_context()->ToString(true);
+ ASSERT_EQ(std::string::npos, zero_excluded.find("= 0"));
+ ASSERT_NE(std::string::npos, zero_excluded.find("= 12345"));
+}
+
+TEST_F(PerfContextTest, MergeOperatorTime) {
+ DestroyDB(kDbName, Options());
+ DB* db;
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ Status s = DB::Open(options, kDbName, &db);
+ EXPECT_OK(s);
+
+ std::string val;
+ ASSERT_OK(db->Merge(WriteOptions(), "k1", "val1"));
+ ASSERT_OK(db->Merge(WriteOptions(), "k1", "val2"));
+ ASSERT_OK(db->Merge(WriteOptions(), "k1", "val3"));
+ ASSERT_OK(db->Merge(WriteOptions(), "k1", "val4"));
+
+ SetPerfLevel(kEnableTime);
+ get_perf_context()->Reset();
+ ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+#ifdef OS_SOLARIS
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+ }
+#endif
+ EXPECT_GT(get_perf_context()->merge_operator_time_nanos, 0);
+
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ get_perf_context()->Reset();
+ ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+#ifdef OS_SOLARIS
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+ }
+#endif
+ EXPECT_GT(get_perf_context()->merge_operator_time_nanos, 0);
+
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ get_perf_context()->Reset();
+ ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+#ifdef OS_SOLARIS
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+ }
+#endif
+ EXPECT_GT(get_perf_context()->merge_operator_time_nanos, 0);
+
+ delete db;
+}
+
+TEST_F(PerfContextTest, CopyAndMove) {
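+  // Per-level perf context data should be deep-copied (or moved) into the
+  // target; clearing and resetting the source afterwards must not change the
+  // values observed through the copy.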
+ // Assignment operator
+ {
+ get_perf_context()->Reset();
+ get_perf_context()->EnablePerLevelPerfContext();
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5);
+ ASSERT_EQ(
+ 1,
+ (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful);
+ PerfContext perf_context_assign;
+ perf_context_assign = *get_perf_context();
+ ASSERT_EQ(
+ 1,
+ (*(perf_context_assign.level_to_perf_context))[5].bloom_filter_useful);
+ get_perf_context()->ClearPerLevelPerfContext();
+ get_perf_context()->Reset();
+ ASSERT_EQ(
+ 1,
+ (*(perf_context_assign.level_to_perf_context))[5].bloom_filter_useful);
+ perf_context_assign.ClearPerLevelPerfContext();
+ perf_context_assign.Reset();
+ }
+ // Copy constructor
+ {
+ get_perf_context()->Reset();
+ get_perf_context()->EnablePerLevelPerfContext();
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5);
+ ASSERT_EQ(
+ 1,
+ (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful);
+ PerfContext perf_context_copy(*get_perf_context());
+ ASSERT_EQ(
+ 1, (*(perf_context_copy.level_to_perf_context))[5].bloom_filter_useful);
+ get_perf_context()->ClearPerLevelPerfContext();
+ get_perf_context()->Reset();
+ ASSERT_EQ(
+ 1, (*(perf_context_copy.level_to_perf_context))[5].bloom_filter_useful);
+ perf_context_copy.ClearPerLevelPerfContext();
+ perf_context_copy.Reset();
+ }
+ // Move constructor
+ {
+ get_perf_context()->Reset();
+ get_perf_context()->EnablePerLevelPerfContext();
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5);
+ ASSERT_EQ(
+ 1,
+ (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful);
+ PerfContext perf_context_move = std::move(*get_perf_context());
+ ASSERT_EQ(
+ 1, (*(perf_context_move.level_to_perf_context))[5].bloom_filter_useful);
+ get_perf_context()->ClearPerLevelPerfContext();
+ get_perf_context()->Reset();
+ ASSERT_EQ(
+ 1, (*(perf_context_move.level_to_perf_context))[5].bloom_filter_useful);
+ perf_context_move.ClearPerLevelPerfContext();
+ perf_context_move.Reset();
+ }
+}
+
+TEST_F(PerfContextTest, PerfContextDisableEnable) {
+ get_perf_context()->Reset();
+ get_perf_context()->EnablePerLevelPerfContext();
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, 0);
+ get_perf_context()->DisablePerLevelPerfContext();
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5);
+ get_perf_context()->EnablePerLevelPerfContext();
+ PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, 0);
+ get_perf_context()->DisablePerLevelPerfContext();
+ PerfContext perf_context_copy(*get_perf_context());
+ ASSERT_EQ(1, (*(perf_context_copy.level_to_perf_context))[0]
+ .bloom_filter_full_positive);
+  // This was set while the per-level perf context was disabled, so it should
+  // not have been copied.
+ ASSERT_NE(
+ 1, (*(perf_context_copy.level_to_perf_context))[5].bloom_filter_useful);
+ ASSERT_EQ(
+ 1, (*(perf_context_copy.level_to_perf_context))[0].block_cache_hit_count);
+ perf_context_copy.ClearPerLevelPerfContext();
+ perf_context_copy.Reset();
+ get_perf_context()->ClearPerLevelPerfContext();
+ get_perf_context()->Reset();
+}
+
+TEST_F(PerfContextTest, PerfContextByLevelGetSet) {
+ get_perf_context()->Reset();
+ get_perf_context()->EnablePerLevelPerfContext();
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, 0);
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5);
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 7);
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 7);
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, 2);
+ PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, 0);
+ PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 5, 2);
+ PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 2, 3);
+ PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 4, 1);
+ ASSERT_EQ(
+ 0, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+ ASSERT_EQ(
+ 1, (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful);
+ ASSERT_EQ(
+ 2, (*(get_perf_context()->level_to_perf_context))[7].bloom_filter_useful);
+ ASSERT_EQ(1, (*(get_perf_context()->level_to_perf_context))[0]
+ .bloom_filter_full_positive);
+ ASSERT_EQ(1, (*(get_perf_context()->level_to_perf_context))[2]
+ .bloom_filter_full_true_positive);
+ ASSERT_EQ(1, (*(get_perf_context()->level_to_perf_context))[0]
+ .block_cache_hit_count);
+ ASSERT_EQ(5, (*(get_perf_context()->level_to_perf_context))[2]
+ .block_cache_hit_count);
+ ASSERT_EQ(2, (*(get_perf_context()->level_to_perf_context))[3]
+ .block_cache_miss_count);
+ ASSERT_EQ(4, (*(get_perf_context()->level_to_perf_context))[1]
+ .block_cache_miss_count);
+ std::string zero_excluded = get_perf_context()->ToString(true);
+ ASSERT_NE(std::string::npos,
+ zero_excluded.find("bloom_filter_useful = 1@level5, 2@level7"));
+ ASSERT_NE(std::string::npos,
+ zero_excluded.find("bloom_filter_full_positive = 1@level0"));
+ ASSERT_NE(std::string::npos,
+ zero_excluded.find("bloom_filter_full_true_positive = 1@level2"));
+ ASSERT_NE(std::string::npos,
+ zero_excluded.find("block_cache_hit_count = 1@level0, 5@level2"));
+ ASSERT_NE(std::string::npos,
+ zero_excluded.find("block_cache_miss_count = 4@level1, 2@level3"));
+}
+
+TEST_F(PerfContextTest, CPUTimer) {
+ DestroyDB(kDbName, Options());
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+ SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
+
+ std::string max_str = "0";
+ for (int i = 0; i < FLAGS_total_keys; ++i) {
+ std::string i_str = ToString(i);
+ std::string key = "k" + i_str;
+ std::string value = "v" + i_str;
+ max_str = max_str > i_str ? max_str : i_str;
+
+ db->Put(write_options, key, value);
+ }
+ std::string last_key = "k" + max_str;
+ std::string last_value = "v" + max_str;
+
+ {
+ // Get
+ get_perf_context()->Reset();
+ std::string value;
+ ASSERT_OK(db->Get(read_options, "k0", &value));
+ ASSERT_EQ(value, "v0");
+
+ if (FLAGS_verbose) {
+ std::cout << "Get CPU time nanos: " << get_perf_context()->get_cpu_nanos
+ << "ns\n";
+ }
+
+ // Iter
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+ // Seek
+ get_perf_context()->Reset();
+ iter->Seek(last_key);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(last_value, iter->value().ToString());
+
+ if (FLAGS_verbose) {
+ std::cout << "Iter Seek CPU time nanos: "
+ << get_perf_context()->iter_seek_cpu_nanos << "ns\n";
+ }
+
+ // SeekForPrev
+ get_perf_context()->Reset();
+ iter->SeekForPrev(last_key);
+ ASSERT_TRUE(iter->Valid());
+
+ if (FLAGS_verbose) {
+ std::cout << "Iter SeekForPrev CPU time nanos: "
+ << get_perf_context()->iter_seek_cpu_nanos << "ns\n";
+ }
+
+ // SeekToLast
+ get_perf_context()->Reset();
+ iter->SeekToLast();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(last_value, iter->value().ToString());
+
+ if (FLAGS_verbose) {
+ std::cout << "Iter SeekToLast CPU time nanos: "
+ << get_perf_context()->iter_seek_cpu_nanos << "ns\n";
+ }
+
+ // SeekToFirst
+ get_perf_context()->Reset();
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v0", iter->value().ToString());
+
+ if (FLAGS_verbose) {
+ std::cout << "Iter SeekToFirst CPU time nanos: "
+ << get_perf_context()->iter_seek_cpu_nanos << "ns\n";
+ }
+
+ // Next
+ get_perf_context()->Reset();
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v1", iter->value().ToString());
+
+ if (FLAGS_verbose) {
+ std::cout << "Iter Next CPU time nanos: "
+ << get_perf_context()->iter_next_cpu_nanos << "ns\n";
+ }
+
+ // Prev
+ get_perf_context()->Reset();
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v0", iter->value().ToString());
+
+ if (FLAGS_verbose) {
+ std::cout << "Iter Prev CPU time nanos: "
+ << get_perf_context()->iter_prev_cpu_nanos << "ns\n";
+ }
+
+ // monotonically increasing
+ get_perf_context()->Reset();
+ auto count = get_perf_context()->iter_seek_cpu_nanos;
+ for (int i = 0; i < FLAGS_total_keys; ++i) {
+ iter->Seek("k" + ToString(i));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v" + ToString(i), iter->value().ToString());
+ auto next_count = get_perf_context()->iter_seek_cpu_nanos;
+ ASSERT_GT(next_count, count);
+ count = next_count;
+ }
+
+ // iterator creation/destruction; multiple iterators
+ {
+ std::unique_ptr<Iterator> iter2(db->NewIterator(read_options));
+ ASSERT_EQ(count, get_perf_context()->iter_seek_cpu_nanos);
+ iter2->Seek(last_key);
+ ASSERT_TRUE(iter2->Valid());
+ ASSERT_EQ(last_value, iter2->value().ToString());
+ ASSERT_GT(get_perf_context()->iter_seek_cpu_nanos, count);
+ count = get_perf_context()->iter_seek_cpu_nanos;
+ }
+ ASSERT_EQ(count, get_perf_context()->iter_seek_cpu_nanos);
+ }
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+
+ for (int i = 1; i < argc; i++) {
+ int n;
+ char junk;
+
+ if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
+ FLAGS_write_buffer_size = n;
+ }
+
+ if (sscanf(argv[i], "--total_keys=%d%c", &n, &junk) == 1) {
+ FLAGS_total_keys = n;
+ }
+
+ if (sscanf(argv[i], "--random_key=%d%c", &n, &junk) == 1 &&
+ (n == 0 || n == 1)) {
+ FLAGS_random_key = n;
+ }
+
+ if (sscanf(argv[i], "--use_set_based_memetable=%d%c", &n, &junk) == 1 &&
+ (n == 0 || n == 1)) {
+ FLAGS_use_set_based_memetable = n;
+ }
+
+ if (sscanf(argv[i], "--verbose=%d%c", &n, &junk) == 1 &&
+ (n == 0 || n == 1)) {
+ FLAGS_verbose = n;
+ }
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << kDbName << "\n";
+ }
+
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/pinned_iterators_manager.h b/src/rocksdb/db/pinned_iterators_manager.h
new file mode 100644
index 000000000..5e8ad51dd
--- /dev/null
+++ b/src/rocksdb/db/pinned_iterators_manager.h
@@ -0,0 +1,87 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include <algorithm>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// PinnedIteratorsManager will be notified whenever we need to pin an Iterator
+// and it will be responsible for deleting pinned Iterators when they are
+// not needed anymore.
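+//
+// Typical usage (sketch, based on the interface below):
+//   PinnedIteratorsManager pinned_iters_mgr;
+//   pinned_iters_mgr.StartPinning();
+//   pinned_iters_mgr.PinIterator(iter);   // takes ownership of iter
+//   ...                                   // iter stays valid while pinned
+//   pinned_iters_mgr.ReleasePinnedData(); // deletes every pinned iterator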
+class PinnedIteratorsManager : public Cleanable {
+ public:
+ PinnedIteratorsManager() : pinning_enabled(false) {}
+ ~PinnedIteratorsManager() {
+ if (pinning_enabled) {
+ ReleasePinnedData();
+ }
+ }
+
+ // Enable Iterators pinning
+ void StartPinning() {
+ assert(pinning_enabled == false);
+ pinning_enabled = true;
+ }
+
+  // Is pinning enabled?
+ bool PinningEnabled() { return pinning_enabled; }
+
+ // Take ownership of iter and delete it when ReleasePinnedData() is called
+ void PinIterator(InternalIterator* iter, bool arena = false) {
+ if (arena) {
+ PinPtr(iter, &PinnedIteratorsManager::ReleaseArenaInternalIterator);
+ } else {
+ PinPtr(iter, &PinnedIteratorsManager::ReleaseInternalIterator);
+ }
+ }
+
+ typedef void (*ReleaseFunction)(void* arg1);
+ void PinPtr(void* ptr, ReleaseFunction release_func) {
+ assert(pinning_enabled);
+ if (ptr == nullptr) {
+ return;
+ }
+ pinned_ptrs_.emplace_back(ptr, release_func);
+ }
+
+ // Release pinned Iterators
+ inline void ReleasePinnedData() {
+ assert(pinning_enabled == true);
+ pinning_enabled = false;
+
+ // Remove duplicate pointers
+ std::sort(pinned_ptrs_.begin(), pinned_ptrs_.end());
+ auto unique_end = std::unique(pinned_ptrs_.begin(), pinned_ptrs_.end());
+
+ for (auto i = pinned_ptrs_.begin(); i != unique_end; ++i) {
+ void* ptr = i->first;
+ ReleaseFunction release_func = i->second;
+ release_func(ptr);
+ }
+ pinned_ptrs_.clear();
+ // Also do cleanups from the base Cleanable
+ Cleanable::Reset();
+ }
+
+ private:
+ static void ReleaseInternalIterator(void* ptr) {
+ delete reinterpret_cast<InternalIterator*>(ptr);
+ }
+
+ static void ReleaseArenaInternalIterator(void* ptr) {
+ reinterpret_cast<InternalIterator*>(ptr)->~InternalIterator();
+ }
+
+ bool pinning_enabled;
+ std::vector<std::pair<void*, ReleaseFunction>> pinned_ptrs_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/plain_table_db_test.cc b/src/rocksdb/db/plain_table_db_test.cc
new file mode 100644
index 000000000..d80cc4f67
--- /dev/null
+++ b/src/rocksdb/db/plain_table_db_test.cc
@@ -0,0 +1,1375 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <set>
+
+#include "db/db_impl/db_impl.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "table/meta_blocks.h"
+#include "table/plain/plain_table_bloom.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/plain/plain_table_key_coding.h"
+#include "table/plain/plain_table_reader.h"
+#include "table/table_builder.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+using std::unique_ptr;
+
+namespace ROCKSDB_NAMESPACE {
+class PlainTableKeyDecoderTest : public testing::Test {};
+
+TEST_F(PlainTableKeyDecoderTest, ReadNonMmap) {
+ std::string tmp;
+ Random rnd(301);
+ const uint32_t kLength = 2222;
+ Slice contents = test::RandomString(&rnd, kLength, &tmp);
+ test::StringSource* string_source =
+ new test::StringSource(contents, 0, false);
+
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ test::GetRandomAccessFileReader(string_source));
+ std::unique_ptr<PlainTableReaderFileInfo> file_info(
+ new PlainTableReaderFileInfo(std::move(file_reader), EnvOptions(),
+ kLength));
+
+ {
+ PlainTableFileReader reader(file_info.get());
+
+ const uint32_t kReadSize = 77;
+ for (uint32_t pos = 0; pos < kLength; pos += kReadSize) {
+ uint32_t read_size = std::min(kLength - pos, kReadSize);
+ Slice out;
+ ASSERT_TRUE(reader.Read(pos, read_size, &out));
+ ASSERT_EQ(0, out.compare(tmp.substr(pos, read_size)));
+ }
+
+ ASSERT_LT(uint32_t(string_source->total_reads()), kLength / kReadSize / 2);
+ }
+
+ std::vector<std::vector<std::pair<uint32_t, uint32_t>>> reads = {
+ {{600, 30}, {590, 30}, {600, 20}, {600, 40}},
+ {{800, 20}, {100, 20}, {500, 20}, {1500, 20}, {100, 20}, {80, 20}},
+ {{1000, 20}, {500, 20}, {1000, 50}},
+ {{1000, 20}, {500, 20}, {500, 20}},
+ {{1000, 20}, {500, 20}, {200, 20}, {500, 20}},
+ {{1000, 20}, {500, 20}, {200, 20}, {1000, 50}},
+ {{600, 500}, {610, 20}, {100, 20}},
+ {{500, 100}, {490, 100}, {550, 50}},
+ };
+
+ std::vector<int> num_file_reads = {2, 6, 2, 2, 4, 3, 2, 2};
+
+ for (size_t i = 0; i < reads.size(); i++) {
+ string_source->set_total_reads(0);
+ PlainTableFileReader reader(file_info.get());
+ for (auto p : reads[i]) {
+ Slice out;
+ ASSERT_TRUE(reader.Read(p.first, p.second, &out));
+ ASSERT_EQ(0, out.compare(tmp.substr(p.first, p.second)));
+ }
+ ASSERT_EQ(num_file_reads[i], string_source->total_reads());
+ }
+}
+
+class PlainTableDBTest : public testing::Test,
+ public testing::WithParamInterface<bool> {
+ protected:
+ private:
+ std::string dbname_;
+ Env* env_;
+ DB* db_;
+
+ bool mmap_mode_;
+ Options last_options_;
+
+ public:
+ PlainTableDBTest() : env_(Env::Default()) {}
+
+ ~PlainTableDBTest() override {
+ delete db_;
+ EXPECT_OK(DestroyDB(dbname_, Options()));
+ }
+
+ void SetUp() override {
+ mmap_mode_ = GetParam();
+ dbname_ = test::PerThreadDBPath("plain_table_db_test");
+ EXPECT_OK(DestroyDB(dbname_, Options()));
+ db_ = nullptr;
+ Reopen();
+ }
+
+ // Return the current option configuration.
+ Options CurrentOptions() {
+ Options options;
+
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 0;
+ plain_table_options.bloom_bits_per_key = 2;
+ plain_table_options.hash_table_ratio = 0.8;
+ plain_table_options.index_sparseness = 3;
+ plain_table_options.huge_page_tlb_size = 0;
+ plain_table_options.encoding_type = kPrefix;
+ plain_table_options.full_scan_mode = false;
+ plain_table_options.store_index_in_file = false;
+
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+ options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true));
+
+ options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+ options.allow_mmap_reads = mmap_mode_;
+ options.allow_concurrent_memtable_write = false;
+ options.unordered_write = false;
+ return options;
+ }
+
+ DBImpl* dbfull() {
+ return reinterpret_cast<DBImpl*>(db_);
+ }
+
+ void Reopen(Options* options = nullptr) {
+ ASSERT_OK(TryReopen(options));
+ }
+
+ void Close() {
+ delete db_;
+ db_ = nullptr;
+ }
+
+ bool mmap_mode() const { return mmap_mode_; }
+
+ void DestroyAndReopen(Options* options = nullptr) {
+    // Destroy using the last options.
+ Destroy(&last_options_);
+ ASSERT_OK(TryReopen(options));
+ }
+
+ void Destroy(Options* options) {
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(DestroyDB(dbname_, *options));
+ }
+
+ Status PureReopen(Options* options, DB** db) {
+ return DB::Open(*options, dbname_, db);
+ }
+
+ Status ReopenForReadOnly(Options* options) {
+ delete db_;
+ db_ = nullptr;
+ return DB::OpenForReadOnly(*options, dbname_, &db_);
+ }
+
+ Status TryReopen(Options* options = nullptr) {
+ delete db_;
+ db_ = nullptr;
+ Options opts;
+ if (options != nullptr) {
+ opts = *options;
+ } else {
+ opts = CurrentOptions();
+ opts.create_if_missing = true;
+ }
+ last_options_ = opts;
+
+ return DB::Open(opts, dbname_, &db_);
+ }
+
+ Status Put(const Slice& k, const Slice& v) {
+ return db_->Put(WriteOptions(), k, v);
+ }
+
+ Status Delete(const std::string& k) {
+ return db_->Delete(WriteOptions(), k);
+ }
+
+ std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
+ ReadOptions options;
+ options.snapshot = snapshot;
+ std::string result;
+ Status s = db_->Get(options, k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+
+
+ int NumTableFilesAtLevel(int level) {
+ std::string property;
+ EXPECT_TRUE(db_->GetProperty(
+ "rocksdb.num-files-at-level" + NumberToString(level), &property));
+ return atoi(property.c_str());
+ }
+
+ // Return spread of files per level
+ std::string FilesPerLevel() {
+ std::string result;
+ size_t last_non_zero_offset = 0;
+ for (int level = 0; level < db_->NumberLevels(); level++) {
+ int f = NumTableFilesAtLevel(level);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+ result += buf;
+ if (f > 0) {
+ last_non_zero_offset = result.size();
+ }
+ }
+ result.resize(last_non_zero_offset);
+ return result;
+ }
+
+ std::string IterStatus(Iterator* iter) {
+ std::string result;
+ if (iter->Valid()) {
+ result = iter->key().ToString() + "->" + iter->value().ToString();
+ } else {
+ result = "(invalid)";
+ }
+ return result;
+ }
+};
+
+TEST_P(PlainTableDBTest, Empty) {
+ ASSERT_TRUE(dbfull() != nullptr);
+ ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
+}
+
+extern const uint64_t kPlainTableMagicNumber;
+
+class TestPlainTableReader : public PlainTableReader {
+ public:
+ TestPlainTableReader(const EnvOptions& env_options,
+ const InternalKeyComparator& icomparator,
+ EncodingType encoding_type, uint64_t file_size,
+ int bloom_bits_per_key, double hash_table_ratio,
+ size_t index_sparseness,
+ const TableProperties* table_properties,
+ std::unique_ptr<RandomAccessFileReader>&& file,
+ const ImmutableCFOptions& ioptions,
+ const SliceTransform* prefix_extractor,
+ bool* expect_bloom_not_match, bool store_index_in_file,
+ uint32_t column_family_id,
+ const std::string& column_family_name)
+ : PlainTableReader(ioptions, std::move(file), env_options, icomparator,
+ encoding_type, file_size, table_properties,
+ prefix_extractor),
+ expect_bloom_not_match_(expect_bloom_not_match) {
+ Status s = MmapDataIfNeeded();
+ EXPECT_TRUE(s.ok());
+
+ s = PopulateIndex(const_cast<TableProperties*>(table_properties),
+ bloom_bits_per_key, hash_table_ratio, index_sparseness,
+ 2 * 1024 * 1024);
+ EXPECT_TRUE(s.ok());
+
+ TableProperties* props = const_cast<TableProperties*>(table_properties);
+ EXPECT_EQ(column_family_id, static_cast<uint32_t>(props->column_family_id));
+ EXPECT_EQ(column_family_name, props->column_family_name);
+ if (store_index_in_file) {
+ auto bloom_version_ptr = props->user_collected_properties.find(
+ PlainTablePropertyNames::kBloomVersion);
+ EXPECT_TRUE(bloom_version_ptr != props->user_collected_properties.end());
+ EXPECT_EQ(bloom_version_ptr->second, std::string("1"));
+ if (ioptions.bloom_locality > 0) {
+ auto num_blocks_ptr = props->user_collected_properties.find(
+ PlainTablePropertyNames::kNumBloomBlocks);
+ EXPECT_TRUE(num_blocks_ptr != props->user_collected_properties.end());
+ }
+ }
+ table_properties_.reset(props);
+ }
+
+ ~TestPlainTableReader() override {}
+
+ private:
+ bool MatchBloom(uint32_t hash) const override {
+ bool ret = PlainTableReader::MatchBloom(hash);
+ if (*expect_bloom_not_match_) {
+ EXPECT_TRUE(!ret);
+ } else {
+ EXPECT_TRUE(ret);
+ }
+ return ret;
+ }
+ bool* expect_bloom_not_match_;
+};
+
+extern const uint64_t kPlainTableMagicNumber;
+class TestPlainTableFactory : public PlainTableFactory {
+ public:
+ explicit TestPlainTableFactory(bool* expect_bloom_not_match,
+ const PlainTableOptions& options,
+ uint32_t column_family_id,
+ std::string column_family_name)
+ : PlainTableFactory(options),
+ bloom_bits_per_key_(options.bloom_bits_per_key),
+ hash_table_ratio_(options.hash_table_ratio),
+ index_sparseness_(options.index_sparseness),
+ store_index_in_file_(options.store_index_in_file),
+ expect_bloom_not_match_(expect_bloom_not_match),
+ column_family_id_(column_family_id),
+ column_family_name_(std::move(column_family_name)) {}
+
+ Status NewTableReader(
+ const TableReaderOptions& table_reader_options,
+ std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+ std::unique_ptr<TableReader>* table,
+ bool /*prefetch_index_and_filter_in_cache*/) const override {
+ TableProperties* props = nullptr;
+ auto s =
+ ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
+ table_reader_options.ioptions, &props,
+ true /* compression_type_missing */);
+ EXPECT_TRUE(s.ok());
+
+ if (store_index_in_file_) {
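+      // When the index was stored in the SST file, both the bloom meta block
+      // and the plain table index meta block must be locatable.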
+ BlockHandle bloom_block_handle;
+ s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber,
+ table_reader_options.ioptions,
+ BloomBlockBuilder::kBloomBlock, &bloom_block_handle,
+ /* compression_type_missing */ true);
+ EXPECT_TRUE(s.ok());
+
+ BlockHandle index_block_handle;
+ s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber,
+ table_reader_options.ioptions,
+ PlainTableIndexBuilder::kPlainTableIndexBlock,
+ &index_block_handle, /* compression_type_missing */ true);
+ EXPECT_TRUE(s.ok());
+ }
+
+ auto& user_props = props->user_collected_properties;
+ auto encoding_type_prop =
+ user_props.find(PlainTablePropertyNames::kEncodingType);
+ assert(encoding_type_prop != user_props.end());
+ EncodingType encoding_type = static_cast<EncodingType>(
+ DecodeFixed32(encoding_type_prop->second.c_str()));
+
+ std::unique_ptr<PlainTableReader> new_reader(new TestPlainTableReader(
+ table_reader_options.env_options,
+ table_reader_options.internal_comparator, encoding_type, file_size,
+ bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, props,
+ std::move(file), table_reader_options.ioptions,
+ table_reader_options.prefix_extractor, expect_bloom_not_match_,
+ store_index_in_file_, column_family_id_, column_family_name_));
+
+ *table = std::move(new_reader);
+ return s;
+ }
+
+ private:
+ int bloom_bits_per_key_;
+ double hash_table_ratio_;
+ size_t index_sparseness_;
+ bool store_index_in_file_;
+ bool* expect_bloom_not_match_;
+ const uint32_t column_family_id_;
+ const std::string column_family_name_;
+};
+
+TEST_P(PlainTableDBTest, BadOptions1) {
+ // Build with a prefix extractor
+ ASSERT_OK(Put("1000000000000foo", "v1"));
+ dbfull()->TEST_FlushMemTable();
+
+ // Bad attempt to re-open without a prefix extractor
+ Options options = CurrentOptions();
+ options.prefix_extractor.reset();
+ ASSERT_EQ(
+ "Invalid argument: Prefix extractor is missing when opening a PlainTable "
+ "built using a prefix extractor",
+ TryReopen(&options).ToString());
+
+ // Bad attempt to re-open with different prefix extractor
+ options.prefix_extractor.reset(NewFixedPrefixTransform(6));
+ ASSERT_EQ(
+ "Invalid argument: Prefix extractor given doesn't match the one used to "
+ "build PlainTable",
+ TryReopen(&options).ToString());
+
+ // Correct prefix extractor
+ options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+ Reopen(&options);
+ ASSERT_EQ("v1", Get("1000000000000foo"));
+}
+
+TEST_P(PlainTableDBTest, BadOptions2) {
+ Options options = CurrentOptions();
+ options.prefix_extractor.reset();
+ options.create_if_missing = true;
+ DestroyAndReopen(&options);
+ // Build without a prefix extractor
+ // (apparently works even if hash_table_ratio > 0)
+ ASSERT_OK(Put("1000000000000foo", "v1"));
+ dbfull()->TEST_FlushMemTable();
+
+ // Bad attempt to re-open with hash_table_ratio > 0 and no prefix extractor
+ Status s = TryReopen(&options);
+ ASSERT_EQ(
+ "Not implemented: PlainTable requires a prefix extractor enable prefix "
+ "hash mode.",
+ s.ToString());
+
+ // OK to open with hash_table_ratio == 0 and no prefix extractor
+ PlainTableOptions plain_table_options;
+ plain_table_options.hash_table_ratio = 0;
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+ Reopen(&options);
+ ASSERT_EQ("v1", Get("1000000000000foo"));
+
+ // OK to open newly with a prefix_extractor and hash table; builds index
+ // in memory.
+ options = CurrentOptions();
+ Reopen(&options);
+ ASSERT_EQ("v1", Get("1000000000000foo"));
+}
+
+TEST_P(PlainTableDBTest, Flush) {
+ for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+ huge_page_tlb_size += 2 * 1024 * 1024) {
+ for (EncodingType encoding_type : {kPlain, kPrefix}) {
+ for (int bloom = -1; bloom <= 117; bloom += 117) {
+ const int bloom_bits = std::max(bloom, 0);
+ const bool full_scan_mode = bloom < 0;
+ for (int total_order = 0; total_order <= 1; total_order++) {
+ for (int store_index_in_file = 0; store_index_in_file <= 1;
+ ++store_index_in_file) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+ if (total_order) {
+ options.prefix_extractor.reset();
+
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 0;
+ plain_table_options.bloom_bits_per_key = bloom_bits;
+ plain_table_options.hash_table_ratio = 0;
+ plain_table_options.index_sparseness = 2;
+ plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+ plain_table_options.encoding_type = encoding_type;
+ plain_table_options.full_scan_mode = full_scan_mode;
+ plain_table_options.store_index_in_file = store_index_in_file;
+
+ options.table_factory.reset(
+ NewPlainTableFactory(plain_table_options));
+ } else {
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 0;
+ plain_table_options.bloom_bits_per_key = bloom_bits;
+ plain_table_options.hash_table_ratio = 0.75;
+ plain_table_options.index_sparseness = 16;
+ plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+ plain_table_options.encoding_type = encoding_type;
+ plain_table_options.full_scan_mode = full_scan_mode;
+ plain_table_options.store_index_in_file = store_index_in_file;
+
+ options.table_factory.reset(
+ NewPlainTableFactory(plain_table_options));
+ }
+ DestroyAndReopen(&options);
+ uint64_t int_num;
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ "rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_EQ(int_num, 0U);
+
+ ASSERT_OK(Put("1000000000000foo", "v1"));
+ ASSERT_OK(Put("0000000000000bar", "v2"));
+ ASSERT_OK(Put("1000000000000foo", "v3"));
+ dbfull()->TEST_FlushMemTable();
+
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ "rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_GT(int_num, 0U);
+
+ TablePropertiesCollection ptc;
+            static_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
+ ASSERT_EQ(1U, ptc.size());
+ auto row = ptc.begin();
+ auto tp = row->second;
+
+ if (full_scan_mode) {
+ // Does not support Get/Seek
+ std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ReadOptions()));
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("0000000000000bar", iter->key().ToString());
+ ASSERT_EQ("v2", iter->value().ToString());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000000foo", iter->key().ToString());
+ ASSERT_EQ("v3", iter->value().ToString());
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_TRUE(iter->status().ok());
+ } else {
+ if (!store_index_in_file) {
+ ASSERT_EQ(total_order ? "4" : "12",
+ (tp->user_collected_properties)
+ .at("plain_table_hash_table_size"));
+ ASSERT_EQ("0", (tp->user_collected_properties)
+ .at("plain_table_sub_index_size"));
+ } else {
+ ASSERT_EQ("0", (tp->user_collected_properties)
+ .at("plain_table_hash_table_size"));
+ ASSERT_EQ("0", (tp->user_collected_properties)
+ .at("plain_table_sub_index_size"));
+ }
+ ASSERT_EQ("v3", Get("1000000000000foo"));
+ ASSERT_EQ("v2", Get("0000000000000bar"));
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+TEST_P(PlainTableDBTest, Flush2) {
+ for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+ huge_page_tlb_size += 2 * 1024 * 1024) {
+ for (EncodingType encoding_type : {kPlain, kPrefix}) {
+ for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
+ for (int total_order = 0; total_order <= 1; total_order++) {
+ for (int store_index_in_file = 0; store_index_in_file <= 1;
+ ++store_index_in_file) {
+ if (encoding_type == kPrefix && total_order) {
+ continue;
+ }
+ if (!bloom_bits && store_index_in_file) {
+ continue;
+ }
+ if (total_order && store_index_in_file) {
+ continue;
+ }
+ bool expect_bloom_not_match = false;
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+ PlainTableOptions plain_table_options;
+ if (total_order) {
+ options.prefix_extractor = nullptr;
+ plain_table_options.hash_table_ratio = 0;
+ plain_table_options.index_sparseness = 2;
+ } else {
+ plain_table_options.hash_table_ratio = 0.75;
+ plain_table_options.index_sparseness = 16;
+ }
+ plain_table_options.user_key_len = kPlainTableVariableLength;
+ plain_table_options.bloom_bits_per_key = bloom_bits;
+ plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+ plain_table_options.encoding_type = encoding_type;
+ plain_table_options.store_index_in_file = store_index_in_file;
+ options.table_factory.reset(new TestPlainTableFactory(
+ &expect_bloom_not_match, plain_table_options,
+ 0 /* column_family_id */, kDefaultColumnFamilyName));
+
+ DestroyAndReopen(&options);
+ ASSERT_OK(Put("0000000000000bar", "b"));
+ ASSERT_OK(Put("1000000000000foo", "v1"));
+ dbfull()->TEST_FlushMemTable();
+
+ ASSERT_OK(Put("1000000000000foo", "v2"));
+ dbfull()->TEST_FlushMemTable();
+ ASSERT_EQ("v2", Get("1000000000000foo"));
+
+ ASSERT_OK(Put("0000000000000eee", "v3"));
+ dbfull()->TEST_FlushMemTable();
+ ASSERT_EQ("v3", Get("0000000000000eee"));
+
+ ASSERT_OK(Delete("0000000000000bar"));
+ dbfull()->TEST_FlushMemTable();
+ ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));
+
+ ASSERT_OK(Put("0000000000000eee", "v5"));
+ ASSERT_OK(Put("9000000000000eee", "v5"));
+ dbfull()->TEST_FlushMemTable();
+ ASSERT_EQ("v5", Get("0000000000000eee"));
+
+ // Test Bloom Filter
+ if (bloom_bits > 0) {
+              // Neither the key nor its prefix exists, so the Bloom filter
+              // should not match.
+ expect_bloom_not_match = true;
+ ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar"));
+ // Key doesn't exist any more but prefix exists.
+ if (total_order) {
+ ASSERT_EQ("NOT_FOUND", Get("1000000000000not"));
+ ASSERT_EQ("NOT_FOUND", Get("0000000000000not"));
+ }
+ expect_bloom_not_match = false;
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+TEST_P(PlainTableDBTest, Immortal) {
+ for (EncodingType encoding_type : {kPlain, kPrefix}) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.max_open_files = -1;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+ PlainTableOptions plain_table_options;
+ plain_table_options.hash_table_ratio = 0.75;
+ plain_table_options.index_sparseness = 16;
+ plain_table_options.user_key_len = kPlainTableVariableLength;
+ plain_table_options.bloom_bits_per_key = 10;
+ plain_table_options.encoding_type = encoding_type;
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+
+ DestroyAndReopen(&options);
+ ASSERT_OK(Put("0000000000000bar", "b"));
+ ASSERT_OK(Put("1000000000000foo", "v1"));
+ dbfull()->TEST_FlushMemTable();
+
+ int copied = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "GetContext::SaveValue::PinSelf", [&](void* /*arg*/) { copied++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_EQ("b", Get("0000000000000bar"));
+ ASSERT_EQ("v1", Get("1000000000000foo"));
+ ASSERT_EQ(2, copied);
+ copied = 0;
+
+ Close();
+ ASSERT_OK(ReopenForReadOnly(&options));
+
+ ASSERT_EQ("b", Get("0000000000000bar"));
+ ASSERT_EQ("v1", Get("1000000000000foo"));
+ ASSERT_EQ("NOT_FOUND", Get("1000000000000bar"));
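+    // In read-only mmap mode the values can be returned as references into
+    // the mapped (immortal) file data, so no PinSelf copies are expected;
+    // otherwise each Get copies the value.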
+ if (mmap_mode()) {
+ ASSERT_EQ(0, copied);
+ } else {
+ ASSERT_EQ(2, copied);
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_P(PlainTableDBTest, Iterator) {
+ for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+ huge_page_tlb_size += 2 * 1024 * 1024) {
+ for (EncodingType encoding_type : {kPlain, kPrefix}) {
+ for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
+ for (int total_order = 0; total_order <= 1; total_order++) {
+ if (encoding_type == kPrefix && total_order == 1) {
+ continue;
+ }
+ bool expect_bloom_not_match = false;
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+ if (total_order) {
+ options.prefix_extractor = nullptr;
+
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 16;
+ plain_table_options.bloom_bits_per_key = bloom_bits;
+ plain_table_options.hash_table_ratio = 0;
+ plain_table_options.index_sparseness = 2;
+ plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+ plain_table_options.encoding_type = encoding_type;
+
+ options.table_factory.reset(new TestPlainTableFactory(
+ &expect_bloom_not_match, plain_table_options,
+ 0 /* column_family_id */, kDefaultColumnFamilyName));
+ } else {
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 16;
+ plain_table_options.bloom_bits_per_key = bloom_bits;
+ plain_table_options.hash_table_ratio = 0.75;
+ plain_table_options.index_sparseness = 16;
+ plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+ plain_table_options.encoding_type = encoding_type;
+
+ options.table_factory.reset(new TestPlainTableFactory(
+ &expect_bloom_not_match, plain_table_options,
+ 0 /* column_family_id */, kDefaultColumnFamilyName));
+ }
+ DestroyAndReopen(&options);
+
+ ASSERT_OK(Put("1000000000foo002", "v_2"));
+ ASSERT_OK(Put("0000000000000bar", "random"));
+ ASSERT_OK(Put("1000000000foo001", "v1"));
+ ASSERT_OK(Put("3000000000000bar", "bar_v"));
+ ASSERT_OK(Put("1000000000foo003", "v__3"));
+ ASSERT_OK(Put("1000000000foo004", "v__4"));
+ ASSERT_OK(Put("1000000000foo005", "v__5"));
+ ASSERT_OK(Put("1000000000foo007", "v__7"));
+ ASSERT_OK(Put("1000000000foo008", "v__8"));
+ dbfull()->TEST_FlushMemTable();
+ ASSERT_EQ("v1", Get("1000000000foo001"));
+ ASSERT_EQ("v__3", Get("1000000000foo003"));
+ Iterator* iter = dbfull()->NewIterator(ReadOptions());
+ iter->Seek("1000000000foo000");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo001", iter->key().ToString());
+ ASSERT_EQ("v1", iter->value().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo002", iter->key().ToString());
+ ASSERT_EQ("v_2", iter->value().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo003", iter->key().ToString());
+ ASSERT_EQ("v__3", iter->value().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo004", iter->key().ToString());
+ ASSERT_EQ("v__4", iter->value().ToString());
+
+ iter->Seek("3000000000000bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("3000000000000bar", iter->key().ToString());
+ ASSERT_EQ("bar_v", iter->value().ToString());
+
+ iter->Seek("1000000000foo000");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo001", iter->key().ToString());
+ ASSERT_EQ("v1", iter->value().ToString());
+
+ iter->Seek("1000000000foo005");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo005", iter->key().ToString());
+ ASSERT_EQ("v__5", iter->value().ToString());
+
+ iter->Seek("1000000000foo006");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo007", iter->key().ToString());
+ ASSERT_EQ("v__7", iter->value().ToString());
+
+ iter->Seek("1000000000foo008");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo008", iter->key().ToString());
+ ASSERT_EQ("v__8", iter->value().ToString());
+
+ if (total_order == 0) {
+ iter->Seek("1000000000foo009");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("3000000000000bar", iter->key().ToString());
+ }
+
+ // Test Bloom Filter
+ if (bloom_bits > 0) {
+ if (!total_order) {
+              // Neither the key nor its prefix exists, so the Bloom filter
+              // should not match.
+ expect_bloom_not_match = true;
+ iter->Seek("2not000000000bar");
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
+ expect_bloom_not_match = false;
+ } else {
+ expect_bloom_not_match = true;
+ ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
+ expect_bloom_not_match = false;
+ }
+ }
+
+ delete iter;
+ }
+ }
+ }
+ }
+}
+
+namespace {
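+// Builds a 16-byte key: the first four bytes hold the decimal digits of n
+// (least significant first) as raw byte values 0-9, the rest is 'filler'.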
+std::string NthKey(size_t n, char filler) {
+ std::string rv(16, filler);
+ rv[0] = n % 10;
+ rv[1] = (n / 10) % 10;
+ rv[2] = (n / 100) % 10;
+ rv[3] = (n / 1000) % 10;
+ return rv;
+}
+} // anonymous namespace
+
+TEST_P(PlainTableDBTest, BloomSchema) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ for (int bloom_locality = 0; bloom_locality <= 1; bloom_locality++) {
+ options.bloom_locality = bloom_locality;
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 16;
+ plain_table_options.bloom_bits_per_key = 3; // high FP rate for test
+ plain_table_options.hash_table_ratio = 0.75;
+ plain_table_options.index_sparseness = 16;
+ plain_table_options.huge_page_tlb_size = 0;
+ plain_table_options.encoding_type = kPlain;
+
+ bool expect_bloom_not_match = false;
+ options.table_factory.reset(new TestPlainTableFactory(
+ &expect_bloom_not_match, plain_table_options, 0 /* column_family_id */,
+ kDefaultColumnFamilyName));
+ DestroyAndReopen(&options);
+
+ for (unsigned i = 0; i < 2345; ++i) {
+ ASSERT_OK(Put(NthKey(i, 'y'), "added"));
+ }
+ dbfull()->TEST_FlushMemTable();
+ ASSERT_EQ("added", Get(NthKey(42, 'y')));
+
+ for (unsigned i = 0; i < 32; ++i) {
+ // Known pattern of Bloom filter false positives can detect schema change
+ // with high probability. Known FPs stuffed into bits:
+ uint32_t pattern;
+ if (!bloom_locality) {
+ pattern = 1785868347UL;
+ } else if (CACHE_LINE_SIZE == 64U) {
+ pattern = 2421694657UL;
+ } else if (CACHE_LINE_SIZE == 128U) {
+ pattern = 788710956UL;
+ } else {
+ ASSERT_EQ(CACHE_LINE_SIZE, 256U);
+ pattern = 163905UL;
+ }
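+      // Bit i of 'pattern' being set means key NthKey(i, 'n') is expected to
+      // be a Bloom false positive under this filter schema.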
+ bool expect_fp = pattern & (1UL << i);
+ // fprintf(stderr, "expect_fp@%u: %d\n", i, (int)expect_fp);
+ expect_bloom_not_match = !expect_fp;
+ ASSERT_EQ("NOT_FOUND", Get(NthKey(i, 'n')));
+ }
+ }
+}
+
+namespace {
+std::string MakeLongKey(size_t length, char c) {
+ return std::string(length, c);
+}
+} // namespace
+
+TEST_P(PlainTableDBTest, IteratorLargeKeys) {
+ Options options = CurrentOptions();
+
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 0;
+ plain_table_options.bloom_bits_per_key = 0;
+ plain_table_options.hash_table_ratio = 0;
+
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+ options.create_if_missing = true;
+ options.prefix_extractor.reset();
+ DestroyAndReopen(&options);
+
+ std::string key_list[] = {
+ MakeLongKey(30, '0'),
+ MakeLongKey(16, '1'),
+ MakeLongKey(32, '2'),
+ MakeLongKey(60, '3'),
+ MakeLongKey(90, '4'),
+ MakeLongKey(50, '5'),
+ MakeLongKey(26, '6')
+ };
+
+ for (size_t i = 0; i < 7; i++) {
+ ASSERT_OK(Put(key_list[i], ToString(i)));
+ }
+
+ dbfull()->TEST_FlushMemTable();
+
+ Iterator* iter = dbfull()->NewIterator(ReadOptions());
+ iter->Seek(key_list[0]);
+
+ for (size_t i = 0; i < 7; i++) {
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(key_list[i], iter->key().ToString());
+ ASSERT_EQ(ToString(i), iter->value().ToString());
+ iter->Next();
+ }
+
+ ASSERT_TRUE(!iter->Valid());
+
+ delete iter;
+}
+
+namespace {
+std::string MakeLongKeyWithPrefix(size_t length, char c) {
+ return "00000000" + std::string(length - 8, c);
+}
+} // namespace
+
+TEST_P(PlainTableDBTest, IteratorLargeKeysWithPrefix) {
+ Options options = CurrentOptions();
+
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 16;
+ plain_table_options.bloom_bits_per_key = 0;
+ plain_table_options.hash_table_ratio = 0.8;
+ plain_table_options.index_sparseness = 3;
+ plain_table_options.huge_page_tlb_size = 0;
+ plain_table_options.encoding_type = kPrefix;
+
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+ options.create_if_missing = true;
+ DestroyAndReopen(&options);
+
+ std::string key_list[] = {
+ MakeLongKeyWithPrefix(30, '0'), MakeLongKeyWithPrefix(16, '1'),
+ MakeLongKeyWithPrefix(32, '2'), MakeLongKeyWithPrefix(60, '3'),
+ MakeLongKeyWithPrefix(90, '4'), MakeLongKeyWithPrefix(50, '5'),
+ MakeLongKeyWithPrefix(26, '6')};
+
+ for (size_t i = 0; i < 7; i++) {
+ ASSERT_OK(Put(key_list[i], ToString(i)));
+ }
+
+ dbfull()->TEST_FlushMemTable();
+
+ Iterator* iter = dbfull()->NewIterator(ReadOptions());
+ iter->Seek(key_list[0]);
+
+ for (size_t i = 0; i < 7; i++) {
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(key_list[i], iter->key().ToString());
+ ASSERT_EQ(ToString(i), iter->value().ToString());
+ iter->Next();
+ }
+
+ ASSERT_TRUE(!iter->Valid());
+
+ delete iter;
+}
+
+TEST_P(PlainTableDBTest, IteratorReverseSuffixComparator) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+  // Use a reverse-suffix comparator: keys sharing the first 8 bytes iterate
+  // in descending suffix order.
+ test::SimpleSuffixReverseComparator comp;
+ options.comparator = &comp;
+ DestroyAndReopen(&options);
+
+ ASSERT_OK(Put("1000000000foo002", "v_2"));
+ ASSERT_OK(Put("0000000000000bar", "random"));
+ ASSERT_OK(Put("1000000000foo001", "v1"));
+ ASSERT_OK(Put("3000000000000bar", "bar_v"));
+ ASSERT_OK(Put("1000000000foo003", "v__3"));
+ ASSERT_OK(Put("1000000000foo004", "v__4"));
+ ASSERT_OK(Put("1000000000foo005", "v__5"));
+ ASSERT_OK(Put("1000000000foo007", "v__7"));
+ ASSERT_OK(Put("1000000000foo008", "v__8"));
+ dbfull()->TEST_FlushMemTable();
+ ASSERT_EQ("v1", Get("1000000000foo001"));
+ ASSERT_EQ("v__3", Get("1000000000foo003"));
+ Iterator* iter = dbfull()->NewIterator(ReadOptions());
+ iter->Seek("1000000000foo009");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo008", iter->key().ToString());
+ ASSERT_EQ("v__8", iter->value().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo007", iter->key().ToString());
+ ASSERT_EQ("v__7", iter->value().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo005", iter->key().ToString());
+ ASSERT_EQ("v__5", iter->value().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo004", iter->key().ToString());
+ ASSERT_EQ("v__4", iter->value().ToString());
+
+ iter->Seek("3000000000000bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("3000000000000bar", iter->key().ToString());
+ ASSERT_EQ("bar_v", iter->value().ToString());
+
+ iter->Seek("1000000000foo005");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo005", iter->key().ToString());
+ ASSERT_EQ("v__5", iter->value().ToString());
+
+ iter->Seek("1000000000foo006");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo005", iter->key().ToString());
+ ASSERT_EQ("v__5", iter->value().ToString());
+
+ iter->Seek("1000000000foo008");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo008", iter->key().ToString());
+ ASSERT_EQ("v__8", iter->value().ToString());
+
+ iter->Seek("1000000000foo000");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("3000000000000bar", iter->key().ToString());
+
+ delete iter;
+}
+
+TEST_P(PlainTableDBTest, HashBucketConflict) {
+ for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+ huge_page_tlb_size += 2 * 1024 * 1024) {
+ for (unsigned char i = 1; i <= 3; i++) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 16;
+ plain_table_options.bloom_bits_per_key = 0;
+ plain_table_options.hash_table_ratio = 0;
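+      // Note: '^' below is bitwise XOR, so the sparseness values are 3, 0
+      // and 1 for i = 1..3, not powers of two.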
+ plain_table_options.index_sparseness = 2 ^ i;
+ plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+
+ DestroyAndReopen(&options);
+ ASSERT_OK(Put("5000000000000fo0", "v1"));
+ ASSERT_OK(Put("5000000000000fo1", "v2"));
+ ASSERT_OK(Put("5000000000000fo2", "v"));
+ ASSERT_OK(Put("2000000000000fo0", "v3"));
+ ASSERT_OK(Put("2000000000000fo1", "v4"));
+ ASSERT_OK(Put("2000000000000fo2", "v"));
+ ASSERT_OK(Put("2000000000000fo3", "v"));
+
+ dbfull()->TEST_FlushMemTable();
+
+ ASSERT_EQ("v1", Get("5000000000000fo0"));
+ ASSERT_EQ("v2", Get("5000000000000fo1"));
+ ASSERT_EQ("v3", Get("2000000000000fo0"));
+ ASSERT_EQ("v4", Get("2000000000000fo1"));
+
+ ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
+ ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
+ ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
+ ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
+
+ ReadOptions ro;
+ Iterator* iter = dbfull()->NewIterator(ro);
+
+ iter->Seek("5000000000000fo0");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+ iter->Seek("5000000000000fo1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+ iter->Seek("2000000000000fo0");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+ iter->Seek("2000000000000fo1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+ iter->Seek("2000000000000bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+
+ iter->Seek("5000000000000bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+
+ iter->Seek("2000000000000fo8");
+ ASSERT_TRUE(!iter->Valid() ||
+ options.comparator->Compare(iter->key(), "20000001") > 0);
+
+ iter->Seek("5000000000000fo8");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("1000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("3000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("8000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ delete iter;
+ }
+ }
+}
+
+TEST_P(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) {
+ for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+ huge_page_tlb_size += 2 * 1024 * 1024) {
+ for (unsigned char i = 1; i <= 3; i++) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ test::SimpleSuffixReverseComparator comp;
+ options.comparator = &comp;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 16;
+ plain_table_options.bloom_bits_per_key = 0;
+ plain_table_options.hash_table_ratio = 0;
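+      // Note: '^' below is bitwise XOR, so the sparseness values are 3, 0
+      // and 1 for i = 1..3, not powers of two.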
+ plain_table_options.index_sparseness = 2 ^ i;
+ plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+ DestroyAndReopen(&options);
+ ASSERT_OK(Put("5000000000000fo0", "v1"));
+ ASSERT_OK(Put("5000000000000fo1", "v2"));
+ ASSERT_OK(Put("5000000000000fo2", "v"));
+ ASSERT_OK(Put("2000000000000fo0", "v3"));
+ ASSERT_OK(Put("2000000000000fo1", "v4"));
+ ASSERT_OK(Put("2000000000000fo2", "v"));
+ ASSERT_OK(Put("2000000000000fo3", "v"));
+
+ dbfull()->TEST_FlushMemTable();
+
+ ASSERT_EQ("v1", Get("5000000000000fo0"));
+ ASSERT_EQ("v2", Get("5000000000000fo1"));
+ ASSERT_EQ("v3", Get("2000000000000fo0"));
+ ASSERT_EQ("v4", Get("2000000000000fo1"));
+
+ ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
+ ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
+ ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
+ ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
+
+ ReadOptions ro;
+ Iterator* iter = dbfull()->NewIterator(ro);
+
+ iter->Seek("5000000000000fo1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+
+ iter->Seek("5000000000000fo1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+ iter->Seek("2000000000000fo1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+
+ iter->Seek("2000000000000fo1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+ iter->Seek("2000000000000var");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo3", iter->key().ToString());
+
+ iter->Seek("5000000000000var");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo2", iter->key().ToString());
+
+ std::string seek_key = "2000000000000bar";
+ iter->Seek(seek_key);
+ ASSERT_TRUE(!iter->Valid() ||
+ options.prefix_extractor->Transform(iter->key()) !=
+ options.prefix_extractor->Transform(seek_key));
+
+ iter->Seek("1000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("3000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("8000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ delete iter;
+ }
+ }
+}
+
+TEST_P(PlainTableDBTest, NonExistingKeyToNonEmptyBucket) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 16;
+ plain_table_options.bloom_bits_per_key = 0;
+ plain_table_options.hash_table_ratio = 0;
+ plain_table_options.index_sparseness = 5;
+
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+ DestroyAndReopen(&options);
+ ASSERT_OK(Put("5000000000000fo0", "v1"));
+ ASSERT_OK(Put("5000000000000fo1", "v2"));
+ ASSERT_OK(Put("5000000000000fo2", "v3"));
+
+ dbfull()->TEST_FlushMemTable();
+
+ ASSERT_EQ("v1", Get("5000000000000fo0"));
+ ASSERT_EQ("v2", Get("5000000000000fo1"));
+ ASSERT_EQ("v3", Get("5000000000000fo2"));
+
+ ASSERT_EQ("NOT_FOUND", Get("8000000000000bar"));
+ ASSERT_EQ("NOT_FOUND", Get("1000000000000bar"));
+
+ Iterator* iter = dbfull()->NewIterator(ReadOptions());
+
+ iter->Seek("5000000000000bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+
+ iter->Seek("5000000000000fo8");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("1000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("8000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ delete iter;
+}
+
+static std::string Key(int i) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "key_______%06d", i);
+ return std::string(buf);
+}
+
+static std::string RandomString(Random* rnd, int len) {
+ std::string r;
+ test::RandomString(rnd, len, &r);
+ return r;
+}
+
+TEST_P(PlainTableDBTest, CompactionTrigger) {
+ Options options = CurrentOptions();
+  options.write_buffer_size = 120 << 10;  // 120KB
+ options.num_levels = 3;
+ options.level0_file_num_compaction_trigger = 3;
+ Reopen(&options);
+
+ Random rnd(301);
+
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+ num++) {
+ std::vector<std::string> values;
+ // Write 120KB (10 values, each 12K)
+ for (int i = 0; i < 10; i++) {
+ values.push_back(RandomString(&rnd, 12000));
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Put(Key(999), ""));
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
+ }
+
+  // Generate one more file in level-0, which should trigger level-0
+  // compaction.
+ std::vector<std::string> values;
+ for (int i = 0; i < 12; i++) {
+ values.push_back(RandomString(&rnd, 10000));
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Put(Key(999), ""));
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+}
+
+TEST_P(PlainTableDBTest, AdaptiveTable) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+
+ options.table_factory.reset(NewPlainTableFactory());
+ DestroyAndReopen(&options);
+
+ ASSERT_OK(Put("1000000000000foo", "v1"));
+ ASSERT_OK(Put("0000000000000bar", "v2"));
+ ASSERT_OK(Put("1000000000000foo", "v3"));
+ dbfull()->TEST_FlushMemTable();
+
+ options.create_if_missing = false;
+ std::shared_ptr<TableFactory> block_based_factory(
+ NewBlockBasedTableFactory());
+ std::shared_ptr<TableFactory> plain_table_factory(
+ NewPlainTableFactory());
+ std::shared_ptr<TableFactory> dummy_factory;
+ options.table_factory.reset(NewAdaptiveTableFactory(
+ block_based_factory, block_based_factory, plain_table_factory));
+ Reopen(&options);
+ ASSERT_EQ("v3", Get("1000000000000foo"));
+ ASSERT_EQ("v2", Get("0000000000000bar"));
+
+ ASSERT_OK(Put("2000000000000foo", "v4"));
+ ASSERT_OK(Put("3000000000000bar", "v5"));
+ dbfull()->TEST_FlushMemTable();
+ ASSERT_EQ("v4", Get("2000000000000foo"));
+ ASSERT_EQ("v5", Get("3000000000000bar"));
+
+ Reopen(&options);
+ ASSERT_EQ("v3", Get("1000000000000foo"));
+ ASSERT_EQ("v2", Get("0000000000000bar"));
+ ASSERT_EQ("v4", Get("2000000000000foo"));
+ ASSERT_EQ("v5", Get("3000000000000bar"));
+
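+  // A single-format factory cannot read SST files written in the other
+  // format, so these Gets are expected not to return the stored values.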
+ options.paranoid_checks = false;
+ options.table_factory.reset(NewBlockBasedTableFactory());
+ Reopen(&options);
+ ASSERT_NE("v3", Get("1000000000000foo"));
+
+ options.paranoid_checks = false;
+ options.table_factory.reset(NewPlainTableFactory());
+ Reopen(&options);
+ ASSERT_NE("v5", Get("3000000000000bar"));
+}
+
+INSTANTIATE_TEST_CASE_P(PlainTableDBTest, PlainTableDBTest, ::testing::Bool());
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as plain table is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/pre_release_callback.h b/src/rocksdb/db/pre_release_callback.h
new file mode 100644
index 000000000..b74be9537
--- /dev/null
+++ b/src/rocksdb/db/pre_release_callback.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DB;
+
+class PreReleaseCallback {
+ public:
+ virtual ~PreReleaseCallback() {}
+
+ // Will be called while on the write thread after the write to the WAL and
+ // before the write to memtable. This is useful if any operation needs to be
+ // done before the write gets visible to the readers, or if we want to reduce
+ // the overhead of locking by updating something sequentially while we are on
+ // the write thread. If the callback fails, this function returns a non-OK
+  // status, the sequence number will not be released, and the same status
+  // will be propagated to all the writers in the write group.
+ // seq is the sequence number that is used for this write and will be
+ // released.
+ // is_mem_disabled is currently used for debugging purposes to assert that
+ // the callback is done from the right write queue.
+ // If non-zero, log_number indicates the WAL log to which we wrote.
+  // index >= 0 specifies the order of the callback in the same write thread.
+  // total > index specifies the total number of callbacks in the same write
+  // thread. Together with index, it can be used to reduce redundant
+  // operations among the callbacks.
+ virtual Status Callback(SequenceNumber seq, bool is_mem_disabled,
+ uint64_t log_number, size_t index, size_t total) = 0;
+};
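+
+// A minimal illustrative sketch (hypothetical, not part of the API) of a
+// subclass that records the last sequence number it observed; it would also
+// need <atomic>:
+//
+//   class RecordLastSeqCallback : public PreReleaseCallback {
+//    public:
+//     Status Callback(SequenceNumber seq, bool /*is_mem_disabled*/,
+//                     uint64_t /*log_number*/, size_t /*index*/,
+//                     size_t /*total*/) override {
+//       last_seq_.store(seq, std::memory_order_relaxed);
+//       return Status::OK();
+//     }
+//
+//    private:
+//     std::atomic<SequenceNumber> last_seq_{0};
+//   };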
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/prefix_test.cc b/src/rocksdb/db/prefix_test.cc
new file mode 100644
index 000000000..c61ec2a1e
--- /dev/null
+++ b/src/rocksdb/db/prefix_test.cc
@@ -0,0 +1,895 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+ fprintf(stderr, "Please install gflags to run this test... Skipping...\n");
+ return 0;
+}
+#else
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "monitoring/histogram.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "test_util/testharness.h"
+#include "util/coding.h"
+#include "util/gflags_compat.h"
+#include "util/random.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+DEFINE_bool(trigger_deadlock, false,
+ "issue delete in range scan to trigger PrefixHashMap deadlock");
+DEFINE_int32(bucket_count, 100000, "number of buckets");
+DEFINE_uint64(num_locks, 10001, "number of locks");
+DEFINE_bool(random_prefix, false, "randomize prefix");
+DEFINE_uint64(total_prefixes, 100000, "total number of prefixes");
+DEFINE_uint64(items_per_prefix, 1, "total number of values per prefix");
+DEFINE_int64(write_buffer_size, 33554432, "");
+DEFINE_int32(max_write_buffer_number, 2, "");
+DEFINE_int32(min_write_buffer_number_to_merge, 1, "");
+DEFINE_int32(skiplist_height, 4, "");
+DEFINE_double(memtable_prefix_bloom_size_ratio, 0.1, "");
+DEFINE_int32(memtable_huge_page_size, 2 * 1024 * 1024, "");
+DEFINE_int32(value_size, 40, "");
+DEFINE_bool(enable_print, false, "Print options generated to console.");
+
+// Path to the database on file system
+const std::string kDbName =
+ ROCKSDB_NAMESPACE::test::PerThreadDBPath("prefix_test");
+
+namespace ROCKSDB_NAMESPACE {
+
+struct TestKey {
+ uint64_t prefix;
+ uint64_t sorted;
+
+ TestKey(uint64_t _prefix, uint64_t _sorted)
+ : prefix(_prefix), sorted(_sorted) {}
+};
+
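+// Keys are encoded as 16 bytes: a fixed64 'prefix' followed by a fixed64
+// 'sorted' suffix, so the fixed 8-byte prefix transform used by these tests
+// extracts exactly the 'prefix' field.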
+// return a slice backed by test_key
+inline Slice TestKeyToSlice(std::string &s, const TestKey& test_key) {
+ s.clear();
+ PutFixed64(&s, test_key.prefix);
+ PutFixed64(&s, test_key.sorted);
+ return Slice(s.c_str(), s.size());
+}
+
+inline const TestKey SliceToTestKey(const Slice& slice) {
+ return TestKey(DecodeFixed64(slice.data()),
+ DecodeFixed64(slice.data() + 8));
+}
+
+class TestKeyComparator : public Comparator {
+ public:
+
+  // Compare needs to be aware of the possibility that a and/or b may be a
+  // prefix only
+ int Compare(const Slice& a, const Slice& b) const override {
+ const TestKey kkey_a = SliceToTestKey(a);
+ const TestKey kkey_b = SliceToTestKey(b);
+ const TestKey *key_a = &kkey_a;
+ const TestKey *key_b = &kkey_b;
+ if (key_a->prefix != key_b->prefix) {
+ if (key_a->prefix < key_b->prefix) return -1;
+ if (key_a->prefix > key_b->prefix) return 1;
+ } else {
+ EXPECT_TRUE(key_a->prefix == key_b->prefix);
+ // note, both a and b could be prefix only
+ if (a.size() != b.size()) {
+ // one of them is prefix
+ EXPECT_TRUE(
+ (a.size() == sizeof(uint64_t) && b.size() == sizeof(TestKey)) ||
+ (b.size() == sizeof(uint64_t) && a.size() == sizeof(TestKey)));
+ if (a.size() < b.size()) return -1;
+ if (a.size() > b.size()) return 1;
+ } else {
+        if (a.size() == sizeof(uint64_t)) {
+          // both a and b are prefix-only keys
+          return 0;
+        }
+
+ // both a and b are whole key
+ EXPECT_TRUE(a.size() == sizeof(TestKey) && b.size() == sizeof(TestKey));
+ if (key_a->sorted < key_b->sorted) return -1;
+ if (key_a->sorted > key_b->sorted) return 1;
+ if (key_a->sorted == key_b->sorted) return 0;
+ }
+ }
+ return 0;
+ }
+
+ bool operator()(const TestKey& a, const TestKey& b) const {
+ std::string sa, sb;
+ return Compare(TestKeyToSlice(sa, a), TestKeyToSlice(sb, b)) < 0;
+ }
+
+ const char* Name() const override { return "TestKeyComparator"; }
+
+ void FindShortestSeparator(std::string* /*start*/,
+ const Slice& /*limit*/) const override {}
+
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+namespace {
+void PutKey(DB* db, WriteOptions write_options, uint64_t prefix,
+ uint64_t suffix, const Slice& value) {
+ TestKey test_key(prefix, suffix);
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+ ASSERT_OK(db->Put(write_options, key, value));
+}
+
+void PutKey(DB* db, WriteOptions write_options, const TestKey& test_key,
+ const Slice& value) {
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+ ASSERT_OK(db->Put(write_options, key, value));
+}
+
+void MergeKey(DB* db, WriteOptions write_options, const TestKey& test_key,
+ const Slice& value) {
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+ ASSERT_OK(db->Merge(write_options, key, value));
+}
+
+void DeleteKey(DB* db, WriteOptions write_options, const TestKey& test_key) {
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+ ASSERT_OK(db->Delete(write_options, key));
+}
+
+void SeekIterator(Iterator* iter, uint64_t prefix, uint64_t suffix) {
+ TestKey test_key(prefix, suffix);
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+ iter->Seek(key);
+}
+
+const std::string kNotFoundResult = "NOT_FOUND";
+
+std::string Get(DB* db, const ReadOptions& read_options, uint64_t prefix,
+ uint64_t suffix) {
+ TestKey test_key(prefix, suffix);
+ std::string s2;
+ Slice key = TestKeyToSlice(s2, test_key);
+
+ std::string result;
+ Status s = db->Get(read_options, key, &result);
+ if (s.IsNotFound()) {
+ result = kNotFoundResult;
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+}
+
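+// Maps every key that starts with the given prefix to that prefix itself;
+// keys that do not start with it are out of the transform's domain.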
+class SamePrefixTransform : public SliceTransform {
+ private:
+ const Slice prefix_;
+ std::string name_;
+
+ public:
+ explicit SamePrefixTransform(const Slice& prefix)
+ : prefix_(prefix), name_("rocksdb.SamePrefix." + prefix.ToString()) {}
+
+ const char* Name() const override { return name_.c_str(); }
+
+ Slice Transform(const Slice& src) const override {
+ assert(InDomain(src));
+ return prefix_;
+ }
+
+ bool InDomain(const Slice& src) const override {
+ if (src.size() >= prefix_.size()) {
+ return Slice(src.data(), prefix_.size()) == prefix_;
+ }
+ return false;
+ }
+
+ bool InRange(const Slice& dst) const override { return dst == prefix_; }
+
+ bool FullLengthEnabled(size_t* /*len*/) const override { return false; }
+};
+
+} // namespace
+
+class PrefixTest : public testing::Test {
+ public:
+ std::shared_ptr<DB> OpenDb() {
+ DB* db;
+
+ options.create_if_missing = true;
+ options.write_buffer_size = FLAGS_write_buffer_size;
+ options.max_write_buffer_number = FLAGS_max_write_buffer_number;
+ options.min_write_buffer_number_to_merge =
+ FLAGS_min_write_buffer_number_to_merge;
+
+ options.memtable_prefix_bloom_size_ratio =
+ FLAGS_memtable_prefix_bloom_size_ratio;
+ options.memtable_huge_page_size = FLAGS_memtable_huge_page_size;
+
+ options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.allow_concurrent_memtable_write = false;
+
+ Status s = DB::Open(options, kDbName, &db);
+ EXPECT_OK(s);
+ return std::shared_ptr<DB>(db);
+ }
+
+ void FirstOption() {
+ option_config_ = kBegin;
+ }
+
+ bool NextOptions(int bucket_count) {
+ // skip some options
+ option_config_++;
+ if (option_config_ < kEnd) {
+ options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+ switch(option_config_) {
+ case kHashSkipList:
+ options.memtable_factory.reset(
+ NewHashSkipListRepFactory(bucket_count, FLAGS_skiplist_height));
+ return true;
+ case kHashLinkList:
+ options.memtable_factory.reset(
+ NewHashLinkListRepFactory(bucket_count));
+ return true;
+ case kHashLinkListHugePageTlb:
+ options.memtable_factory.reset(
+ NewHashLinkListRepFactory(bucket_count, 2 * 1024 * 1024));
+ return true;
+ case kHashLinkListTriggerSkipList:
+ options.memtable_factory.reset(
+ NewHashLinkListRepFactory(bucket_count, 0, 3));
+ return true;
+ default:
+ return false;
+ }
+ }
+ return false;
+ }
+
+ PrefixTest() : option_config_(kBegin) {
+ options.comparator = new TestKeyComparator();
+ }
+ ~PrefixTest() override { delete options.comparator; }
+
+ protected:
+ enum OptionConfig {
+ kBegin,
+ kHashSkipList,
+ kHashLinkList,
+ kHashLinkListHugePageTlb,
+ kHashLinkListTriggerSkipList,
+ kEnd
+ };
+ int option_config_;
+ Options options;
+};
+
+TEST(SamePrefixTest, InDomainTest) {
+ DB* db;
+ Options options;
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(new SamePrefixTransform("HHKB"));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ WriteOptions write_options;
+ ReadOptions read_options;
+ {
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ ASSERT_OK(DB::Open(options, kDbName, &db));
+ ASSERT_OK(db->Put(write_options, "HHKB pro2", "Mar 24, 2006"));
+ ASSERT_OK(db->Put(write_options, "HHKB pro2 Type-S", "June 29, 2011"));
+ ASSERT_OK(db->Put(write_options, "Realforce 87u", "idk"));
+ db->Flush(FlushOptions());
+ std::string result;
+ auto db_iter = db->NewIterator(ReadOptions());
+
+ db_iter->Seek("Realforce 87u");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ ASSERT_EQ(db_iter->key(), "Realforce 87u");
+ ASSERT_EQ(db_iter->value(), "idk");
+
+ delete db_iter;
+ delete db;
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ }
+
+ {
+ ASSERT_OK(DB::Open(options, kDbName, &db));
+ ASSERT_OK(db->Put(write_options, "pikachu", "1"));
+ ASSERT_OK(db->Put(write_options, "Meowth", "1"));
+ ASSERT_OK(db->Put(write_options, "Mewtwo", "idk"));
+ db->Flush(FlushOptions());
+ std::string result;
+ auto db_iter = db->NewIterator(ReadOptions());
+
+ db_iter->Seek("Mewtwo");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ delete db_iter;
+ delete db;
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ }
+}
+
+TEST_F(PrefixTest, TestResult) {
+ for (int num_buckets = 1; num_buckets <= 2; num_buckets++) {
+ FirstOption();
+ while (NextOptions(num_buckets)) {
+ std::cout << "*** Mem table: " << options.memtable_factory->Name()
+ << " number of buckets: " << num_buckets
+ << std::endl;
+ DestroyDB(kDbName, Options());
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+
+ // 1. Insert one row.
+ Slice v16("v16");
+ PutKey(db.get(), write_options, 1, 6, v16);
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ SeekIterator(iter.get(), 1, 6);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v16 == iter->value());
+ SeekIterator(iter.get(), 1, 5);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v16 == iter->value());
+ SeekIterator(iter.get(), 1, 5);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v16 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+
+ SeekIterator(iter.get(), 2, 0);
+ ASSERT_TRUE(!iter->Valid());
+
+ ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6));
+ ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 1, 5));
+ ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 1, 7));
+ ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 0, 6));
+ ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 2, 6));
+
+ // 2. Insert an entry for the same prefix as the last entry in the bucket.
+ Slice v17("v17");
+ PutKey(db.get(), write_options, 1, 7, v17);
+ iter.reset(db->NewIterator(read_options));
+ SeekIterator(iter.get(), 1, 7);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+
+ SeekIterator(iter.get(), 1, 6);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v16 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+
+ SeekIterator(iter.get(), 2, 0);
+ ASSERT_TRUE(!iter->Valid());
+
+ // 3. Insert an entry for the same prefix as the head of the bucket.
+ Slice v15("v15");
+ PutKey(db.get(), write_options, 1, 5, v15);
+ iter.reset(db->NewIterator(read_options));
+
+ SeekIterator(iter.get(), 1, 7);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+
+ SeekIterator(iter.get(), 1, 5);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v15 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v16 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+
+ SeekIterator(iter.get(), 1, 5);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v15 == iter->value());
+
+ ASSERT_EQ(v15.ToString(), Get(db.get(), read_options, 1, 5));
+ ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6));
+ ASSERT_EQ(v17.ToString(), Get(db.get(), read_options, 1, 7));
+
+ // 4. Insert an entry with a larger prefix
+ Slice v22("v22");
+ PutKey(db.get(), write_options, 2, 2, v22);
+ iter.reset(db->NewIterator(read_options));
+
+ SeekIterator(iter.get(), 2, 2);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v22 == iter->value());
+ SeekIterator(iter.get(), 2, 0);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v22 == iter->value());
+
+ SeekIterator(iter.get(), 1, 5);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v15 == iter->value());
+
+ SeekIterator(iter.get(), 1, 7);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+
+ // 5. Insert an entry with a smaller prefix
+ Slice v02("v02");
+ PutKey(db.get(), write_options, 0, 2, v02);
+ iter.reset(db->NewIterator(read_options));
+
+ SeekIterator(iter.get(), 0, 2);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v02 == iter->value());
+ SeekIterator(iter.get(), 0, 0);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v02 == iter->value());
+
+ SeekIterator(iter.get(), 2, 0);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v22 == iter->value());
+
+ SeekIterator(iter.get(), 1, 5);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v15 == iter->value());
+
+ SeekIterator(iter.get(), 1, 7);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+
+ // 6. Insert to the beginning and the end of the first prefix
+ Slice v13("v13");
+ Slice v18("v18");
+ PutKey(db.get(), write_options, 1, 3, v13);
+ PutKey(db.get(), write_options, 1, 8, v18);
+ iter.reset(db->NewIterator(read_options));
+ SeekIterator(iter.get(), 1, 7);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+
+ SeekIterator(iter.get(), 1, 3);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v13 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v15 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v16 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v18 == iter->value());
+
+ SeekIterator(iter.get(), 0, 0);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v02 == iter->value());
+
+ SeekIterator(iter.get(), 2, 0);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v22 == iter->value());
+
+ ASSERT_EQ(v22.ToString(), Get(db.get(), read_options, 2, 2));
+ ASSERT_EQ(v02.ToString(), Get(db.get(), read_options, 0, 2));
+ ASSERT_EQ(v13.ToString(), Get(db.get(), read_options, 1, 3));
+ ASSERT_EQ(v15.ToString(), Get(db.get(), read_options, 1, 5));
+ ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6));
+ ASSERT_EQ(v17.ToString(), Get(db.get(), read_options, 1, 7));
+ ASSERT_EQ(v18.ToString(), Get(db.get(), read_options, 1, 8));
+ }
+ }
+}
+
+// Verify that iteration stays within the seek key's prefix when
+// prefix_same_as_start is set.
+TEST_F(PrefixTest, PrefixValid) {
+ for (int num_buckets = 1; num_buckets <= 2; num_buckets++) {
+ FirstOption();
+ while (NextOptions(num_buckets)) {
+ std::cout << "*** Mem table: " << options.memtable_factory->Name()
+ << " number of buckets: " << num_buckets << std::endl;
+ DestroyDB(kDbName, Options());
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+
+      // Insert keys sharing a common prefix and one key with a different
+      // prefix.
+ Slice v16("v16");
+ Slice v17("v17");
+ Slice v18("v18");
+ Slice v19("v19");
+ PutKey(db.get(), write_options, 12345, 6, v16);
+ PutKey(db.get(), write_options, 12345, 7, v17);
+ PutKey(db.get(), write_options, 12345, 8, v18);
+ PutKey(db.get(), write_options, 12345, 9, v19);
+ PutKey(db.get(), write_options, 12346, 8, v16);
+ db->Flush(FlushOptions());
+ TestKey test_key(12346, 8);
+ std::string s;
+ db->Delete(write_options, TestKeyToSlice(s, test_key));
+ db->Flush(FlushOptions());
+ read_options.prefix_same_as_start = true;
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ SeekIterator(iter.get(), 12345, 6);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v16 == iter->value());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v18 == iter->value());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v19 == iter->value());
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 12346, 8));
+
+ // Verify seeking past the prefix won't return a result.
+ SeekIterator(iter.get(), 12345, 10);
+ ASSERT_TRUE(!iter->Valid());
+ }
+ }
+}
+
+TEST_F(PrefixTest, DynamicPrefixIterator) {
+ while (NextOptions(FLAGS_bucket_count)) {
+ std::cout << "*** Mem table: " << options.memtable_factory->Name()
+ << std::endl;
+ DestroyDB(kDbName, Options());
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+
+ std::vector<uint64_t> prefixes;
+ for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) {
+ prefixes.push_back(i);
+ }
+
+ if (FLAGS_random_prefix) {
+ std::random_shuffle(prefixes.begin(), prefixes.end());
+ }
+
+ HistogramImpl hist_put_time;
+ HistogramImpl hist_put_comparison;
+
+    // Insert FLAGS_total_prefixes prefixes (optionally shuffled), each with
+    // FLAGS_items_per_prefix consecutive elements.
+ for (auto prefix : prefixes) {
+ for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) {
+ TestKey test_key(prefix, sorted);
+
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+ std::string value(FLAGS_value_size, 0);
+
+ get_perf_context()->Reset();
+ StopWatchNano timer(Env::Default(), true);
+ ASSERT_OK(db->Put(write_options, key, value));
+ hist_put_time.Add(timer.ElapsedNanos());
+ hist_put_comparison.Add(get_perf_context()->user_key_comparison_count);
+ }
+ }
+
+ std::cout << "Put key comparison: \n" << hist_put_comparison.ToString()
+ << "Put time: \n" << hist_put_time.ToString();
+
+ // test seek existing keys
+ HistogramImpl hist_seek_time;
+ HistogramImpl hist_seek_comparison;
+
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+ for (auto prefix : prefixes) {
+ TestKey test_key(prefix, FLAGS_items_per_prefix / 2);
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+ std::string value = "v" + ToString(0);
+
+ get_perf_context()->Reset();
+ StopWatchNano timer(Env::Default(), true);
+ auto key_prefix = options.prefix_extractor->Transform(key);
+ uint64_t total_keys = 0;
+ for (iter->Seek(key);
+ iter->Valid() && iter->key().starts_with(key_prefix);
+ iter->Next()) {
+ if (FLAGS_trigger_deadlock) {
+ std::cout << "Behold the deadlock!\n";
+ db->Delete(write_options, iter->key());
+ }
+ total_keys++;
+ }
+ hist_seek_time.Add(timer.ElapsedNanos());
+ hist_seek_comparison.Add(get_perf_context()->user_key_comparison_count);
+ ASSERT_EQ(total_keys, FLAGS_items_per_prefix - FLAGS_items_per_prefix/2);
+ }
+
+ std::cout << "Seek key comparison: \n"
+ << hist_seek_comparison.ToString()
+ << "Seek time: \n"
+ << hist_seek_time.ToString();
+
+ // test non-existing keys
+ HistogramImpl hist_no_seek_time;
+ HistogramImpl hist_no_seek_comparison;
+
+ for (auto prefix = FLAGS_total_prefixes;
+ prefix < FLAGS_total_prefixes + 10000;
+ prefix++) {
+ TestKey test_key(prefix, 0);
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+
+ get_perf_context()->Reset();
+ StopWatchNano timer(Env::Default(), true);
+ iter->Seek(key);
+ hist_no_seek_time.Add(timer.ElapsedNanos());
+ hist_no_seek_comparison.Add(get_perf_context()->user_key_comparison_count);
+ ASSERT_TRUE(!iter->Valid());
+ }
+
+ std::cout << "non-existing Seek key comparison: \n"
+ << hist_no_seek_comparison.ToString()
+ << "non-existing Seek time: \n"
+ << hist_no_seek_time.ToString();
+ }
+}
+
+TEST_F(PrefixTest, PrefixSeekModePrev) {
+ // Only for SkipListFactory
+ options.memtable_factory.reset(new SkipListFactory);
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ options.write_buffer_size = 1024 * 1024;
+ Random rnd(1);
+ for (size_t m = 1; m < 100; m++) {
+ std::cout << "[" + std::to_string(m) + "]" + "*** Mem table: "
+ << options.memtable_factory->Name() << std::endl;
+ DestroyDB(kDbName, Options());
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+ std::map<TestKey, std::string, TestKeyComparator> entry_maps[3], whole_map;
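+    // entry_maps[0] and entry_maps[1] end up in two flushed SST files while
+    // entry_maps[2] stays in the memtable; whole_map holds the expected
+    // merged view.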
+ for (uint64_t i = 0; i < 10; i++) {
+ int div = i % 3 + 1;
+ for (uint64_t j = 0; j < 10; j++) {
+ whole_map[TestKey(i, j)] = entry_maps[rnd.Uniform(div)][TestKey(i, j)] =
+ 'v' + std::to_string(i) + std::to_string(j);
+ }
+ }
+
+ std::map<TestKey, std::string, TestKeyComparator> type_map;
+ for (size_t i = 0; i < 3; i++) {
+ for (auto& kv : entry_maps[i]) {
+ if (rnd.OneIn(3)) {
+ PutKey(db.get(), write_options, kv.first, kv.second);
+ type_map[kv.first] = "value";
+ } else {
+ MergeKey(db.get(), write_options, kv.first, kv.second);
+ type_map[kv.first] = "merge";
+ }
+ }
+ if (i < 2) {
+ db->Flush(FlushOptions());
+ }
+ }
+
+ for (size_t i = 0; i < 2; i++) {
+ for (auto& kv : entry_maps[i]) {
+ if (rnd.OneIn(10)) {
+ whole_map.erase(kv.first);
+ DeleteKey(db.get(), write_options, kv.first);
+ entry_maps[2][kv.first] = "delete";
+ }
+ }
+ }
+
+ if (FLAGS_enable_print) {
+ for (size_t i = 0; i < 3; i++) {
+ for (auto& kv : entry_maps[i]) {
+ std::cout << "[" << i << "]" << kv.first.prefix << kv.first.sorted
+ << " " << kv.second + " " + type_map[kv.first] << std::endl;
+ }
+ }
+ }
+
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ for (uint64_t prefix = 0; prefix < 10; prefix++) {
+ uint64_t start_suffix = rnd.Uniform(9);
+ SeekIterator(iter.get(), prefix, start_suffix);
+ auto it = whole_map.find(TestKey(prefix, start_suffix));
+ if (it == whole_map.end()) {
+ continue;
+ }
+ ASSERT_NE(it, whole_map.end());
+ ASSERT_TRUE(iter->Valid());
+ if (FLAGS_enable_print) {
+ std::cout << "round " << prefix
+ << " iter: " << SliceToTestKey(iter->key()).prefix
+ << SliceToTestKey(iter->key()).sorted
+ << " | map: " << it->first.prefix << it->first.sorted << " | "
+ << iter->value().ToString() << " " << it->second << std::endl;
+ }
+ ASSERT_EQ(iter->value(), it->second);
+ uint64_t stored_prefix = prefix;
+ for (size_t k = 0; k < 9; k++) {
+ if (rnd.OneIn(2) || it == whole_map.begin()) {
+ iter->Next();
+ ++it;
+ if (FLAGS_enable_print) {
+ std::cout << "Next >> ";
+ }
+ } else {
+ iter->Prev();
+ it--;
+ if (FLAGS_enable_print) {
+ std::cout << "Prev >> ";
+ }
+ }
+ if (!iter->Valid() ||
+ SliceToTestKey(iter->key()).prefix != stored_prefix) {
+ break;
+ }
+ stored_prefix = SliceToTestKey(iter->key()).prefix;
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_NE(it, whole_map.end());
+ ASSERT_EQ(iter->value(), it->second);
+ if (FLAGS_enable_print) {
+ std::cout << "iter: " << SliceToTestKey(iter->key()).prefix
+ << SliceToTestKey(iter->key()).sorted
+ << " | map: " << it->first.prefix << it->first.sorted
+ << " | " << iter->value().ToString() << " " << it->second
+ << std::endl;
+ }
+ }
+ }
+ }
+}
+
+TEST_F(PrefixTest, PrefixSeekModePrev2) {
+ // Only for SkipListFactory
+  // Test the following case:
+  //        iter1                iter2
+  // | prefix | suffix |  | prefix | suffix |
+  // |   1    |   1    |  |   1    |   2    |
+  // |   1    |   3    |  |   1    |   4    |
+  // |   2    |   1    |  |   3    |   3    |
+  // |   2    |   2    |  |   3    |   4    |
+  // After Seek(15), iter1 will be at 21 and iter2 will be at 33.
+  // Then, if Prev() is called in prefix mode, SeekForPrev(21) gets called and
+  // iter2 should become invalid because of the bloom filter.
+ options.memtable_factory.reset(new SkipListFactory);
+ options.write_buffer_size = 1024 * 1024;
+ std::string v13("v13");
+ DestroyDB(kDbName, Options());
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+ PutKey(db.get(), write_options, TestKey(1, 2), "v12");
+ PutKey(db.get(), write_options, TestKey(1, 4), "v14");
+ PutKey(db.get(), write_options, TestKey(3, 3), "v33");
+ PutKey(db.get(), write_options, TestKey(3, 4), "v34");
+ db->Flush(FlushOptions());
+ reinterpret_cast<DBImpl*>(db.get())->TEST_WaitForFlushMemTable();
+ PutKey(db.get(), write_options, TestKey(1, 1), "v11");
+ PutKey(db.get(), write_options, TestKey(1, 3), "v13");
+ PutKey(db.get(), write_options, TestKey(2, 1), "v21");
+ PutKey(db.get(), write_options, TestKey(2, 2), "v22");
+ db->Flush(FlushOptions());
+ reinterpret_cast<DBImpl*>(db.get())->TEST_WaitForFlushMemTable();
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ SeekIterator(iter.get(), 1, 5);
+ iter->Prev();
+ ASSERT_EQ(iter->value(), v13);
+}
+
+TEST_F(PrefixTest, PrefixSeekModePrev3) {
+ // Only for SkipListFactory
+ // test SeekToLast() with iterate_upper_bound_ in prefix_seek_mode
+ options.memtable_factory.reset(new SkipListFactory);
+ options.write_buffer_size = 1024 * 1024;
+ std::string v14("v14");
+ TestKey upper_bound_key = TestKey(1, 5);
+ std::string s;
+ Slice upper_bound = TestKeyToSlice(s, upper_bound_key);
+
+ {
+ DestroyDB(kDbName, Options());
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+ read_options.iterate_upper_bound = &upper_bound;
+ PutKey(db.get(), write_options, TestKey(1, 2), "v12");
+ PutKey(db.get(), write_options, TestKey(1, 4), "v14");
+ db->Flush(FlushOptions());
+ reinterpret_cast<DBImpl*>(db.get())->TEST_WaitForFlushMemTable();
+ PutKey(db.get(), write_options, TestKey(1, 1), "v11");
+ PutKey(db.get(), write_options, TestKey(1, 3), "v13");
+ PutKey(db.get(), write_options, TestKey(2, 1), "v21");
+ PutKey(db.get(), write_options, TestKey(2, 2), "v22");
+ db->Flush(FlushOptions());
+ reinterpret_cast<DBImpl*>(db.get())->TEST_WaitForFlushMemTable();
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ iter->SeekToLast();
+ ASSERT_EQ(iter->value(), v14);
+ }
+ {
+ DestroyDB(kDbName, Options());
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+ read_options.iterate_upper_bound = &upper_bound;
+ PutKey(db.get(), write_options, TestKey(1, 2), "v12");
+ PutKey(db.get(), write_options, TestKey(1, 4), "v14");
+ PutKey(db.get(), write_options, TestKey(3, 3), "v33");
+ PutKey(db.get(), write_options, TestKey(3, 4), "v34");
+ db->Flush(FlushOptions());
+ reinterpret_cast<DBImpl*>(db.get())->TEST_WaitForFlushMemTable();
+ PutKey(db.get(), write_options, TestKey(1, 1), "v11");
+ PutKey(db.get(), write_options, TestKey(1, 3), "v13");
+ db->Flush(FlushOptions());
+ reinterpret_cast<DBImpl*>(db.get())->TEST_WaitForFlushMemTable();
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ iter->SeekToLast();
+ ASSERT_EQ(iter->value(), v14);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ ParseCommandLineFlags(&argc, &argv, true);
+ return RUN_ALL_TESTS();
+}
+
+#endif // GFLAGS
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as HashSkipList and HashLinkList are not supported in "
+ "ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/range_del_aggregator.cc b/src/rocksdb/db/range_del_aggregator.cc
new file mode 100644
index 000000000..1f6a7b139
--- /dev/null
+++ b/src/rocksdb/db/range_del_aggregator.cc
@@ -0,0 +1,484 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/range_del_aggregator.h"
+
+#include "db/compaction/compaction_iteration_stats.h"
+#include "db/dbformat.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_del_aggregator.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/version_edit.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/types.h"
+#include "table/internal_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/table_builder.h"
+#include "util/heap.h"
+#include "util/kv_map.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+TruncatedRangeDelIterator::TruncatedRangeDelIterator(
+ std::unique_ptr<FragmentedRangeTombstoneIterator> iter,
+ const InternalKeyComparator* icmp, const InternalKey* smallest,
+ const InternalKey* largest)
+ : iter_(std::move(iter)),
+ icmp_(icmp),
+ smallest_ikey_(smallest),
+ largest_ikey_(largest) {
+ if (smallest != nullptr) {
+ pinned_bounds_.emplace_back();
+ auto& parsed_smallest = pinned_bounds_.back();
+ if (!ParseInternalKey(smallest->Encode(), &parsed_smallest)) {
+ assert(false);
+ }
+ smallest_ = &parsed_smallest;
+ }
+ if (largest != nullptr) {
+ pinned_bounds_.emplace_back();
+ auto& parsed_largest = pinned_bounds_.back();
+ if (!ParseInternalKey(largest->Encode(), &parsed_largest)) {
+ assert(false);
+ }
+ if (parsed_largest.type == kTypeRangeDeletion &&
+ parsed_largest.sequence == kMaxSequenceNumber) {
+ // The file boundary has been artificially extended by a range tombstone.
+ // We do not need to adjust largest to properly truncate range
+ // tombstones that extend past the boundary.
+ } else if (parsed_largest.sequence == 0) {
+ // The largest key in the sstable has a sequence number of 0. Since we
+ // guarantee that no internal keys with the same user key and sequence
+ // number can exist in a DB, we know that the largest key in this sstable
+ // cannot exist as the smallest key in the next sstable. This further
+ // implies that no range tombstone in this sstable covers largest;
+ // otherwise, the file boundary would have been artificially extended.
+ //
+ // Therefore, we will never truncate a range tombstone at largest, so we
+ // can leave it unchanged.
+ } else {
+      // The same user key may span the boundary between two adjacent
+      // sstables. To ensure that the truncated end key still covers the
+      // largest key in this sstable, reduce its sequence number by 1.
+ parsed_largest.sequence -= 1;
+ }
+ largest_ = &parsed_largest;
+ }
+}
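+
+// Illustrative sketch (hypothetical values): for a file whose largest key is
+// ("k", seq=7, kTypeValue), the constructor above rewrites the parsed upper
+// bound to ("k", seq=6). Internal keys with the same user key sort by
+// descending sequence number, so an end key truncated to ("k", 6) still
+// covers ("k", 7), which is the guarantee described in the comment above.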
+
+bool TruncatedRangeDelIterator::Valid() const {
+ return iter_->Valid() &&
+ (smallest_ == nullptr ||
+ icmp_->Compare(*smallest_, iter_->parsed_end_key()) < 0) &&
+ (largest_ == nullptr ||
+ icmp_->Compare(iter_->parsed_start_key(), *largest_) < 0);
+}
+
+void TruncatedRangeDelIterator::Next() { iter_->TopNext(); }
+
+void TruncatedRangeDelIterator::Prev() { iter_->TopPrev(); }
+
+void TruncatedRangeDelIterator::InternalNext() { iter_->Next(); }
+
+// NOTE: target is a user key
+void TruncatedRangeDelIterator::Seek(const Slice& target) {
+ if (largest_ != nullptr &&
+ icmp_->Compare(*largest_, ParsedInternalKey(target, kMaxSequenceNumber,
+ kTypeRangeDeletion)) <= 0) {
+ iter_->Invalidate();
+ return;
+ }
+ if (smallest_ != nullptr &&
+ icmp_->user_comparator()->Compare(target, smallest_->user_key) < 0) {
+ iter_->Seek(smallest_->user_key);
+ return;
+ }
+ iter_->Seek(target);
+}
+
+// NOTE: target is a user key
+void TruncatedRangeDelIterator::SeekForPrev(const Slice& target) {
+ if (smallest_ != nullptr &&
+ icmp_->Compare(ParsedInternalKey(target, 0, kTypeRangeDeletion),
+ *smallest_) < 0) {
+ iter_->Invalidate();
+ return;
+ }
+ if (largest_ != nullptr &&
+ icmp_->user_comparator()->Compare(largest_->user_key, target) < 0) {
+ iter_->SeekForPrev(largest_->user_key);
+ return;
+ }
+ iter_->SeekForPrev(target);
+}
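+
+// Illustrative sketch (hypothetical bounds): with smallest_ at user key "c"
+// and largest_ at user key "p",
+//   Seek("a")        forwards to iter_->Seek("c") because "a" < smallest_;
+//   Seek("q")        invalidates the iterator because largest_ lies at or
+//                    before the internal key built from "q";
+//   SeekForPrev("q") forwards to iter_->SeekForPrev("p");
+//   SeekForPrev("a") invalidates the iterator because "a" < smallest_.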
+
+void TruncatedRangeDelIterator::SeekToFirst() {
+ if (smallest_ != nullptr) {
+ iter_->Seek(smallest_->user_key);
+ return;
+ }
+ iter_->SeekToTopFirst();
+}
+
+void TruncatedRangeDelIterator::SeekToLast() {
+ if (largest_ != nullptr) {
+ iter_->SeekForPrev(largest_->user_key);
+ return;
+ }
+ iter_->SeekToTopLast();
+}
+
+std::map<SequenceNumber, std::unique_ptr<TruncatedRangeDelIterator>>
+TruncatedRangeDelIterator::SplitBySnapshot(
+ const std::vector<SequenceNumber>& snapshots) {
+ using FragmentedIterPair =
+ std::pair<const SequenceNumber,
+ std::unique_ptr<FragmentedRangeTombstoneIterator>>;
+
+ auto split_untruncated_iters = iter_->SplitBySnapshot(snapshots);
+ std::map<SequenceNumber, std::unique_ptr<TruncatedRangeDelIterator>>
+ split_truncated_iters;
+ std::for_each(
+ split_untruncated_iters.begin(), split_untruncated_iters.end(),
+ [&](FragmentedIterPair& iter_pair) {
+ std::unique_ptr<TruncatedRangeDelIterator> truncated_iter(
+ new TruncatedRangeDelIterator(std::move(iter_pair.second), icmp_,
+ smallest_ikey_, largest_ikey_));
+ split_truncated_iters.emplace(iter_pair.first,
+ std::move(truncated_iter));
+ });
+ return split_truncated_iters;
+}
+
+ForwardRangeDelIterator::ForwardRangeDelIterator(
+ const InternalKeyComparator* icmp)
+ : icmp_(icmp),
+ unused_idx_(0),
+ active_seqnums_(SeqMaxComparator()),
+ active_iters_(EndKeyMinComparator(icmp)),
+ inactive_iters_(StartKeyMinComparator(icmp)) {}
+
+bool ForwardRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) {
+ // Move active iterators that end before parsed.
+ while (!active_iters_.empty() &&
+ icmp_->Compare((*active_iters_.top())->end_key(), parsed) <= 0) {
+ TruncatedRangeDelIterator* iter = PopActiveIter();
+ do {
+ iter->Next();
+ } while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0);
+ PushIter(iter, parsed);
+ assert(active_iters_.size() == active_seqnums_.size());
+ }
+
+ // Move inactive iterators that start before parsed.
+ while (!inactive_iters_.empty() &&
+ icmp_->Compare(inactive_iters_.top()->start_key(), parsed) <= 0) {
+ TruncatedRangeDelIterator* iter = PopInactiveIter();
+ while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0) {
+ iter->Next();
+ }
+ PushIter(iter, parsed);
+ assert(active_iters_.size() == active_seqnums_.size());
+ }
+
+ return active_seqnums_.empty()
+ ? false
+ : (*active_seqnums_.begin())->seq() > parsed.sequence;
+}
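+
+// Illustrative sketch (hypothetical tombstone): with a single active
+// tombstone ["c", "g") at sequence number 10, ShouldDelete for
+// ("d", seq=5, kTypeValue) returns true because the largest active tombstone
+// sequence number (10) exceeds the key's sequence number (5), while
+// ("d", seq=12, kTypeValue) is not deleted.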
+
+void ForwardRangeDelIterator::Invalidate() {
+ unused_idx_ = 0;
+ active_iters_.clear();
+ active_seqnums_.clear();
+ inactive_iters_.clear();
+}
+
+ReverseRangeDelIterator::ReverseRangeDelIterator(
+ const InternalKeyComparator* icmp)
+ : icmp_(icmp),
+ unused_idx_(0),
+ active_seqnums_(SeqMaxComparator()),
+ active_iters_(StartKeyMaxComparator(icmp)),
+ inactive_iters_(EndKeyMaxComparator(icmp)) {}
+
+bool ReverseRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) {
+ // Move active iterators that start after parsed.
+ while (!active_iters_.empty() &&
+ icmp_->Compare(parsed, (*active_iters_.top())->start_key()) < 0) {
+ TruncatedRangeDelIterator* iter = PopActiveIter();
+ do {
+ iter->Prev();
+ } while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0);
+ PushIter(iter, parsed);
+ assert(active_iters_.size() == active_seqnums_.size());
+ }
+
+ // Move inactive iterators that end after parsed.
+ while (!inactive_iters_.empty() &&
+ icmp_->Compare(parsed, inactive_iters_.top()->end_key()) < 0) {
+ TruncatedRangeDelIterator* iter = PopInactiveIter();
+ while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0) {
+ iter->Prev();
+ }
+ PushIter(iter, parsed);
+ assert(active_iters_.size() == active_seqnums_.size());
+ }
+
+ return active_seqnums_.empty()
+ ? false
+ : (*active_seqnums_.begin())->seq() > parsed.sequence;
+}
+
+void ReverseRangeDelIterator::Invalidate() {
+ unused_idx_ = 0;
+ active_iters_.clear();
+ active_seqnums_.clear();
+ inactive_iters_.clear();
+}
+
+bool RangeDelAggregator::StripeRep::ShouldDelete(
+ const ParsedInternalKey& parsed, RangeDelPositioningMode mode) {
+ if (!InStripe(parsed.sequence) || IsEmpty()) {
+ return false;
+ }
+ switch (mode) {
+ case RangeDelPositioningMode::kForwardTraversal:
+ InvalidateReverseIter();
+
+ // Pick up previously unseen iterators.
+ for (auto it = std::next(iters_.begin(), forward_iter_.UnusedIdx());
+ it != iters_.end(); ++it, forward_iter_.IncUnusedIdx()) {
+ auto& iter = *it;
+ forward_iter_.AddNewIter(iter.get(), parsed);
+ }
+
+ return forward_iter_.ShouldDelete(parsed);
+ case RangeDelPositioningMode::kBackwardTraversal:
+ InvalidateForwardIter();
+
+ // Pick up previously unseen iterators.
+ for (auto it = std::next(iters_.begin(), reverse_iter_.UnusedIdx());
+ it != iters_.end(); ++it, reverse_iter_.IncUnusedIdx()) {
+ auto& iter = *it;
+ reverse_iter_.AddNewIter(iter.get(), parsed);
+ }
+
+ return reverse_iter_.ShouldDelete(parsed);
+ default:
+ assert(false);
+ return false;
+ }
+}
+
+bool RangeDelAggregator::StripeRep::IsRangeOverlapped(const Slice& start,
+ const Slice& end) {
+ Invalidate();
+
+ // Set the internal start/end keys so that:
+ // - if start_ikey has the same user key and sequence number as the
+ // current end key, start_ikey will be considered greater; and
+ // - if end_ikey has the same user key and sequence number as the current
+ // start key, end_ikey will be considered greater.
+ ParsedInternalKey start_ikey(start, kMaxSequenceNumber,
+ static_cast<ValueType>(0));
+ ParsedInternalKey end_ikey(end, 0, static_cast<ValueType>(0));
+ for (auto& iter : iters_) {
+ bool checked_candidate_tombstones = false;
+ for (iter->SeekForPrev(start);
+ iter->Valid() && icmp_->Compare(iter->start_key(), end_ikey) <= 0;
+ iter->Next()) {
+ checked_candidate_tombstones = true;
+ if (icmp_->Compare(start_ikey, iter->end_key()) < 0 &&
+ icmp_->Compare(iter->start_key(), end_ikey) <= 0) {
+ return true;
+ }
+ }
+
+ if (!checked_candidate_tombstones) {
+      // Do an additional check for when the end of the range is the start
+      // key of a tombstone, which we would have missed above because
+      // SeekForPrev'ing to the range's start left the iterator invalid.
+ iter->SeekForPrev(end);
+ if (iter->Valid() && icmp_->Compare(start_ikey, iter->end_key()) < 0 &&
+ icmp_->Compare(iter->start_key(), end_ikey) <= 0) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
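+
+// Illustrative sketch (hypothetical tombstone): with a single tombstone
+// ["c", "e"), IsRangeOverlapped("a", "b") is false, IsRangeOverlapped("b",
+// "c") is true (the query range ends at the tombstone's start key, which the
+// fallback SeekForPrev(end) check above catches), and
+// IsRangeOverlapped("d", "f") is true.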
+
+void ReadRangeDelAggregator::AddTombstones(
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+ const InternalKey* smallest, const InternalKey* largest) {
+ if (input_iter == nullptr || input_iter->empty()) {
+ return;
+ }
+ rep_.AddTombstones(
+ std::unique_ptr<TruncatedRangeDelIterator>(new TruncatedRangeDelIterator(
+ std::move(input_iter), icmp_, smallest, largest)));
+}
+
+bool ReadRangeDelAggregator::ShouldDeleteImpl(const ParsedInternalKey& parsed,
+ RangeDelPositioningMode mode) {
+ return rep_.ShouldDelete(parsed, mode);
+}
+
+bool ReadRangeDelAggregator::IsRangeOverlapped(const Slice& start,
+ const Slice& end) {
+ InvalidateRangeDelMapPositions();
+ return rep_.IsRangeOverlapped(start, end);
+}
+
+void CompactionRangeDelAggregator::AddTombstones(
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+ const InternalKey* smallest, const InternalKey* largest) {
+ if (input_iter == nullptr || input_iter->empty()) {
+ return;
+ }
+ assert(input_iter->lower_bound() == 0);
+ assert(input_iter->upper_bound() == kMaxSequenceNumber);
+ parent_iters_.emplace_back(new TruncatedRangeDelIterator(
+ std::move(input_iter), icmp_, smallest, largest));
+
+ auto split_iters = parent_iters_.back()->SplitBySnapshot(*snapshots_);
+ for (auto& split_iter : split_iters) {
+ auto it = reps_.find(split_iter.first);
+ if (it == reps_.end()) {
+ bool inserted;
+ SequenceNumber upper_bound = split_iter.second->upper_bound();
+ SequenceNumber lower_bound = split_iter.second->lower_bound();
+ std::tie(it, inserted) = reps_.emplace(
+ split_iter.first, StripeRep(icmp_, upper_bound, lower_bound));
+ assert(inserted);
+ }
+ assert(it != reps_.end());
+ it->second.AddTombstones(std::move(split_iter.second));
+ }
+}
+
+bool CompactionRangeDelAggregator::ShouldDelete(const ParsedInternalKey& parsed,
+ RangeDelPositioningMode mode) {
+ auto it = reps_.lower_bound(parsed.sequence);
+ if (it == reps_.end()) {
+ return false;
+ }
+ return it->second.ShouldDelete(parsed, mode);
+}
+
+namespace {
+
+class TruncatedRangeDelMergingIter : public InternalIterator {
+ public:
+ TruncatedRangeDelMergingIter(
+ const InternalKeyComparator* icmp, const Slice* lower_bound,
+ const Slice* upper_bound, bool upper_bound_inclusive,
+ const std::vector<std::unique_ptr<TruncatedRangeDelIterator>>& children)
+ : icmp_(icmp),
+ lower_bound_(lower_bound),
+ upper_bound_(upper_bound),
+ upper_bound_inclusive_(upper_bound_inclusive),
+ heap_(StartKeyMinComparator(icmp)) {
+ for (auto& child : children) {
+ if (child != nullptr) {
+ assert(child->lower_bound() == 0);
+ assert(child->upper_bound() == kMaxSequenceNumber);
+ children_.push_back(child.get());
+ }
+ }
+ }
+
+ bool Valid() const override {
+ return !heap_.empty() && BeforeEndKey(heap_.top());
+ }
+ Status status() const override { return Status::OK(); }
+
+ void SeekToFirst() override {
+ heap_.clear();
+ for (auto& child : children_) {
+ if (lower_bound_ != nullptr) {
+ child->Seek(*lower_bound_);
+ } else {
+ child->SeekToFirst();
+ }
+ if (child->Valid()) {
+ heap_.push(child);
+ }
+ }
+ }
+
+ void Next() override {
+ auto* top = heap_.top();
+ top->InternalNext();
+ if (top->Valid()) {
+ heap_.replace_top(top);
+ } else {
+ heap_.pop();
+ }
+ }
+
+ Slice key() const override {
+ auto* top = heap_.top();
+ cur_start_key_.Set(top->start_key().user_key, top->seq(),
+ kTypeRangeDeletion);
+ return cur_start_key_.Encode();
+ }
+
+ Slice value() const override {
+ auto* top = heap_.top();
+ assert(top->end_key().sequence == kMaxSequenceNumber);
+ return top->end_key().user_key;
+ }
+
+ // Unused InternalIterator methods
+ void Prev() override { assert(false); }
+ void Seek(const Slice& /* target */) override { assert(false); }
+ void SeekForPrev(const Slice& /* target */) override { assert(false); }
+ void SeekToLast() override { assert(false); }
+
+ private:
+ bool BeforeEndKey(const TruncatedRangeDelIterator* iter) const {
+ if (upper_bound_ == nullptr) {
+ return true;
+ }
+ int cmp = icmp_->user_comparator()->Compare(iter->start_key().user_key,
+ *upper_bound_);
+ return upper_bound_inclusive_ ? cmp <= 0 : cmp < 0;
+ }
+
+ const InternalKeyComparator* icmp_;
+ const Slice* lower_bound_;
+ const Slice* upper_bound_;
+ bool upper_bound_inclusive_;
+ BinaryHeap<TruncatedRangeDelIterator*, StartKeyMinComparator> heap_;
+ std::vector<TruncatedRangeDelIterator*> children_;
+
+ mutable InternalKey cur_start_key_;
+};
+
+} // namespace
+
+std::unique_ptr<FragmentedRangeTombstoneIterator>
+CompactionRangeDelAggregator::NewIterator(const Slice* lower_bound,
+ const Slice* upper_bound,
+ bool upper_bound_inclusive) {
+ InvalidateRangeDelMapPositions();
+ std::unique_ptr<TruncatedRangeDelMergingIter> merging_iter(
+ new TruncatedRangeDelMergingIter(icmp_, lower_bound, upper_bound,
+ upper_bound_inclusive, parent_iters_));
+
+ auto fragmented_tombstone_list =
+ std::make_shared<FragmentedRangeTombstoneList>(
+ std::move(merging_iter), *icmp_, true /* for_compaction */,
+ *snapshots_);
+
+ return std::unique_ptr<FragmentedRangeTombstoneIterator>(
+ new FragmentedRangeTombstoneIterator(
+ fragmented_tombstone_list, *icmp_,
+ kMaxSequenceNumber /* upper_bound */));
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/range_del_aggregator.h b/src/rocksdb/db/range_del_aggregator.h
new file mode 100644
index 000000000..b47cf31d3
--- /dev/null
+++ b/src/rocksdb/db/range_del_aggregator.h
@@ -0,0 +1,441 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <algorithm>
+#include <iterator>
+#include <list>
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/compaction/compaction_iteration_stats.h"
+#include "db/dbformat.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_del_aggregator.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/version_edit.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/types.h"
+#include "table/internal_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/table_builder.h"
+#include "util/heap.h"
+#include "util/kv_map.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TruncatedRangeDelIterator {
+ public:
+ TruncatedRangeDelIterator(
+ std::unique_ptr<FragmentedRangeTombstoneIterator> iter,
+ const InternalKeyComparator* icmp, const InternalKey* smallest,
+ const InternalKey* largest);
+
+ bool Valid() const;
+
+ void Next();
+ void Prev();
+
+ void InternalNext();
+
+  // Seeks to the tombstone with the highest visible sequence number that covers
+ // target (a user key). If no such tombstone exists, the position will be at
+ // the earliest tombstone that ends after target.
+ void Seek(const Slice& target);
+
+  // Seeks to the tombstone with the highest visible sequence number that covers
+ // target (a user key). If no such tombstone exists, the position will be at
+ // the latest tombstone that starts before target.
+ void SeekForPrev(const Slice& target);
+
+ void SeekToFirst();
+ void SeekToLast();
+
+ ParsedInternalKey start_key() const {
+ return (smallest_ == nullptr ||
+ icmp_->Compare(*smallest_, iter_->parsed_start_key()) <= 0)
+ ? iter_->parsed_start_key()
+ : *smallest_;
+ }
+
+ ParsedInternalKey end_key() const {
+ return (largest_ == nullptr ||
+ icmp_->Compare(iter_->parsed_end_key(), *largest_) <= 0)
+ ? iter_->parsed_end_key()
+ : *largest_;
+ }
+
+ SequenceNumber seq() const { return iter_->seq(); }
+
+ std::map<SequenceNumber, std::unique_ptr<TruncatedRangeDelIterator>>
+ SplitBySnapshot(const std::vector<SequenceNumber>& snapshots);
+
+ SequenceNumber upper_bound() const { return iter_->upper_bound(); }
+
+ SequenceNumber lower_bound() const { return iter_->lower_bound(); }
+
+ private:
+ std::unique_ptr<FragmentedRangeTombstoneIterator> iter_;
+ const InternalKeyComparator* icmp_;
+ const ParsedInternalKey* smallest_ = nullptr;
+ const ParsedInternalKey* largest_ = nullptr;
+ std::list<ParsedInternalKey> pinned_bounds_;
+
+ const InternalKey* smallest_ikey_;
+ const InternalKey* largest_ikey_;
+};
+
+struct SeqMaxComparator {
+ bool operator()(const TruncatedRangeDelIterator* a,
+ const TruncatedRangeDelIterator* b) const {
+ return a->seq() > b->seq();
+ }
+};
+
+struct StartKeyMinComparator {
+ explicit StartKeyMinComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+ bool operator()(const TruncatedRangeDelIterator* a,
+ const TruncatedRangeDelIterator* b) const {
+ return icmp->Compare(a->start_key(), b->start_key()) > 0;
+ }
+
+ const InternalKeyComparator* icmp;
+};
+
+class ForwardRangeDelIterator {
+ public:
+ explicit ForwardRangeDelIterator(const InternalKeyComparator* icmp);
+
+ bool ShouldDelete(const ParsedInternalKey& parsed);
+ void Invalidate();
+
+ void AddNewIter(TruncatedRangeDelIterator* iter,
+ const ParsedInternalKey& parsed) {
+ iter->Seek(parsed.user_key);
+ PushIter(iter, parsed);
+ assert(active_iters_.size() == active_seqnums_.size());
+ }
+
+ size_t UnusedIdx() const { return unused_idx_; }
+ void IncUnusedIdx() { unused_idx_++; }
+
+ private:
+ using ActiveSeqSet =
+ std::multiset<TruncatedRangeDelIterator*, SeqMaxComparator>;
+
+ struct EndKeyMinComparator {
+ explicit EndKeyMinComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+ bool operator()(const ActiveSeqSet::const_iterator& a,
+ const ActiveSeqSet::const_iterator& b) const {
+ return icmp->Compare((*a)->end_key(), (*b)->end_key()) > 0;
+ }
+
+ const InternalKeyComparator* icmp;
+ };
+
+ void PushIter(TruncatedRangeDelIterator* iter,
+ const ParsedInternalKey& parsed) {
+ if (!iter->Valid()) {
+ // The iterator has been fully consumed, so we don't need to add it to
+ // either of the heaps.
+ return;
+ }
+ int cmp = icmp_->Compare(parsed, iter->start_key());
+ if (cmp < 0) {
+ PushInactiveIter(iter);
+ } else {
+ PushActiveIter(iter);
+ }
+ }
+
+ void PushActiveIter(TruncatedRangeDelIterator* iter) {
+ auto seq_pos = active_seqnums_.insert(iter);
+ active_iters_.push(seq_pos);
+ }
+
+ TruncatedRangeDelIterator* PopActiveIter() {
+ auto active_top = active_iters_.top();
+ auto iter = *active_top;
+ active_iters_.pop();
+ active_seqnums_.erase(active_top);
+ return iter;
+ }
+
+ void PushInactiveIter(TruncatedRangeDelIterator* iter) {
+ inactive_iters_.push(iter);
+ }
+
+ TruncatedRangeDelIterator* PopInactiveIter() {
+ auto* iter = inactive_iters_.top();
+ inactive_iters_.pop();
+ return iter;
+ }
+
+ const InternalKeyComparator* icmp_;
+ size_t unused_idx_;
+ ActiveSeqSet active_seqnums_;
+ BinaryHeap<ActiveSeqSet::const_iterator, EndKeyMinComparator> active_iters_;
+ BinaryHeap<TruncatedRangeDelIterator*, StartKeyMinComparator> inactive_iters_;
+};
+
+class ReverseRangeDelIterator {
+ public:
+ explicit ReverseRangeDelIterator(const InternalKeyComparator* icmp);
+
+ bool ShouldDelete(const ParsedInternalKey& parsed);
+ void Invalidate();
+
+ void AddNewIter(TruncatedRangeDelIterator* iter,
+ const ParsedInternalKey& parsed) {
+ iter->SeekForPrev(parsed.user_key);
+ PushIter(iter, parsed);
+ assert(active_iters_.size() == active_seqnums_.size());
+ }
+
+ size_t UnusedIdx() const { return unused_idx_; }
+ void IncUnusedIdx() { unused_idx_++; }
+
+ private:
+ using ActiveSeqSet =
+ std::multiset<TruncatedRangeDelIterator*, SeqMaxComparator>;
+
+ struct EndKeyMaxComparator {
+ explicit EndKeyMaxComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+ bool operator()(const TruncatedRangeDelIterator* a,
+ const TruncatedRangeDelIterator* b) const {
+ return icmp->Compare(a->end_key(), b->end_key()) < 0;
+ }
+
+ const InternalKeyComparator* icmp;
+ };
+ struct StartKeyMaxComparator {
+ explicit StartKeyMaxComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+ bool operator()(const ActiveSeqSet::const_iterator& a,
+ const ActiveSeqSet::const_iterator& b) const {
+ return icmp->Compare((*a)->start_key(), (*b)->start_key()) < 0;
+ }
+
+ const InternalKeyComparator* icmp;
+ };
+
+ void PushIter(TruncatedRangeDelIterator* iter,
+ const ParsedInternalKey& parsed) {
+ if (!iter->Valid()) {
+ // The iterator has been fully consumed, so we don't need to add it to
+ // either of the heaps.
+ } else if (icmp_->Compare(iter->end_key(), parsed) <= 0) {
+ PushInactiveIter(iter);
+ } else {
+ PushActiveIter(iter);
+ }
+ }
+
+ void PushActiveIter(TruncatedRangeDelIterator* iter) {
+ auto seq_pos = active_seqnums_.insert(iter);
+ active_iters_.push(seq_pos);
+ }
+
+ TruncatedRangeDelIterator* PopActiveIter() {
+ auto active_top = active_iters_.top();
+ auto iter = *active_top;
+ active_iters_.pop();
+ active_seqnums_.erase(active_top);
+ return iter;
+ }
+
+ void PushInactiveIter(TruncatedRangeDelIterator* iter) {
+ inactive_iters_.push(iter);
+ }
+
+ TruncatedRangeDelIterator* PopInactiveIter() {
+ auto* iter = inactive_iters_.top();
+ inactive_iters_.pop();
+ return iter;
+ }
+
+ const InternalKeyComparator* icmp_;
+ size_t unused_idx_;
+ ActiveSeqSet active_seqnums_;
+ BinaryHeap<ActiveSeqSet::const_iterator, StartKeyMaxComparator> active_iters_;
+ BinaryHeap<TruncatedRangeDelIterator*, EndKeyMaxComparator> inactive_iters_;
+};
+
+enum class RangeDelPositioningMode { kForwardTraversal, kBackwardTraversal };
+class RangeDelAggregator {
+ public:
+ explicit RangeDelAggregator(const InternalKeyComparator* icmp)
+ : icmp_(icmp) {}
+ virtual ~RangeDelAggregator() {}
+
+ virtual void AddTombstones(
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+ const InternalKey* smallest = nullptr,
+ const InternalKey* largest = nullptr) = 0;
+
+ bool ShouldDelete(const Slice& key, RangeDelPositioningMode mode) {
+ ParsedInternalKey parsed;
+ if (!ParseInternalKey(key, &parsed)) {
+ return false;
+ }
+ return ShouldDelete(parsed, mode);
+ }
+ virtual bool ShouldDelete(const ParsedInternalKey& parsed,
+ RangeDelPositioningMode mode) = 0;
+
+ virtual void InvalidateRangeDelMapPositions() = 0;
+
+ virtual bool IsEmpty() const = 0;
+
+ bool AddFile(uint64_t file_number) {
+ return files_seen_.insert(file_number).second;
+ }
+
+ protected:
+ class StripeRep {
+ public:
+ StripeRep(const InternalKeyComparator* icmp, SequenceNumber upper_bound,
+ SequenceNumber lower_bound)
+ : icmp_(icmp),
+ forward_iter_(icmp),
+ reverse_iter_(icmp),
+ upper_bound_(upper_bound),
+ lower_bound_(lower_bound) {}
+
+ void AddTombstones(std::unique_ptr<TruncatedRangeDelIterator> input_iter) {
+ iters_.push_back(std::move(input_iter));
+ }
+
+ bool IsEmpty() const { return iters_.empty(); }
+
+ bool ShouldDelete(const ParsedInternalKey& parsed,
+ RangeDelPositioningMode mode);
+
+ void Invalidate() {
+ if (!IsEmpty()) {
+ InvalidateForwardIter();
+ InvalidateReverseIter();
+ }
+ }
+
+ bool IsRangeOverlapped(const Slice& start, const Slice& end);
+
+ private:
+ bool InStripe(SequenceNumber seq) const {
+ return lower_bound_ <= seq && seq <= upper_bound_;
+ }
+
+ void InvalidateForwardIter() { forward_iter_.Invalidate(); }
+
+ void InvalidateReverseIter() { reverse_iter_.Invalidate(); }
+
+ const InternalKeyComparator* icmp_;
+ std::vector<std::unique_ptr<TruncatedRangeDelIterator>> iters_;
+ ForwardRangeDelIterator forward_iter_;
+ ReverseRangeDelIterator reverse_iter_;
+ SequenceNumber upper_bound_;
+ SequenceNumber lower_bound_;
+ };
+
+ const InternalKeyComparator* icmp_;
+
+ private:
+ std::set<uint64_t> files_seen_;
+};
+
+class ReadRangeDelAggregator final : public RangeDelAggregator {
+ public:
+ ReadRangeDelAggregator(const InternalKeyComparator* icmp,
+ SequenceNumber upper_bound)
+ : RangeDelAggregator(icmp),
+ rep_(icmp, upper_bound, 0 /* lower_bound */) {}
+ ~ReadRangeDelAggregator() override {}
+
+ using RangeDelAggregator::ShouldDelete;
+ void AddTombstones(
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+ const InternalKey* smallest = nullptr,
+ const InternalKey* largest = nullptr) override;
+
+ bool ShouldDelete(const ParsedInternalKey& parsed,
+ RangeDelPositioningMode mode) final override {
+ if (rep_.IsEmpty()) {
+ return false;
+ }
+ return ShouldDeleteImpl(parsed, mode);
+ }
+
+ bool IsRangeOverlapped(const Slice& start, const Slice& end);
+
+ void InvalidateRangeDelMapPositions() override { rep_.Invalidate(); }
+
+ bool IsEmpty() const override { return rep_.IsEmpty(); }
+
+ private:
+ StripeRep rep_;
+
+ bool ShouldDeleteImpl(const ParsedInternalKey& parsed,
+ RangeDelPositioningMode mode);
+};
+
+class CompactionRangeDelAggregator : public RangeDelAggregator {
+ public:
+ CompactionRangeDelAggregator(const InternalKeyComparator* icmp,
+ const std::vector<SequenceNumber>& snapshots)
+ : RangeDelAggregator(icmp), snapshots_(&snapshots) {}
+ ~CompactionRangeDelAggregator() override {}
+
+ void AddTombstones(
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+ const InternalKey* smallest = nullptr,
+ const InternalKey* largest = nullptr) override;
+
+ using RangeDelAggregator::ShouldDelete;
+ bool ShouldDelete(const ParsedInternalKey& parsed,
+ RangeDelPositioningMode mode) override;
+
+ bool IsRangeOverlapped(const Slice& start, const Slice& end);
+
+ void InvalidateRangeDelMapPositions() override {
+ for (auto& rep : reps_) {
+ rep.second.Invalidate();
+ }
+ }
+
+ bool IsEmpty() const override {
+ for (const auto& rep : reps_) {
+ if (!rep.second.IsEmpty()) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ // Creates an iterator over all the range tombstones in the aggregator, for
+ // use in compaction. Nullptr arguments indicate that the iterator range is
+ // unbounded.
+ // NOTE: the boundaries are used for optimization purposes to reduce the
+ // number of tombstones that are passed to the fragmenter; they do not
+ // guarantee that the resulting iterator only contains range tombstones that
+ // cover keys in the provided range. If required, these bounds must be
+ // enforced during iteration.
+ std::unique_ptr<FragmentedRangeTombstoneIterator> NewIterator(
+ const Slice* lower_bound = nullptr, const Slice* upper_bound = nullptr,
+ bool upper_bound_inclusive = false);
+
+ private:
+ std::vector<std::unique_ptr<TruncatedRangeDelIterator>> parent_iters_;
+ std::map<SequenceNumber, StripeRep> reps_;
+
+ const std::vector<SequenceNumber>* snapshots_;
+};
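+
+// Usage sketch (hypothetical call sites; `icmp`, `snapshots`,
+// `fragmented_tombstone_iter`, and `parsed_key` are placeholders):
+//
+//   CompactionRangeDelAggregator agg(&icmp, snapshots);
+//   agg.AddTombstones(std::move(fragmented_tombstone_iter),
+//                     &file_smallest_key, &file_largest_key);
+//   if (agg.ShouldDelete(parsed_key,
+//                        RangeDelPositioningMode::kForwardTraversal)) {
+//     // Drop the key during compaction.
+//   }
+//   auto tombstone_iter = agg.NewIterator(/*lower_bound=*/nullptr,
+//                                         /*upper_bound=*/nullptr,
+//                                         /*upper_bound_inclusive=*/false);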
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/range_del_aggregator_bench.cc b/src/rocksdb/db/range_del_aggregator_bench.cc
new file mode 100644
index 000000000..3f3135f2e
--- /dev/null
+++ b/src/rocksdb/db/range_del_aggregator_bench.cc
@@ -0,0 +1,260 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+ fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+ return 1;
+}
+#else
+
+#include <iostream>
+#include <iomanip>
+#include <memory>
+#include <random>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/range_del_aggregator.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/random.h"
+#include "util/stop_watch.h"
+
+#include "util/gflags_compat.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+DEFINE_int32(num_range_tombstones, 1000, "number of range tombstones created");
+
+DEFINE_int32(num_runs, 1000, "number of test runs");
+
+DEFINE_int32(tombstone_start_upper_bound, 1000,
+ "exclusive upper bound on range tombstone start keys");
+
+DEFINE_int32(should_delete_upper_bound, 1000,
+ "exclusive upper bound on keys passed to ShouldDelete");
+
+DEFINE_double(tombstone_width_mean, 100.0, "average range tombstone width");
+
+DEFINE_double(tombstone_width_stddev, 0.0,
+ "standard deviation of range tombstone width");
+
+DEFINE_int32(seed, 0, "random number generator seed");
+
+DEFINE_int32(should_deletes_per_run, 1, "number of ShouldDelete calls per run");
+
+DEFINE_int32(add_tombstones_per_run, 1,
+ "number of AddTombstones calls per run");
+
+namespace {
+
+struct Stats {
+ uint64_t time_add_tombstones = 0;
+ uint64_t time_first_should_delete = 0;
+ uint64_t time_rest_should_delete = 0;
+};
+
+std::ostream& operator<<(std::ostream& os, const Stats& s) {
+ std::ios fmt_holder(nullptr);
+ fmt_holder.copyfmt(os);
+
+ os << std::left;
+ os << std::setw(25) << "AddTombstones: "
+ << s.time_add_tombstones /
+ (FLAGS_add_tombstones_per_run * FLAGS_num_runs * 1.0e3)
+ << " us\n";
+ os << std::setw(25) << "ShouldDelete (first): "
+ << s.time_first_should_delete / (FLAGS_num_runs * 1.0e3) << " us\n";
+ if (FLAGS_should_deletes_per_run > 1) {
+ os << std::setw(25) << "ShouldDelete (rest): "
+ << s.time_rest_should_delete /
+ ((FLAGS_should_deletes_per_run - 1) * FLAGS_num_runs * 1.0e3)
+ << " us\n";
+ }
+
+ os.copyfmt(fmt_holder);
+ return os;
+}
+
+auto icmp = ROCKSDB_NAMESPACE::InternalKeyComparator(
+ ROCKSDB_NAMESPACE::BytewiseComparator());
+
+} // anonymous namespace
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// A wrapper around a RangeTombstone and the underlying data of its start and
+// end keys.
+struct PersistentRangeTombstone {
+ std::string start_key;
+ std::string end_key;
+ RangeTombstone tombstone;
+
+ PersistentRangeTombstone(std::string start, std::string end,
+ SequenceNumber seq)
+ : start_key(std::move(start)), end_key(std::move(end)) {
+ tombstone = RangeTombstone(start_key, end_key, seq);
+ }
+
+ PersistentRangeTombstone() = default;
+
+ PersistentRangeTombstone(const PersistentRangeTombstone& t) { *this = t; }
+
+ PersistentRangeTombstone& operator=(const PersistentRangeTombstone& t) {
+ start_key = t.start_key;
+ end_key = t.end_key;
+ tombstone = RangeTombstone(start_key, end_key, t.tombstone.seq_);
+
+ return *this;
+ }
+
+ PersistentRangeTombstone(PersistentRangeTombstone&& t) noexcept { *this = t; }
+
+ PersistentRangeTombstone& operator=(PersistentRangeTombstone&& t) {
+ start_key = std::move(t.start_key);
+ end_key = std::move(t.end_key);
+ tombstone = RangeTombstone(start_key, end_key, t.tombstone.seq_);
+
+ return *this;
+ }
+};
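+
+// Note on the copy and move operations above: RangeTombstone only holds
+// Slices, so `tombstone` is rebuilt from the freshly copied strings to keep
+// it pointing at this object's own start_key/end_key storage.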
+
+struct TombstoneStartKeyComparator {
+ explicit TombstoneStartKeyComparator(const Comparator* c) : cmp(c) {}
+
+ bool operator()(const RangeTombstone& a, const RangeTombstone& b) const {
+ return cmp->Compare(a.start_key_, b.start_key_) < 0;
+ }
+
+ const Comparator* cmp;
+};
+
+std::unique_ptr<InternalIterator> MakeRangeDelIterator(
+ const std::vector<PersistentRangeTombstone>& range_dels) {
+ std::vector<std::string> keys, values;
+ for (const auto& range_del : range_dels) {
+ auto key_and_value = range_del.tombstone.Serialize();
+ keys.push_back(key_and_value.first.Encode().ToString());
+ values.push_back(key_and_value.second.ToString());
+ }
+ return std::unique_ptr<test::VectorIterator>(
+ new test::VectorIterator(keys, values));
+}
+
+// convert long to a big-endian slice key
+static std::string Key(int64_t val) {
+ std::string little_endian_key;
+ std::string big_endian_key;
+ PutFixed64(&little_endian_key, val);
+ assert(little_endian_key.size() == sizeof(val));
+ big_endian_key.resize(sizeof(val));
+ for (size_t i = 0; i < sizeof(val); ++i) {
+ big_endian_key[i] = little_endian_key[sizeof(val) - 1 - i];
+ }
+ return big_endian_key;
+}
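+
+// Illustrative sketch: Key(5) returns the 8-byte string
+// "\x00\x00\x00\x00\x00\x00\x00\x05", so keys compare in numeric order under
+// the bytewise comparator.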
+
+} // anonymous namespace
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ParseCommandLineFlags(&argc, &argv, true);
+
+ Stats stats;
+ ROCKSDB_NAMESPACE::Random64 rnd(FLAGS_seed);
+ std::default_random_engine random_gen(FLAGS_seed);
+ std::normal_distribution<double> normal_dist(FLAGS_tombstone_width_mean,
+ FLAGS_tombstone_width_stddev);
+ std::vector<std::vector<ROCKSDB_NAMESPACE::PersistentRangeTombstone> >
+ all_persistent_range_tombstones(FLAGS_add_tombstones_per_run);
+ for (int i = 0; i < FLAGS_add_tombstones_per_run; i++) {
+ all_persistent_range_tombstones[i] =
+ std::vector<ROCKSDB_NAMESPACE::PersistentRangeTombstone>(
+ FLAGS_num_range_tombstones);
+ }
+ auto mode = ROCKSDB_NAMESPACE::RangeDelPositioningMode::kForwardTraversal;
+
+ for (int i = 0; i < FLAGS_num_runs; i++) {
+ ROCKSDB_NAMESPACE::ReadRangeDelAggregator range_del_agg(
+ &icmp, ROCKSDB_NAMESPACE::kMaxSequenceNumber /* upper_bound */);
+
+ std::vector<
+ std::unique_ptr<ROCKSDB_NAMESPACE::FragmentedRangeTombstoneList> >
+ fragmented_range_tombstone_lists(FLAGS_add_tombstones_per_run);
+
+ for (auto& persistent_range_tombstones : all_persistent_range_tombstones) {
+ // TODO(abhimadan): consider whether creating the range tombstones right
+ // before AddTombstones is artificially warming the cache compared to
+ // real workloads.
+ for (int j = 0; j < FLAGS_num_range_tombstones; j++) {
+ uint64_t start = rnd.Uniform(FLAGS_tombstone_start_upper_bound);
+ uint64_t end = static_cast<uint64_t>(
+ std::round(start + std::max(1.0, normal_dist(random_gen))));
+ persistent_range_tombstones[j] =
+ ROCKSDB_NAMESPACE::PersistentRangeTombstone(
+ ROCKSDB_NAMESPACE::Key(start), ROCKSDB_NAMESPACE::Key(end), j);
+ }
+
+ auto range_del_iter =
+ ROCKSDB_NAMESPACE::MakeRangeDelIterator(persistent_range_tombstones);
+ fragmented_range_tombstone_lists.emplace_back(
+ new ROCKSDB_NAMESPACE::FragmentedRangeTombstoneList(
+ ROCKSDB_NAMESPACE::MakeRangeDelIterator(
+ persistent_range_tombstones),
+ icmp));
+ std::unique_ptr<ROCKSDB_NAMESPACE::FragmentedRangeTombstoneIterator>
+ fragmented_range_del_iter(
+ new ROCKSDB_NAMESPACE::FragmentedRangeTombstoneIterator(
+ fragmented_range_tombstone_lists.back().get(), icmp,
+ ROCKSDB_NAMESPACE::kMaxSequenceNumber));
+
+ ROCKSDB_NAMESPACE::StopWatchNano stop_watch_add_tombstones(
+ ROCKSDB_NAMESPACE::Env::Default(), true /* auto_start */);
+ range_del_agg.AddTombstones(std::move(fragmented_range_del_iter));
+ stats.time_add_tombstones += stop_watch_add_tombstones.ElapsedNanos();
+ }
+
+ ROCKSDB_NAMESPACE::ParsedInternalKey parsed_key;
+ parsed_key.sequence = FLAGS_num_range_tombstones / 2;
+ parsed_key.type = ROCKSDB_NAMESPACE::kTypeValue;
+
+ uint64_t first_key = rnd.Uniform(FLAGS_should_delete_upper_bound -
+ FLAGS_should_deletes_per_run + 1);
+
+ for (int j = 0; j < FLAGS_should_deletes_per_run; j++) {
+ std::string key_string = ROCKSDB_NAMESPACE::Key(first_key + j);
+ parsed_key.user_key = key_string;
+
+ ROCKSDB_NAMESPACE::StopWatchNano stop_watch_should_delete(
+ ROCKSDB_NAMESPACE::Env::Default(), true /* auto_start */);
+ range_del_agg.ShouldDelete(parsed_key, mode);
+ uint64_t call_time = stop_watch_should_delete.ElapsedNanos();
+
+ if (j == 0) {
+ stats.time_first_should_delete += call_time;
+ } else {
+ stats.time_rest_should_delete += call_time;
+ }
+ }
+ }
+
+ std::cout << "=========================\n"
+ << "Results:\n"
+ << "=========================\n"
+ << stats;
+
+ return 0;
+}
+
+#endif // GFLAGS
diff --git a/src/rocksdb/db/range_del_aggregator_test.cc b/src/rocksdb/db/range_del_aggregator_test.cc
new file mode 100644
index 000000000..0b8b5079c
--- /dev/null
+++ b/src/rocksdb/db/range_del_aggregator_test.cc
@@ -0,0 +1,709 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/range_del_aggregator.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class RangeDelAggregatorTest : public testing::Test {};
+
+namespace {
+
+static auto bytewise_icmp = InternalKeyComparator(BytewiseComparator());
+
+std::unique_ptr<InternalIterator> MakeRangeDelIter(
+ const std::vector<RangeTombstone>& range_dels) {
+ std::vector<std::string> keys, values;
+ for (const auto& range_del : range_dels) {
+ auto key_and_value = range_del.Serialize();
+ keys.push_back(key_and_value.first.Encode().ToString());
+ values.push_back(key_and_value.second.ToString());
+ }
+ return std::unique_ptr<test::VectorIterator>(
+ new test::VectorIterator(keys, values));
+}
+
+std::vector<std::unique_ptr<FragmentedRangeTombstoneList>>
+MakeFragmentedTombstoneLists(
+ const std::vector<std::vector<RangeTombstone>>& range_dels_list) {
+ std::vector<std::unique_ptr<FragmentedRangeTombstoneList>> fragment_lists;
+ for (const auto& range_dels : range_dels_list) {
+ auto range_del_iter = MakeRangeDelIter(range_dels);
+ fragment_lists.emplace_back(new FragmentedRangeTombstoneList(
+ std::move(range_del_iter), bytewise_icmp));
+ }
+ return fragment_lists;
+}
+
+struct TruncatedIterScanTestCase {
+ ParsedInternalKey start;
+ ParsedInternalKey end;
+ SequenceNumber seq;
+};
+
+struct TruncatedIterSeekTestCase {
+ Slice target;
+ ParsedInternalKey start;
+ ParsedInternalKey end;
+ SequenceNumber seq;
+ bool invalid;
+};
+
+struct ShouldDeleteTestCase {
+ ParsedInternalKey lookup_key;
+ bool result;
+};
+
+struct IsRangeOverlappedTestCase {
+ Slice start;
+ Slice end;
+ bool result;
+};
+
+ParsedInternalKey UncutEndpoint(const Slice& s) {
+ return ParsedInternalKey(s, kMaxSequenceNumber, kTypeRangeDeletion);
+}
+
+ParsedInternalKey InternalValue(const Slice& key, SequenceNumber seq) {
+ return ParsedInternalKey(key, seq, kTypeValue);
+}
+
+void VerifyIterator(
+ TruncatedRangeDelIterator* iter, const InternalKeyComparator& icmp,
+ const std::vector<TruncatedIterScanTestCase>& expected_range_dels) {
+ // Test forward iteration.
+ iter->SeekToFirst();
+ for (size_t i = 0; i < expected_range_dels.size(); i++, iter->Next()) {
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(0, icmp.Compare(iter->start_key(), expected_range_dels[i].start));
+ EXPECT_EQ(0, icmp.Compare(iter->end_key(), expected_range_dels[i].end));
+ EXPECT_EQ(expected_range_dels[i].seq, iter->seq());
+ }
+ EXPECT_FALSE(iter->Valid());
+
+ // Test reverse iteration.
+ iter->SeekToLast();
+ std::vector<TruncatedIterScanTestCase> reverse_expected_range_dels(
+ expected_range_dels.rbegin(), expected_range_dels.rend());
+ for (size_t i = 0; i < reverse_expected_range_dels.size();
+ i++, iter->Prev()) {
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(0, icmp.Compare(iter->start_key(),
+ reverse_expected_range_dels[i].start));
+ EXPECT_EQ(
+ 0, icmp.Compare(iter->end_key(), reverse_expected_range_dels[i].end));
+ EXPECT_EQ(reverse_expected_range_dels[i].seq, iter->seq());
+ }
+ EXPECT_FALSE(iter->Valid());
+}
+
+void VerifySeek(TruncatedRangeDelIterator* iter,
+ const InternalKeyComparator& icmp,
+ const std::vector<TruncatedIterSeekTestCase>& test_cases) {
+ for (const auto& test_case : test_cases) {
+ iter->Seek(test_case.target);
+ if (test_case.invalid) {
+ ASSERT_FALSE(iter->Valid());
+ } else {
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(0, icmp.Compare(iter->start_key(), test_case.start));
+ EXPECT_EQ(0, icmp.Compare(iter->end_key(), test_case.end));
+ EXPECT_EQ(test_case.seq, iter->seq());
+ }
+ }
+}
+
+void VerifySeekForPrev(
+ TruncatedRangeDelIterator* iter, const InternalKeyComparator& icmp,
+ const std::vector<TruncatedIterSeekTestCase>& test_cases) {
+ for (const auto& test_case : test_cases) {
+ iter->SeekForPrev(test_case.target);
+ if (test_case.invalid) {
+ ASSERT_FALSE(iter->Valid());
+ } else {
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(0, icmp.Compare(iter->start_key(), test_case.start));
+ EXPECT_EQ(0, icmp.Compare(iter->end_key(), test_case.end));
+ EXPECT_EQ(test_case.seq, iter->seq());
+ }
+ }
+}
+
+void VerifyShouldDelete(RangeDelAggregator* range_del_agg,
+ const std::vector<ShouldDeleteTestCase>& test_cases) {
+ for (const auto& test_case : test_cases) {
+ EXPECT_EQ(
+ test_case.result,
+ range_del_agg->ShouldDelete(
+ test_case.lookup_key, RangeDelPositioningMode::kForwardTraversal));
+ }
+ for (auto it = test_cases.rbegin(); it != test_cases.rend(); ++it) {
+ const auto& test_case = *it;
+ EXPECT_EQ(
+ test_case.result,
+ range_del_agg->ShouldDelete(
+ test_case.lookup_key, RangeDelPositioningMode::kBackwardTraversal));
+ }
+}
+
+void VerifyIsRangeOverlapped(
+ ReadRangeDelAggregator* range_del_agg,
+ const std::vector<IsRangeOverlappedTestCase>& test_cases) {
+ for (const auto& test_case : test_cases) {
+ EXPECT_EQ(test_case.result,
+ range_del_agg->IsRangeOverlapped(test_case.start, test_case.end));
+ }
+}
+
+void CheckIterPosition(const RangeTombstone& tombstone,
+ const FragmentedRangeTombstoneIterator* iter) {
+ // Test InternalIterator interface.
+ EXPECT_EQ(tombstone.start_key_, ExtractUserKey(iter->key()));
+ EXPECT_EQ(tombstone.end_key_, iter->value());
+ EXPECT_EQ(tombstone.seq_, iter->seq());
+
+ // Test FragmentedRangeTombstoneIterator interface.
+ EXPECT_EQ(tombstone.start_key_, iter->start_key());
+ EXPECT_EQ(tombstone.end_key_, iter->end_key());
+ EXPECT_EQ(tombstone.seq_, GetInternalKeySeqno(iter->key()));
+}
+
+void VerifyFragmentedRangeDels(
+ FragmentedRangeTombstoneIterator* iter,
+ const std::vector<RangeTombstone>& expected_tombstones) {
+ iter->SeekToFirst();
+ for (size_t i = 0; i < expected_tombstones.size(); i++, iter->Next()) {
+ ASSERT_TRUE(iter->Valid());
+ CheckIterPosition(expected_tombstones[i], iter);
+ }
+ EXPECT_FALSE(iter->Valid());
+}
+
+} // namespace
+
+TEST_F(RangeDelAggregatorTest, EmptyTruncatedIter) {
+ auto range_del_iter = MakeRangeDelIter({});
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber));
+
+ TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr,
+ nullptr);
+
+ iter.SeekToFirst();
+ ASSERT_FALSE(iter.Valid());
+
+ iter.SeekToLast();
+ ASSERT_FALSE(iter.Valid());
+}
+
+TEST_F(RangeDelAggregatorTest, UntruncatedIter) {
+ auto range_del_iter =
+ MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}});
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber));
+
+ TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr,
+ nullptr);
+
+ VerifyIterator(&iter, bytewise_icmp,
+ {{UncutEndpoint("a"), UncutEndpoint("e"), 10},
+ {UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {UncutEndpoint("j"), UncutEndpoint("n"), 4}});
+
+ VerifySeek(
+ &iter, bytewise_icmp,
+ {{"d", UncutEndpoint("a"), UncutEndpoint("e"), 10},
+ {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"ia", UncutEndpoint("j"), UncutEndpoint("n"), 4},
+ {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+ {"", UncutEndpoint("a"), UncutEndpoint("e"), 10}});
+
+ VerifySeekForPrev(
+ &iter, bytewise_icmp,
+ {{"d", UncutEndpoint("a"), UncutEndpoint("e"), 10},
+ {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"n", UncutEndpoint("j"), UncutEndpoint("n"), 4},
+ {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
+}
+
+TEST_F(RangeDelAggregatorTest, UntruncatedIterWithSnapshot) {
+ auto range_del_iter =
+ MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}});
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+ 9 /* snapshot */));
+
+ TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr,
+ nullptr);
+
+ VerifyIterator(&iter, bytewise_icmp,
+ {{UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {UncutEndpoint("j"), UncutEndpoint("n"), 4}});
+
+ VerifySeek(
+ &iter, bytewise_icmp,
+ {{"d", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"ia", UncutEndpoint("j"), UncutEndpoint("n"), 4},
+ {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+ {"", UncutEndpoint("e"), UncutEndpoint("g"), 8}});
+
+ VerifySeekForPrev(
+ &iter, bytewise_icmp,
+ {{"d", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+ {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"n", UncutEndpoint("j"), UncutEndpoint("n"), 4},
+ {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
+}
+
+TEST_F(RangeDelAggregatorTest, TruncatedIterPartiallyCutTombstones) {
+ auto range_del_iter =
+ MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}});
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber));
+
+ InternalKey smallest("d", 7, kTypeValue);
+ InternalKey largest("m", 9, kTypeValue);
+ TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp,
+ &smallest, &largest);
+
+ VerifyIterator(&iter, bytewise_icmp,
+ {{InternalValue("d", 7), UncutEndpoint("e"), 10},
+ {UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {UncutEndpoint("j"), InternalValue("m", 8), 4}});
+
+ VerifySeek(
+ &iter, bytewise_icmp,
+ {{"d", InternalValue("d", 7), UncutEndpoint("e"), 10},
+ {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"ia", UncutEndpoint("j"), InternalValue("m", 8), 4},
+ {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+ {"", InternalValue("d", 7), UncutEndpoint("e"), 10}});
+
+ VerifySeekForPrev(
+ &iter, bytewise_icmp,
+ {{"d", InternalValue("d", 7), UncutEndpoint("e"), 10},
+ {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"n", UncutEndpoint("j"), InternalValue("m", 8), 4},
+ {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
+}
+
+TEST_F(RangeDelAggregatorTest, TruncatedIterFullyCutTombstones) {
+ auto range_del_iter =
+ MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}});
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber));
+
+ InternalKey smallest("f", 7, kTypeValue);
+ InternalKey largest("i", 9, kTypeValue);
+ TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp,
+ &smallest, &largest);
+
+ VerifyIterator(&iter, bytewise_icmp,
+ {{InternalValue("f", 7), UncutEndpoint("g"), 8}});
+
+ VerifySeek(
+ &iter, bytewise_icmp,
+ {{"d", InternalValue("f", 7), UncutEndpoint("g"), 8},
+ {"f", InternalValue("f", 7), UncutEndpoint("g"), 8},
+ {"j", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
+
+ VerifySeekForPrev(
+ &iter, bytewise_icmp,
+ {{"d", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+ {"f", InternalValue("f", 7), UncutEndpoint("g"), 8},
+ {"j", InternalValue("f", 7), UncutEndpoint("g"), 8}});
+}
+
+TEST_F(RangeDelAggregatorTest, SingleIterInAggregator) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, {"c", "g", 8}});
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber));
+
+ ReadRangeDelAggregator range_del_agg(&bytewise_icmp, kMaxSequenceNumber);
+ range_del_agg.AddTombstones(std::move(input_iter));
+
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), false},
+ {InternalValue("b", 9), true},
+ {InternalValue("d", 9), true},
+ {InternalValue("e", 7), true},
+ {InternalValue("g", 7), false}});
+
+ VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+ {"_", "a", true},
+ {"a", "c", true},
+ {"d", "f", true},
+ {"g", "l", false}});
+}
+
+TEST_F(RangeDelAggregatorTest, MultipleItersInAggregator) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "e", 10}, {"c", "g", 8}},
+ {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ ReadRangeDelAggregator range_del_agg(&bytewise_icmp, kMaxSequenceNumber);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ kMaxSequenceNumber));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), true},
+ {InternalValue("b", 19), false},
+ {InternalValue("b", 9), true},
+ {InternalValue("d", 9), true},
+ {InternalValue("e", 7), true},
+ {InternalValue("g", 7), false},
+ {InternalValue("h", 24), true},
+ {InternalValue("i", 24), false},
+ {InternalValue("ii", 14), true},
+ {InternalValue("j", 14), false}});
+
+ VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+ {"_", "a", true},
+ {"a", "c", true},
+ {"d", "f", true},
+ {"g", "l", true},
+ {"x", "y", false}});
+}
+
+TEST_F(RangeDelAggregatorTest, MultipleItersInAggregatorWithUpperBound) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "e", 10}, {"c", "g", 8}},
+ {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ ReadRangeDelAggregator range_del_agg(&bytewise_icmp, 19);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ 19 /* snapshot */));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), false},
+ {InternalValue("a", 9), true},
+ {InternalValue("b", 9), true},
+ {InternalValue("d", 9), true},
+ {InternalValue("e", 7), true},
+ {InternalValue("g", 7), false},
+ {InternalValue("h", 24), false},
+ {InternalValue("i", 24), false},
+ {InternalValue("ii", 14), true},
+ {InternalValue("j", 14), false}});
+
+ VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+ {"_", "a", true},
+ {"a", "c", true},
+ {"d", "f", true},
+ {"g", "l", true},
+ {"x", "y", false}});
+}
+
+TEST_F(RangeDelAggregatorTest, MultipleTruncatedItersInAggregator) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "z", 10}}, {{"a", "z", 10}}, {{"a", "z", 10}}});
+ std::vector<std::pair<InternalKey, InternalKey>> iter_bounds = {
+ {InternalKey("a", 4, kTypeValue),
+ InternalKey("m", kMaxSequenceNumber, kTypeRangeDeletion)},
+ {InternalKey("m", 20, kTypeValue),
+ InternalKey("x", kMaxSequenceNumber, kTypeRangeDeletion)},
+ {InternalKey("x", 5, kTypeValue), InternalKey("zz", 30, kTypeValue)}};
+
+ ReadRangeDelAggregator range_del_agg(&bytewise_icmp, 19);
+ for (size_t i = 0; i < fragment_lists.size(); i++) {
+ const auto& fragment_list = fragment_lists[i];
+ const auto& bounds = iter_bounds[i];
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ 19 /* snapshot */));
+ range_del_agg.AddTombstones(std::move(input_iter), &bounds.first,
+ &bounds.second);
+ }
+
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 10), false},
+ {InternalValue("a", 9), false},
+ {InternalValue("a", 4), true},
+ {InternalValue("m", 10), false},
+ {InternalValue("m", 9), true},
+ {InternalValue("x", 10), false},
+ {InternalValue("x", 9), false},
+ {InternalValue("x", 5), true},
+ {InternalValue("z", 9), false}});
+
+ VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+ {"_", "a", true},
+ {"a", "n", true},
+ {"l", "x", true},
+ {"w", "z", true},
+ {"zzz", "zz", false},
+ {"zz", "zzz", false}});
+}
+
+TEST_F(RangeDelAggregatorTest, MultipleTruncatedItersInAggregatorSameLevel) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "z", 10}}, {{"a", "z", 10}}, {{"a", "z", 10}}});
+ std::vector<std::pair<InternalKey, InternalKey>> iter_bounds = {
+ {InternalKey("a", 4, kTypeValue),
+ InternalKey("m", kMaxSequenceNumber, kTypeRangeDeletion)},
+ {InternalKey("m", 20, kTypeValue),
+ InternalKey("x", kMaxSequenceNumber, kTypeRangeDeletion)},
+ {InternalKey("x", 5, kTypeValue), InternalKey("zz", 30, kTypeValue)}};
+
+ ReadRangeDelAggregator range_del_agg(&bytewise_icmp, 19);
+
+ auto add_iter_to_agg = [&](size_t i) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_lists[i].get(),
+ bytewise_icmp, 19 /* snapshot */));
+ range_del_agg.AddTombstones(std::move(input_iter), &iter_bounds[i].first,
+ &iter_bounds[i].second);
+ };
+
+ add_iter_to_agg(0);
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 10), false},
+ {InternalValue("a", 9), false},
+ {InternalValue("a", 4), true}});
+
+ add_iter_to_agg(1);
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("m", 10), false},
+ {InternalValue("m", 9), true}});
+
+ add_iter_to_agg(2);
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("x", 10), false},
+ {InternalValue("x", 9), false},
+ {InternalValue("x", 5), true},
+ {InternalValue("z", 9), false}});
+
+ VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+ {"_", "a", true},
+ {"a", "n", true},
+ {"l", "x", true},
+ {"w", "z", true},
+ {"zzz", "zz", false},
+ {"zz", "zzz", false}});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorNoSnapshots) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "e", 10}, {"c", "g", 8}},
+ {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ std::vector<SequenceNumber> snapshots;
+ CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ kMaxSequenceNumber));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), true},
+ {InternalValue("b", 19), false},
+ {InternalValue("b", 9), true},
+ {InternalValue("d", 9), true},
+ {InternalValue("e", 7), true},
+ {InternalValue("g", 7), false},
+ {InternalValue("h", 24), true},
+ {InternalValue("i", 24), false},
+ {InternalValue("ii", 14), true},
+ {InternalValue("j", 14), false}});
+
+ auto range_del_compaction_iter = range_del_agg.NewIterator();
+ VerifyFragmentedRangeDels(range_del_compaction_iter.get(), {{"a", "b", 20},
+ {"b", "c", 10},
+ {"c", "e", 10},
+ {"e", "g", 8},
+ {"h", "i", 25},
+ {"ii", "j", 15}});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorWithSnapshots) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "e", 10}, {"c", "g", 8}},
+ {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ std::vector<SequenceNumber> snapshots{9, 19};
+ CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ kMaxSequenceNumber));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ VerifyShouldDelete(
+ &range_del_agg,
+ {
+ {InternalValue("a", 19), false}, // [10, 19]
+ {InternalValue("a", 9), false}, // [0, 9]
+ {InternalValue("b", 9), false}, // [0, 9]
+ {InternalValue("d", 9), false}, // [0, 9]
+ {InternalValue("d", 7), true}, // [0, 9]
+ {InternalValue("e", 7), true}, // [0, 9]
+ {InternalValue("g", 7), false}, // [0, 9]
+ {InternalValue("h", 24), true}, // [20, kMaxSequenceNumber]
+ {InternalValue("i", 24), false}, // [20, kMaxSequenceNumber]
+ {InternalValue("ii", 14), true}, // [10, 19]
+ {InternalValue("j", 14), false} // [10, 19]
+ });
+
+ auto range_del_compaction_iter = range_del_agg.NewIterator();
+ VerifyFragmentedRangeDels(range_del_compaction_iter.get(), {{"a", "b", 20},
+ {"a", "b", 10},
+ {"b", "c", 10},
+ {"c", "e", 10},
+ {"c", "e", 8},
+ {"e", "g", 8},
+ {"h", "i", 25},
+ {"ii", "j", 15}});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorEmptyIteratorLeft) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "e", 10}, {"c", "g", 8}},
+ {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ std::vector<SequenceNumber> snapshots{9, 19};
+ CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ kMaxSequenceNumber));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+  Slice start("_");
+  Slice end("__");
+  // All of the tombstones start after ["_", "__"], so both the exclusive- and
+  // inclusive-end-key iterators should be empty (mirroring the
+  // CompactionAggregatorEmptyIteratorRight test below).
+  auto range_del_compaction_iter1 =
+      range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */);
+  VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {});
+
+  auto range_del_compaction_iter2 =
+      range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */);
+  VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorEmptyIteratorRight) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "e", 10}, {"c", "g", 8}},
+ {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ std::vector<SequenceNumber> snapshots{9, 19};
+ CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ kMaxSequenceNumber));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ Slice start("p");
+ Slice end("q");
+ auto range_del_compaction_iter1 =
+ range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */);
+ VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {});
+
+ auto range_del_compaction_iter2 =
+ range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */);
+ VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorBoundedIterator) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "e", 10}, {"c", "g", 8}},
+ {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ std::vector<SequenceNumber> snapshots{9, 19};
+ CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ kMaxSequenceNumber));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ Slice start("bb");
+ Slice end("e");
+ auto range_del_compaction_iter1 =
+ range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */);
+ VerifyFragmentedRangeDels(range_del_compaction_iter1.get(),
+ {{"a", "c", 10}, {"c", "e", 10}, {"c", "e", 8}});
+
+ auto range_del_compaction_iter2 =
+ range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */);
+ VerifyFragmentedRangeDels(
+ range_del_compaction_iter2.get(),
+ {{"a", "c", 10}, {"c", "e", 10}, {"c", "e", 8}, {"e", "g", 8}});
+}
+
+TEST_F(RangeDelAggregatorTest,
+ CompactionAggregatorBoundedIteratorExtraFragments) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "d", 10}, {"c", "g", 8}},
+ {{"b", "c", 20}, {"d", "f", 30}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ std::vector<SequenceNumber> snapshots{9, 19};
+ CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ kMaxSequenceNumber));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ Slice start("bb");
+ Slice end("e");
+ auto range_del_compaction_iter1 =
+ range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */);
+ VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {{"a", "b", 10},
+ {"b", "c", 20},
+ {"b", "c", 10},
+ {"c", "d", 10},
+ {"c", "d", 8},
+ {"d", "f", 30},
+ {"d", "f", 8},
+ {"f", "g", 8}});
+
+ auto range_del_compaction_iter2 =
+ range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */);
+ VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {{"a", "b", 10},
+ {"b", "c", 20},
+ {"b", "c", 10},
+ {"c", "d", 10},
+ {"c", "d", 8},
+ {"d", "f", 30},
+ {"d", "f", 8},
+ {"f", "g", 8}});
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/range_tombstone_fragmenter.cc b/src/rocksdb/db/range_tombstone_fragmenter.cc
new file mode 100644
index 000000000..58426248c
--- /dev/null
+++ b/src/rocksdb/db/range_tombstone_fragmenter.cc
@@ -0,0 +1,439 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/range_tombstone_fragmenter.h"
+
+#include <algorithm>
+#include <functional>
+#include <set>
+
+#include <stdio.h>
+#include <cinttypes>
+
+#include "util/autovector.h"
+#include "util/kv_map.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+FragmentedRangeTombstoneList::FragmentedRangeTombstoneList(
+ std::unique_ptr<InternalIterator> unfragmented_tombstones,
+ const InternalKeyComparator& icmp, bool for_compaction,
+ const std::vector<SequenceNumber>& snapshots) {
+ if (unfragmented_tombstones == nullptr) {
+ return;
+ }
+ bool is_sorted = true;
+ int num_tombstones = 0;
+ InternalKey pinned_last_start_key;
+ Slice last_start_key;
+ for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
+ unfragmented_tombstones->Next(), num_tombstones++) {
+ if (num_tombstones > 0 &&
+ icmp.Compare(last_start_key, unfragmented_tombstones->key()) > 0) {
+ is_sorted = false;
+ break;
+ }
+ if (unfragmented_tombstones->IsKeyPinned()) {
+ last_start_key = unfragmented_tombstones->key();
+ } else {
+ pinned_last_start_key.DecodeFrom(unfragmented_tombstones->key());
+ last_start_key = pinned_last_start_key.Encode();
+ }
+ }
+ if (is_sorted) {
+ FragmentTombstones(std::move(unfragmented_tombstones), icmp, for_compaction,
+ snapshots);
+ return;
+ }
+
+ // Sort the tombstones before fragmenting them.
+ std::vector<std::string> keys, values;
+ keys.reserve(num_tombstones);
+ values.reserve(num_tombstones);
+ for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
+ unfragmented_tombstones->Next()) {
+ keys.emplace_back(unfragmented_tombstones->key().data(),
+ unfragmented_tombstones->key().size());
+ values.emplace_back(unfragmented_tombstones->value().data(),
+ unfragmented_tombstones->value().size());
+ }
+ // VectorIterator implicitly sorts by key during construction.
+ auto iter = std::unique_ptr<VectorIterator>(
+ new VectorIterator(std::move(keys), std::move(values), &icmp));
+ FragmentTombstones(std::move(iter), icmp, for_compaction, snapshots);
+}
+
+void FragmentedRangeTombstoneList::FragmentTombstones(
+ std::unique_ptr<InternalIterator> unfragmented_tombstones,
+ const InternalKeyComparator& icmp, bool for_compaction,
+ const std::vector<SequenceNumber>& snapshots) {
+ Slice cur_start_key(nullptr, 0);
+ auto cmp = ParsedInternalKeyComparator(&icmp);
+
+ // Stores the end keys and sequence numbers of range tombstones with a start
+ // key less than or equal to cur_start_key. Provides an ordering by end key
+ // for use in flush_current_tombstones.
+ std::set<ParsedInternalKey, ParsedInternalKeyComparator> cur_end_keys(cmp);
+
+ // Given the next start key in unfragmented_tombstones,
+ // flush_current_tombstones writes every tombstone fragment that starts
+ // and ends with a key before next_start_key, and starts with a key greater
+ // than or equal to cur_start_key.
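+  // For example, given input tombstones [a, e) @ 10 and [c, g) @ 8: when the
+  // start key advances from "a" to "c", the fragment [a, c) @ {10} is
+  // flushed; when the final end key "g" is flushed, the fragments
+  // [c, e) @ {10, 8} and [e, g) @ {8} follow.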
+ auto flush_current_tombstones = [&](const Slice& next_start_key) {
+ auto it = cur_end_keys.begin();
+ bool reached_next_start_key = false;
+ for (; it != cur_end_keys.end() && !reached_next_start_key; ++it) {
+ Slice cur_end_key = it->user_key;
+ if (icmp.user_comparator()->Compare(cur_start_key, cur_end_key) == 0) {
+ // Empty tombstone.
+ continue;
+ }
+ if (icmp.user_comparator()->Compare(next_start_key, cur_end_key) <= 0) {
+ // All of the end keys in [it, cur_end_keys.end()) are after
+ // next_start_key, so the tombstones they represent can be used in
+ // fragments that start with keys greater than or equal to
+ // next_start_key. However, the end keys we already passed will not be
+ // used in any more tombstone fragments.
+ //
+ // Remove the fully fragmented tombstones and stop iteration after a
+ // final round of flushing to preserve the tombstones we can create more
+ // fragments from.
+ reached_next_start_key = true;
+ cur_end_keys.erase(cur_end_keys.begin(), it);
+ cur_end_key = next_start_key;
+ }
+
+ // Flush a range tombstone fragment [cur_start_key, cur_end_key), which
+ // should not overlap with the last-flushed tombstone fragment.
+ assert(tombstones_.empty() ||
+ icmp.user_comparator()->Compare(tombstones_.back().end_key,
+ cur_start_key) <= 0);
+
+ // Sort the sequence numbers of the tombstones being fragmented in
+ // descending order, and then flush them in that order.
+ autovector<SequenceNumber> seqnums_to_flush;
+ for (auto flush_it = it; flush_it != cur_end_keys.end(); ++flush_it) {
+ seqnums_to_flush.push_back(flush_it->sequence);
+ }
+ std::sort(seqnums_to_flush.begin(), seqnums_to_flush.end(),
+ std::greater<SequenceNumber>());
+
+ size_t start_idx = tombstone_seqs_.size();
+ size_t end_idx = start_idx + seqnums_to_flush.size();
+
+ if (for_compaction) {
+ // Drop all tombstone seqnums that are not preserved by a snapshot.
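+        // For example, with snapshots {9, 20} and a fragment whose seqnums
+        // are {10, 8, 6}: 10 is kept (the newest seqnum visible to snapshot
+        // 20), 8 is kept (the newest visible to snapshot 9), and 6 is dropped
+        // because no snapshot sees it as its newest covering tombstone.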
+ SequenceNumber next_snapshot = kMaxSequenceNumber;
+ for (auto seq : seqnums_to_flush) {
+ if (seq <= next_snapshot) {
+            // This seqnum is visible to a lower snapshot.
+ tombstone_seqs_.push_back(seq);
+ seq_set_.insert(seq);
+ auto upper_bound_it =
+ std::lower_bound(snapshots.begin(), snapshots.end(), seq);
+ if (upper_bound_it == snapshots.begin()) {
+ // This seqnum is the topmost one visible by the earliest
+ // snapshot. None of the seqnums below it will be visible, so we
+ // can skip them.
+ break;
+ }
+ next_snapshot = *std::prev(upper_bound_it);
+ }
+ }
+ end_idx = tombstone_seqs_.size();
+ } else {
+ // The fragmentation is being done for reads, so preserve all seqnums.
+ tombstone_seqs_.insert(tombstone_seqs_.end(), seqnums_to_flush.begin(),
+ seqnums_to_flush.end());
+ seq_set_.insert(seqnums_to_flush.begin(), seqnums_to_flush.end());
+ }
+
+ assert(start_idx < end_idx);
+ tombstones_.emplace_back(cur_start_key, cur_end_key, start_idx, end_idx);
+
+ cur_start_key = cur_end_key;
+ }
+ if (!reached_next_start_key) {
+ // There is a gap between the last flushed tombstone fragment and
+ // the next tombstone's start key. Remove all the end keys in
+ // the working set, since we have fully fragmented their corresponding
+ // tombstones.
+ cur_end_keys.clear();
+ }
+ cur_start_key = next_start_key;
+ };
+
+ pinned_iters_mgr_.StartPinning();
+
+ bool no_tombstones = true;
+ for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
+ unfragmented_tombstones->Next()) {
+ const Slice& ikey = unfragmented_tombstones->key();
+ Slice tombstone_start_key = ExtractUserKey(ikey);
+ SequenceNumber tombstone_seq = GetInternalKeySeqno(ikey);
+ if (!unfragmented_tombstones->IsKeyPinned()) {
+ pinned_slices_.emplace_back(tombstone_start_key.data(),
+ tombstone_start_key.size());
+ tombstone_start_key = pinned_slices_.back();
+ }
+ no_tombstones = false;
+
+ Slice tombstone_end_key = unfragmented_tombstones->value();
+ if (!unfragmented_tombstones->IsValuePinned()) {
+ pinned_slices_.emplace_back(tombstone_end_key.data(),
+ tombstone_end_key.size());
+ tombstone_end_key = pinned_slices_.back();
+ }
+ if (!cur_end_keys.empty() && icmp.user_comparator()->Compare(
+ cur_start_key, tombstone_start_key) != 0) {
+ // The start key has changed. Flush all tombstones that start before
+ // this new start key.
+ flush_current_tombstones(tombstone_start_key);
+ }
+ cur_start_key = tombstone_start_key;
+
+ cur_end_keys.emplace(tombstone_end_key, tombstone_seq, kTypeRangeDeletion);
+ }
+ if (!cur_end_keys.empty()) {
+ ParsedInternalKey last_end_key = *std::prev(cur_end_keys.end());
+ flush_current_tombstones(last_end_key.user_key);
+ }
+
+ if (!no_tombstones) {
+ pinned_iters_mgr_.PinIterator(unfragmented_tombstones.release(),
+ false /* arena */);
+ }
+}
+
+bool FragmentedRangeTombstoneList::ContainsRange(SequenceNumber lower,
+ SequenceNumber upper) const {
+ auto seq_it = seq_set_.lower_bound(lower);
+ return seq_it != seq_set_.end() && *seq_it <= upper;
+}
+
+FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
+ const FragmentedRangeTombstoneList* tombstones,
+ const InternalKeyComparator& icmp, SequenceNumber _upper_bound,
+ SequenceNumber _lower_bound)
+ : tombstone_start_cmp_(icmp.user_comparator()),
+ tombstone_end_cmp_(icmp.user_comparator()),
+ icmp_(&icmp),
+ ucmp_(icmp.user_comparator()),
+ tombstones_(tombstones),
+ upper_bound_(_upper_bound),
+ lower_bound_(_lower_bound) {
+ assert(tombstones_ != nullptr);
+ Invalidate();
+}
+
+FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
+ const std::shared_ptr<const FragmentedRangeTombstoneList>& tombstones,
+ const InternalKeyComparator& icmp, SequenceNumber _upper_bound,
+ SequenceNumber _lower_bound)
+ : tombstone_start_cmp_(icmp.user_comparator()),
+ tombstone_end_cmp_(icmp.user_comparator()),
+ icmp_(&icmp),
+ ucmp_(icmp.user_comparator()),
+ tombstones_ref_(tombstones),
+ tombstones_(tombstones_ref_.get()),
+ upper_bound_(_upper_bound),
+ lower_bound_(_lower_bound) {
+ assert(tombstones_ != nullptr);
+ Invalidate();
+}
+
+void FragmentedRangeTombstoneIterator::SeekToFirst() {
+ pos_ = tombstones_->begin();
+ seq_pos_ = tombstones_->seq_begin();
+}
+
+void FragmentedRangeTombstoneIterator::SeekToTopFirst() {
+ if (tombstones_->empty()) {
+ Invalidate();
+ return;
+ }
+ pos_ = tombstones_->begin();
+ seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+ tombstones_->seq_iter(pos_->seq_end_idx),
+ upper_bound_, std::greater<SequenceNumber>());
+ ScanForwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::SeekToLast() {
+ pos_ = std::prev(tombstones_->end());
+ seq_pos_ = std::prev(tombstones_->seq_end());
+}
+
+void FragmentedRangeTombstoneIterator::SeekToTopLast() {
+ if (tombstones_->empty()) {
+ Invalidate();
+ return;
+ }
+ pos_ = std::prev(tombstones_->end());
+ seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+ tombstones_->seq_iter(pos_->seq_end_idx),
+ upper_bound_, std::greater<SequenceNumber>());
+ ScanBackwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::Seek(const Slice& target) {
+ if (tombstones_->empty()) {
+ Invalidate();
+ return;
+ }
+ SeekToCoveringTombstone(target);
+ ScanForwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::SeekForPrev(const Slice& target) {
+ if (tombstones_->empty()) {
+ Invalidate();
+ return;
+ }
+ SeekForPrevToCoveringTombstone(target);
+ ScanBackwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::SeekToCoveringTombstone(
+ const Slice& target) {
+ pos_ = std::upper_bound(tombstones_->begin(), tombstones_->end(), target,
+ tombstone_end_cmp_);
+ if (pos_ == tombstones_->end()) {
+ // All tombstones end before target.
+ seq_pos_ = tombstones_->seq_end();
+ return;
+ }
+ seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+ tombstones_->seq_iter(pos_->seq_end_idx),
+ upper_bound_, std::greater<SequenceNumber>());
+}
+
+void FragmentedRangeTombstoneIterator::SeekForPrevToCoveringTombstone(
+ const Slice& target) {
+ if (tombstones_->empty()) {
+ Invalidate();
+ return;
+ }
+ pos_ = std::upper_bound(tombstones_->begin(), tombstones_->end(), target,
+ tombstone_start_cmp_);
+ if (pos_ == tombstones_->begin()) {
+ // All tombstones start after target.
+ Invalidate();
+ return;
+ }
+ --pos_;
+ seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+ tombstones_->seq_iter(pos_->seq_end_idx),
+ upper_bound_, std::greater<SequenceNumber>());
+}
+
+void FragmentedRangeTombstoneIterator::ScanForwardToVisibleTombstone() {
+ while (pos_ != tombstones_->end() &&
+ (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx) ||
+ *seq_pos_ < lower_bound_)) {
+ ++pos_;
+ if (pos_ == tombstones_->end()) {
+ Invalidate();
+ return;
+ }
+ seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+ tombstones_->seq_iter(pos_->seq_end_idx),
+ upper_bound_, std::greater<SequenceNumber>());
+ }
+}
+
+void FragmentedRangeTombstoneIterator::ScanBackwardToVisibleTombstone() {
+ while (pos_ != tombstones_->end() &&
+ (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx) ||
+ *seq_pos_ < lower_bound_)) {
+ if (pos_ == tombstones_->begin()) {
+ Invalidate();
+ return;
+ }
+ --pos_;
+ seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+ tombstones_->seq_iter(pos_->seq_end_idx),
+ upper_bound_, std::greater<SequenceNumber>());
+ }
+}
+
+void FragmentedRangeTombstoneIterator::Next() {
+ ++seq_pos_;
+ if (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx)) {
+ ++pos_;
+ }
+}
+
+void FragmentedRangeTombstoneIterator::TopNext() {
+ ++pos_;
+ if (pos_ == tombstones_->end()) {
+ return;
+ }
+ seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+ tombstones_->seq_iter(pos_->seq_end_idx),
+ upper_bound_, std::greater<SequenceNumber>());
+ ScanForwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::Prev() {
+ if (seq_pos_ == tombstones_->seq_begin()) {
+ Invalidate();
+ return;
+ }
+ --seq_pos_;
+ if (pos_ == tombstones_->end() ||
+ seq_pos_ == tombstones_->seq_iter(pos_->seq_start_idx - 1)) {
+ --pos_;
+ }
+}
+
+void FragmentedRangeTombstoneIterator::TopPrev() {
+ if (pos_ == tombstones_->begin()) {
+ Invalidate();
+ return;
+ }
+ --pos_;
+ seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+ tombstones_->seq_iter(pos_->seq_end_idx),
+ upper_bound_, std::greater<SequenceNumber>());
+ ScanBackwardToVisibleTombstone();
+}
+
+bool FragmentedRangeTombstoneIterator::Valid() const {
+ return tombstones_ != nullptr && pos_ != tombstones_->end();
+}
+
+SequenceNumber FragmentedRangeTombstoneIterator::MaxCoveringTombstoneSeqnum(
+ const Slice& target_user_key) {
+ SeekToCoveringTombstone(target_user_key);
+ return ValidPos() && ucmp_->Compare(start_key(), target_user_key) <= 0 ? seq()
+ : 0;
+}
+
+std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
+FragmentedRangeTombstoneIterator::SplitBySnapshot(
+ const std::vector<SequenceNumber>& snapshots) {
+ std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ splits;
+ SequenceNumber lower = 0;
+ SequenceNumber upper;
+ for (size_t i = 0; i <= snapshots.size(); i++) {
+ if (i >= snapshots.size()) {
+ upper = kMaxSequenceNumber;
+ } else {
+ upper = snapshots[i];
+ }
+ if (tombstones_->ContainsRange(lower, upper)) {
+ splits.emplace(upper, std::unique_ptr<FragmentedRangeTombstoneIterator>(
+ new FragmentedRangeTombstoneIterator(
+ tombstones_, *icmp_, upper, lower)));
+ }
+ lower = upper + 1;
+ }
+ return splits;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/range_tombstone_fragmenter.h b/src/rocksdb/db/range_tombstone_fragmenter.h
new file mode 100644
index 000000000..63ec24e64
--- /dev/null
+++ b/src/rocksdb/db/range_tombstone_fragmenter.h
@@ -0,0 +1,256 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <list>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/pinned_iterators_manager.h"
+#include "rocksdb/status.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct FragmentedRangeTombstoneList {
+ public:
+ // A compact representation of a "stack" of range tombstone fragments, which
+ // start and end at the same user keys but have different sequence numbers.
+ // The members seq_start_idx and seq_end_idx are intended to be parameters to
+ // seq_iter().
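+  // For example, the overlapping tombstones [a, e) @ 10 and [c, g) @ 8
+  // fragment into the stacks [a, c) -> {10}, [c, e) -> {10, 8}, and
+  // [e, g) -> {8}.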
+ struct RangeTombstoneStack {
+ RangeTombstoneStack(const Slice& start, const Slice& end, size_t start_idx,
+ size_t end_idx)
+ : start_key(start),
+ end_key(end),
+ seq_start_idx(start_idx),
+ seq_end_idx(end_idx) {}
+
+ Slice start_key;
+ Slice end_key;
+ size_t seq_start_idx;
+ size_t seq_end_idx;
+ };
+ FragmentedRangeTombstoneList(
+ std::unique_ptr<InternalIterator> unfragmented_tombstones,
+ const InternalKeyComparator& icmp, bool for_compaction = false,
+ const std::vector<SequenceNumber>& snapshots = {});
+
+ std::vector<RangeTombstoneStack>::const_iterator begin() const {
+ return tombstones_.begin();
+ }
+
+ std::vector<RangeTombstoneStack>::const_iterator end() const {
+ return tombstones_.end();
+ }
+
+ std::vector<SequenceNumber>::const_iterator seq_iter(size_t idx) const {
+ return std::next(tombstone_seqs_.begin(), idx);
+ }
+
+ std::vector<SequenceNumber>::const_iterator seq_begin() const {
+ return tombstone_seqs_.begin();
+ }
+
+ std::vector<SequenceNumber>::const_iterator seq_end() const {
+ return tombstone_seqs_.end();
+ }
+
+ bool empty() const { return tombstones_.empty(); }
+
+  // Returns true if the stored tombstones contain one with a sequence
+  // number in [lower, upper].
+ bool ContainsRange(SequenceNumber lower, SequenceNumber upper) const;
+
+ private:
+ // Given an ordered range tombstone iterator unfragmented_tombstones,
+ // "fragment" the tombstones into non-overlapping pieces, and store them in
+ // tombstones_ and tombstone_seqs_.
+ void FragmentTombstones(
+ std::unique_ptr<InternalIterator> unfragmented_tombstones,
+ const InternalKeyComparator& icmp, bool for_compaction,
+ const std::vector<SequenceNumber>& snapshots);
+
+ std::vector<RangeTombstoneStack> tombstones_;
+ std::vector<SequenceNumber> tombstone_seqs_;
+ std::set<SequenceNumber> seq_set_;
+ std::list<std::string> pinned_slices_;
+ PinnedIteratorsManager pinned_iters_mgr_;
+};
+
+// FragmentedRangeTombstoneIterator converts an InternalIterator of a range-del
+// meta block into an iterator over non-overlapping tombstone fragments. The
+// tombstone fragmentation process should be more efficient than the range
+// tombstone collapsing algorithm in RangeDelAggregator because this leverages
+// the internal key ordering already provided by the input iterator, if
+// applicable (when the iterator is unsorted, a new sorted iterator is created
+// before proceeding). If there are few overlaps, creating a
+// FragmentedRangeTombstoneIterator should be O(n), while the RangeDelAggregator
+// tombstone collapsing is always O(n log n).
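+//
+// Example usage (a sketch following range_tombstone_fragmenter_test.cc; the
+// iterator, comparator, and key names are placeholders):
+//
+//   FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+//                                              icmp);
+//   FragmentedRangeTombstoneIterator iter(&fragment_list, icmp,
+//                                         snapshot /* upper_bound */);
+//   SequenceNumber seq = iter.MaxCoveringTombstoneSeqnum(user_key);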
+class FragmentedRangeTombstoneIterator : public InternalIterator {
+ public:
+ FragmentedRangeTombstoneIterator(
+ const FragmentedRangeTombstoneList* tombstones,
+ const InternalKeyComparator& icmp, SequenceNumber upper_bound,
+ SequenceNumber lower_bound = 0);
+ FragmentedRangeTombstoneIterator(
+ const std::shared_ptr<const FragmentedRangeTombstoneList>& tombstones,
+ const InternalKeyComparator& icmp, SequenceNumber upper_bound,
+ SequenceNumber lower_bound = 0);
+
+ void SeekToFirst() override;
+ void SeekToLast() override;
+
+ void SeekToTopFirst();
+ void SeekToTopLast();
+
+ // NOTE: Seek and SeekForPrev do not behave in the way InternalIterator
+ // seeking should behave. This is OK because they are not currently used, but
+ // eventually FragmentedRangeTombstoneIterator should no longer implement
+ // InternalIterator.
+ //
+ // Seeks to the range tombstone that covers target at a seqnum in the
+ // snapshot. If no such tombstone exists, seek to the earliest tombstone in
+ // the snapshot that ends after target.
+ void Seek(const Slice& target) override;
+ // Seeks to the range tombstone that covers target at a seqnum in the
+ // snapshot. If no such tombstone exists, seek to the latest tombstone in the
+ // snapshot that starts before target.
+ void SeekForPrev(const Slice& target) override;
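+  //
+  // For example, in the SeekCovered test in range_tombstone_fragmenter_test.cc,
+  // Seek("f") and SeekForPrev("f") with upper_bound == kMaxSequenceNumber both
+  // position the iterator at the fragment [e, g) @ 8, the newest visible
+  // tombstone covering "f".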
+
+ void Next() override;
+ void Prev() override;
+
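+  // Unlike Next()/Prev(), which step through the individual seqnums within a
+  // fragment, TopNext()/TopPrev() move to the next/previous fragment that has
+  // a visible seqnum and position at its newest seqnum not exceeding
+  // upper_bound.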
+ void TopNext();
+ void TopPrev();
+
+ bool Valid() const override;
+ Slice key() const override {
+ MaybePinKey();
+ return current_start_key_.Encode();
+ }
+ Slice value() const override { return pos_->end_key; }
+ bool IsKeyPinned() const override { return false; }
+ bool IsValuePinned() const override { return true; }
+ Status status() const override { return Status::OK(); }
+
+ bool empty() const { return tombstones_->empty(); }
+ void Invalidate() {
+ pos_ = tombstones_->end();
+ seq_pos_ = tombstones_->seq_end();
+ pinned_pos_ = tombstones_->end();
+ pinned_seq_pos_ = tombstones_->seq_end();
+ }
+
+ RangeTombstone Tombstone() const {
+ return RangeTombstone(start_key(), end_key(), seq());
+ }
+ Slice start_key() const { return pos_->start_key; }
+ Slice end_key() const { return pos_->end_key; }
+ SequenceNumber seq() const { return *seq_pos_; }
+ ParsedInternalKey parsed_start_key() const {
+ return ParsedInternalKey(pos_->start_key, kMaxSequenceNumber,
+ kTypeRangeDeletion);
+ }
+ ParsedInternalKey parsed_end_key() const {
+ return ParsedInternalKey(pos_->end_key, kMaxSequenceNumber,
+ kTypeRangeDeletion);
+ }
+
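+  // Returns the seqnum of the newest range tombstone covering user_key whose
+  // seqnum does not exceed this iterator's upper bound, or 0 if no such
+  // tombstone covers user_key.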
+ SequenceNumber MaxCoveringTombstoneSeqnum(const Slice& user_key);
+
+ // Splits the iterator into n+1 iterators (where n is the number of
+ // snapshots), each providing a view over a "stripe" of sequence numbers. The
+ // iterators are keyed by the upper bound of their ranges (the provided
+ // snapshots + kMaxSequenceNumber).
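+  // For example, splitting with snapshots {3, 5, 7, 9} produces iterators
+  // keyed by 3, 5, 7, 9, and kMaxSequenceNumber for the seqnum stripes
+  // [0, 3], [4, 5], [6, 7], [8, 9], and [10, kMaxSequenceNumber]; stripes
+  // that contain no tombstones are omitted.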
+ //
+ // NOTE: the iterators in the returned map are no longer valid if their
+ // parent iterator is deleted, since they do not modify the refcount of the
+ // underlying tombstone list. Therefore, this map should be deleted before
+ // the parent iterator.
+ std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ SplitBySnapshot(const std::vector<SequenceNumber>& snapshots);
+
+ SequenceNumber upper_bound() const { return upper_bound_; }
+ SequenceNumber lower_bound() const { return lower_bound_; }
+
+ private:
+ using RangeTombstoneStack = FragmentedRangeTombstoneList::RangeTombstoneStack;
+
+ struct RangeTombstoneStackStartComparator {
+ explicit RangeTombstoneStackStartComparator(const Comparator* c) : cmp(c) {}
+
+ bool operator()(const RangeTombstoneStack& a,
+ const RangeTombstoneStack& b) const {
+ return cmp->Compare(a.start_key, b.start_key) < 0;
+ }
+
+ bool operator()(const RangeTombstoneStack& a, const Slice& b) const {
+ return cmp->Compare(a.start_key, b) < 0;
+ }
+
+ bool operator()(const Slice& a, const RangeTombstoneStack& b) const {
+ return cmp->Compare(a, b.start_key) < 0;
+ }
+
+ const Comparator* cmp;
+ };
+
+ struct RangeTombstoneStackEndComparator {
+ explicit RangeTombstoneStackEndComparator(const Comparator* c) : cmp(c) {}
+
+ bool operator()(const RangeTombstoneStack& a,
+ const RangeTombstoneStack& b) const {
+ return cmp->Compare(a.end_key, b.end_key) < 0;
+ }
+
+ bool operator()(const RangeTombstoneStack& a, const Slice& b) const {
+ return cmp->Compare(a.end_key, b) < 0;
+ }
+
+ bool operator()(const Slice& a, const RangeTombstoneStack& b) const {
+ return cmp->Compare(a, b.end_key) < 0;
+ }
+
+ const Comparator* cmp;
+ };
+
+ void MaybePinKey() const {
+ if (pos_ != tombstones_->end() && seq_pos_ != tombstones_->seq_end() &&
+ (pinned_pos_ != pos_ || pinned_seq_pos_ != seq_pos_)) {
+ current_start_key_.Set(pos_->start_key, *seq_pos_, kTypeRangeDeletion);
+ pinned_pos_ = pos_;
+ pinned_seq_pos_ = seq_pos_;
+ }
+ }
+
+ void SeekToCoveringTombstone(const Slice& key);
+ void SeekForPrevToCoveringTombstone(const Slice& key);
+ void ScanForwardToVisibleTombstone();
+ void ScanBackwardToVisibleTombstone();
+ bool ValidPos() const {
+ return Valid() && seq_pos_ != tombstones_->seq_iter(pos_->seq_end_idx);
+ }
+
+ const RangeTombstoneStackStartComparator tombstone_start_cmp_;
+ const RangeTombstoneStackEndComparator tombstone_end_cmp_;
+ const InternalKeyComparator* icmp_;
+ const Comparator* ucmp_;
+ std::shared_ptr<const FragmentedRangeTombstoneList> tombstones_ref_;
+ const FragmentedRangeTombstoneList* tombstones_;
+ SequenceNumber upper_bound_;
+ SequenceNumber lower_bound_;
+ std::vector<RangeTombstoneStack>::const_iterator pos_;
+ std::vector<SequenceNumber>::const_iterator seq_pos_;
+ mutable std::vector<RangeTombstoneStack>::const_iterator pinned_pos_;
+ mutable std::vector<SequenceNumber>::const_iterator pinned_seq_pos_;
+ mutable InternalKey current_start_key_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/range_tombstone_fragmenter_test.cc b/src/rocksdb/db/range_tombstone_fragmenter_test.cc
new file mode 100644
index 000000000..56234b1dd
--- /dev/null
+++ b/src/rocksdb/db/range_tombstone_fragmenter_test.cc
@@ -0,0 +1,552 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/range_tombstone_fragmenter.h"
+
+#include "db/db_test_util.h"
+#include "rocksdb/comparator.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class RangeTombstoneFragmenterTest : public testing::Test {};
+
+namespace {
+
+static auto bytewise_icmp = InternalKeyComparator(BytewiseComparator());
+
+std::unique_ptr<InternalIterator> MakeRangeDelIter(
+ const std::vector<RangeTombstone>& range_dels) {
+ std::vector<std::string> keys, values;
+ for (const auto& range_del : range_dels) {
+ auto key_and_value = range_del.Serialize();
+ keys.push_back(key_and_value.first.Encode().ToString());
+ values.push_back(key_and_value.second.ToString());
+ }
+ return std::unique_ptr<test::VectorIterator>(
+ new test::VectorIterator(keys, values));
+}
+
+void CheckIterPosition(const RangeTombstone& tombstone,
+ const FragmentedRangeTombstoneIterator* iter) {
+ // Test InternalIterator interface.
+ EXPECT_EQ(tombstone.start_key_, ExtractUserKey(iter->key()));
+ EXPECT_EQ(tombstone.end_key_, iter->value());
+ EXPECT_EQ(tombstone.seq_, iter->seq());
+
+ // Test FragmentedRangeTombstoneIterator interface.
+ EXPECT_EQ(tombstone.start_key_, iter->start_key());
+ EXPECT_EQ(tombstone.end_key_, iter->end_key());
+ EXPECT_EQ(tombstone.seq_, GetInternalKeySeqno(iter->key()));
+}
+
+void VerifyFragmentedRangeDels(
+ FragmentedRangeTombstoneIterator* iter,
+ const std::vector<RangeTombstone>& expected_tombstones) {
+ iter->SeekToFirst();
+ for (size_t i = 0; i < expected_tombstones.size(); i++, iter->Next()) {
+ ASSERT_TRUE(iter->Valid());
+ CheckIterPosition(expected_tombstones[i], iter);
+ }
+ EXPECT_FALSE(iter->Valid());
+}
+
+void VerifyVisibleTombstones(
+ FragmentedRangeTombstoneIterator* iter,
+ const std::vector<RangeTombstone>& expected_tombstones) {
+ iter->SeekToTopFirst();
+ for (size_t i = 0; i < expected_tombstones.size(); i++, iter->TopNext()) {
+ ASSERT_TRUE(iter->Valid());
+ CheckIterPosition(expected_tombstones[i], iter);
+ }
+ EXPECT_FALSE(iter->Valid());
+}
+
+struct SeekTestCase {
+ Slice seek_target;
+ RangeTombstone expected_position;
+ bool out_of_range;
+};
+
+void VerifySeek(FragmentedRangeTombstoneIterator* iter,
+ const std::vector<SeekTestCase>& cases) {
+ for (const auto& testcase : cases) {
+ iter->Seek(testcase.seek_target);
+ if (testcase.out_of_range) {
+ ASSERT_FALSE(iter->Valid());
+ } else {
+ ASSERT_TRUE(iter->Valid());
+ CheckIterPosition(testcase.expected_position, iter);
+ }
+ }
+}
+
+void VerifySeekForPrev(FragmentedRangeTombstoneIterator* iter,
+ const std::vector<SeekTestCase>& cases) {
+ for (const auto& testcase : cases) {
+ iter->SeekForPrev(testcase.seek_target);
+ if (testcase.out_of_range) {
+ ASSERT_FALSE(iter->Valid());
+ } else {
+ ASSERT_TRUE(iter->Valid());
+ CheckIterPosition(testcase.expected_position, iter);
+ }
+ }
+}
+
+struct MaxCoveringTombstoneSeqnumTestCase {
+ Slice user_key;
+ SequenceNumber result;
+};
+
+void VerifyMaxCoveringTombstoneSeqnum(
+ FragmentedRangeTombstoneIterator* iter,
+ const std::vector<MaxCoveringTombstoneSeqnumTestCase>& cases) {
+ for (const auto& testcase : cases) {
+ EXPECT_EQ(testcase.result,
+ iter->MaxCoveringTombstoneSeqnum(testcase.user_key));
+ }
+}
+
+} // anonymous namespace
+
+TEST_F(RangeTombstoneFragmenterTest, NonOverlappingTombstones) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "b", 10}, {"c", "d", 5}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ ASSERT_EQ(0, iter.lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+ VerifyFragmentedRangeDels(&iter, {{"a", "b", 10}, {"c", "d", 5}});
+ VerifyMaxCoveringTombstoneSeqnum(&iter,
+ {{"", 0}, {"a", 10}, {"b", 0}, {"c", 5}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, OverlappingTombstones) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, {"c", "g", 15}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ ASSERT_EQ(0, iter.lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+ VerifyFragmentedRangeDels(
+ &iter, {{"a", "c", 10}, {"c", "e", 15}, {"c", "e", 10}, {"e", "g", 15}});
+ VerifyMaxCoveringTombstoneSeqnum(&iter,
+ {{"a", 10}, {"c", 15}, {"e", 15}, {"g", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, ContiguousTombstones) {
+ auto range_del_iter = MakeRangeDelIter(
+ {{"a", "c", 10}, {"c", "e", 20}, {"c", "e", 5}, {"e", "g", 15}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ ASSERT_EQ(0, iter.lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+ VerifyFragmentedRangeDels(
+ &iter, {{"a", "c", 10}, {"c", "e", 20}, {"c", "e", 5}, {"e", "g", 15}});
+ VerifyMaxCoveringTombstoneSeqnum(&iter,
+ {{"a", 10}, {"c", 20}, {"e", 15}, {"g", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, RepeatedStartAndEndKey) {
+ auto range_del_iter =
+ MakeRangeDelIter({{"a", "c", 10}, {"a", "c", 7}, {"a", "c", 3}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ ASSERT_EQ(0, iter.lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+ VerifyFragmentedRangeDels(&iter,
+ {{"a", "c", 10}, {"a", "c", 7}, {"a", "c", 3}});
+ VerifyMaxCoveringTombstoneSeqnum(&iter, {{"a", 10}, {"b", 10}, {"c", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, RepeatedStartKeyDifferentEndKeys) {
+ auto range_del_iter =
+ MakeRangeDelIter({{"a", "e", 10}, {"a", "g", 7}, {"a", "c", 3}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ ASSERT_EQ(0, iter.lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+ VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
+ {"a", "c", 7},
+ {"a", "c", 3},
+ {"c", "e", 10},
+ {"c", "e", 7},
+ {"e", "g", 7}});
+ VerifyMaxCoveringTombstoneSeqnum(&iter,
+ {{"a", 10}, {"c", 10}, {"e", 7}, {"g", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, RepeatedStartKeyMixedEndKeys) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "c", 30},
+ {"a", "g", 20},
+ {"a", "e", 10},
+ {"a", "g", 7},
+ {"a", "c", 3}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ ASSERT_EQ(0, iter.lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+ VerifyFragmentedRangeDels(&iter, {{"a", "c", 30},
+ {"a", "c", 20},
+ {"a", "c", 10},
+ {"a", "c", 7},
+ {"a", "c", 3},
+ {"c", "e", 20},
+ {"c", "e", 10},
+ {"c", "e", 7},
+ {"e", "g", 20},
+ {"e", "g", 7}});
+ VerifyMaxCoveringTombstoneSeqnum(&iter,
+ {{"a", 30}, {"c", 20}, {"e", 20}, {"g", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKey) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"c", "g", 8},
+ {"c", "i", 6},
+ {"j", "n", 4},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp,
+ 9 /* upper_bound */);
+ FragmentedRangeTombstoneIterator iter3(&fragment_list, bytewise_icmp,
+ 7 /* upper_bound */);
+ FragmentedRangeTombstoneIterator iter4(&fragment_list, bytewise_icmp,
+ 5 /* upper_bound */);
+ FragmentedRangeTombstoneIterator iter5(&fragment_list, bytewise_icmp,
+ 3 /* upper_bound */);
+ for (auto* iter : {&iter1, &iter2, &iter3, &iter4, &iter5}) {
+ VerifyFragmentedRangeDels(iter, {{"a", "c", 10},
+ {"c", "e", 10},
+ {"c", "e", 8},
+ {"c", "e", 6},
+ {"e", "g", 8},
+ {"e", "g", 6},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"j", "l", 2},
+ {"l", "n", 4}});
+ }
+
+ ASSERT_EQ(0, iter1.lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, iter1.upper_bound());
+ VerifyVisibleTombstones(&iter1, {{"a", "c", 10},
+ {"c", "e", 10},
+ {"e", "g", 8},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"l", "n", 4}});
+ VerifyMaxCoveringTombstoneSeqnum(
+ &iter1, {{"a", 10}, {"c", 10}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}});
+
+ ASSERT_EQ(0, iter2.lower_bound());
+ ASSERT_EQ(9, iter2.upper_bound());
+ VerifyVisibleTombstones(&iter2, {{"c", "e", 8},
+ {"e", "g", 8},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"l", "n", 4}});
+ VerifyMaxCoveringTombstoneSeqnum(
+ &iter2, {{"a", 0}, {"c", 8}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}});
+
+ ASSERT_EQ(0, iter3.lower_bound());
+ ASSERT_EQ(7, iter3.upper_bound());
+ VerifyVisibleTombstones(&iter3, {{"c", "e", 6},
+ {"e", "g", 6},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"l", "n", 4}});
+ VerifyMaxCoveringTombstoneSeqnum(
+ &iter3, {{"a", 0}, {"c", 6}, {"e", 6}, {"i", 0}, {"j", 4}, {"m", 4}});
+
+ ASSERT_EQ(0, iter4.lower_bound());
+ ASSERT_EQ(5, iter4.upper_bound());
+ VerifyVisibleTombstones(&iter4, {{"j", "l", 4}, {"l", "n", 4}});
+ VerifyMaxCoveringTombstoneSeqnum(
+ &iter4, {{"a", 0}, {"c", 0}, {"e", 0}, {"i", 0}, {"j", 4}, {"m", 4}});
+
+ ASSERT_EQ(0, iter5.lower_bound());
+ ASSERT_EQ(3, iter5.upper_bound());
+ VerifyVisibleTombstones(&iter5, {{"j", "l", 2}});
+ VerifyMaxCoveringTombstoneSeqnum(
+ &iter5, {{"a", 0}, {"c", 0}, {"e", 0}, {"i", 0}, {"j", 2}, {"m", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyUnordered) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"j", "n", 4},
+ {"c", "i", 6},
+ {"c", "g", 8},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ 9 /* upper_bound */);
+ ASSERT_EQ(0, iter.lower_bound());
+ ASSERT_EQ(9, iter.upper_bound());
+ VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
+ {"c", "e", 10},
+ {"c", "e", 8},
+ {"c", "e", 6},
+ {"e", "g", 8},
+ {"e", "g", 6},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"j", "l", 2},
+ {"l", "n", 4}});
+ VerifyMaxCoveringTombstoneSeqnum(
+ &iter, {{"a", 0}, {"c", 8}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyForCompaction) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"j", "n", 4},
+ {"c", "i", 6},
+ {"c", "g", 8},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(
+ std::move(range_del_iter), bytewise_icmp, true /* for_compaction */,
+ {} /* snapshots */);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber /* upper_bound */);
+ VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
+ {"c", "e", 10},
+ {"e", "g", 8},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"l", "n", 4}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest,
+ OverlapAndRepeatedStartKeyForCompactionWithSnapshot) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"j", "n", 4},
+ {"c", "i", 6},
+ {"c", "g", 8},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(
+ std::move(range_del_iter), bytewise_icmp, true /* for_compaction */,
+      {20, 9} /* snapshots */);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber /* upper_bound */);
+ VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
+ {"c", "e", 10},
+ {"c", "e", 8},
+ {"e", "g", 8},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"l", "n", 4}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, IteratorSplitNoSnapshots) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"j", "n", 4},
+ {"c", "i", 6},
+ {"c", "g", 8},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber /* upper_bound */);
+
+ auto split_iters = iter.SplitBySnapshot({} /* snapshots */);
+ ASSERT_EQ(1, split_iters.size());
+
+ auto* split_iter = split_iters[kMaxSequenceNumber].get();
+ ASSERT_EQ(0, split_iter->lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, split_iter->upper_bound());
+ VerifyVisibleTombstones(split_iter, {{"a", "c", 10},
+ {"c", "e", 10},
+ {"e", "g", 8},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"l", "n", 4}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, IteratorSplitWithSnapshots) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"j", "n", 4},
+ {"c", "i", 6},
+ {"c", "g", 8},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber /* upper_bound */);
+
+ auto split_iters = iter.SplitBySnapshot({3, 5, 7, 9} /* snapshots */);
+ ASSERT_EQ(5, split_iters.size());
+
+ auto* split_iter1 = split_iters[3].get();
+ ASSERT_EQ(0, split_iter1->lower_bound());
+ ASSERT_EQ(3, split_iter1->upper_bound());
+ VerifyVisibleTombstones(split_iter1, {{"j", "l", 2}});
+
+ auto* split_iter2 = split_iters[5].get();
+ ASSERT_EQ(4, split_iter2->lower_bound());
+ ASSERT_EQ(5, split_iter2->upper_bound());
+ VerifyVisibleTombstones(split_iter2, {{"j", "l", 4}, {"l", "n", 4}});
+
+ auto* split_iter3 = split_iters[7].get();
+ ASSERT_EQ(6, split_iter3->lower_bound());
+ ASSERT_EQ(7, split_iter3->upper_bound());
+ VerifyVisibleTombstones(split_iter3,
+ {{"c", "e", 6}, {"e", "g", 6}, {"g", "i", 6}});
+
+ auto* split_iter4 = split_iters[9].get();
+ ASSERT_EQ(8, split_iter4->lower_bound());
+ ASSERT_EQ(9, split_iter4->upper_bound());
+ VerifyVisibleTombstones(split_iter4, {{"c", "e", 8}, {"e", "g", 8}});
+
+ auto* split_iter5 = split_iters[kMaxSequenceNumber].get();
+ ASSERT_EQ(10, split_iter5->lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, split_iter5->upper_bound());
+ VerifyVisibleTombstones(split_iter5, {{"a", "c", 10}, {"c", "e", 10}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, SeekStartKey) {
+ // Same tombstones as OverlapAndRepeatedStartKey.
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"c", "g", 8},
+ {"c", "i", 6},
+ {"j", "n", 4},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+
+ FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ VerifySeek(
+ &iter1,
+ {{"a", {"a", "c", 10}}, {"e", {"e", "g", 8}}, {"l", {"l", "n", 4}}});
+ VerifySeekForPrev(
+ &iter1,
+ {{"a", {"a", "c", 10}}, {"e", {"e", "g", 8}}, {"l", {"l", "n", 4}}});
+
+ FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp,
+ 3 /* upper_bound */);
+ VerifySeek(&iter2, {{"a", {"j", "l", 2}},
+ {"e", {"j", "l", 2}},
+ {"l", {}, true /* out of range */}});
+ VerifySeekForPrev(&iter2, {{"a", {}, true /* out of range */},
+ {"e", {}, true /* out of range */},
+ {"l", {"j", "l", 2}}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, SeekCovered) {
+ // Same tombstones as OverlapAndRepeatedStartKey.
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"c", "g", 8},
+ {"c", "i", 6},
+ {"j", "n", 4},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+
+ FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ VerifySeek(
+ &iter1,
+ {{"b", {"a", "c", 10}}, {"f", {"e", "g", 8}}, {"m", {"l", "n", 4}}});
+ VerifySeekForPrev(
+ &iter1,
+ {{"b", {"a", "c", 10}}, {"f", {"e", "g", 8}}, {"m", {"l", "n", 4}}});
+
+ FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp,
+ 3 /* upper_bound */);
+ VerifySeek(&iter2, {{"b", {"j", "l", 2}},
+ {"f", {"j", "l", 2}},
+ {"m", {}, true /* out of range */}});
+ VerifySeekForPrev(&iter2, {{"b", {}, true /* out of range */},
+ {"f", {}, true /* out of range */},
+ {"m", {"j", "l", 2}}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, SeekEndKey) {
+ // Same tombstones as OverlapAndRepeatedStartKey.
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"c", "g", 8},
+ {"c", "i", 6},
+ {"j", "n", 4},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+
+ FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ VerifySeek(&iter1, {{"c", {"c", "e", 10}},
+ {"g", {"g", "i", 6}},
+ {"i", {"j", "l", 4}},
+ {"n", {}, true /* out of range */}});
+ VerifySeekForPrev(&iter1, {{"c", {"c", "e", 10}},
+ {"g", {"g", "i", 6}},
+ {"i", {"g", "i", 6}},
+ {"n", {"l", "n", 4}}});
+
+ FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp,
+ 3 /* upper_bound */);
+ VerifySeek(&iter2, {{"c", {"j", "l", 2}},
+ {"g", {"j", "l", 2}},
+ {"i", {"j", "l", 2}},
+ {"n", {}, true /* out of range */}});
+ VerifySeekForPrev(&iter2, {{"c", {}, true /* out of range */},
+ {"g", {}, true /* out of range */},
+ {"i", {}, true /* out of range */},
+ {"n", {"j", "l", 2}}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, SeekOutOfBounds) {
+ // Same tombstones as OverlapAndRepeatedStartKey.
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"c", "g", 8},
+ {"c", "i", 6},
+ {"j", "n", 4},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ VerifySeek(&iter, {{"", {"a", "c", 10}}, {"z", {}, true /* out of range */}});
+ VerifySeekForPrev(&iter,
+ {{"", {}, true /* out of range */}, {"z", {"l", "n", 4}}});
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/read_callback.h b/src/rocksdb/db/read_callback.h
new file mode 100644
index 000000000..fbef1dd0d
--- /dev/null
+++ b/src/rocksdb/db/read_callback.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ReadCallback {
+ public:
+ ReadCallback(SequenceNumber last_visible_seq)
+ : max_visible_seq_(last_visible_seq) {}
+ ReadCallback(SequenceNumber last_visible_seq, SequenceNumber min_uncommitted)
+ : max_visible_seq_(last_visible_seq), min_uncommitted_(min_uncommitted) {}
+
+ virtual ~ReadCallback() {}
+
+  // Will be called to see if the seq number is visible; if not, the caller
+  // moves on to the next seq number.
+ virtual bool IsVisibleFullCheck(SequenceNumber seq) = 0;
+
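+  // For example, with max_visible_seq_ == 10 and min_uncommitted_ == 5, a seq
+  // of 3 is visible without a full check, 12 is not visible, and 7 falls
+  // through to IsVisibleFullCheck().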
+ inline bool IsVisible(SequenceNumber seq) {
+ assert(min_uncommitted_ > 0);
+ assert(min_uncommitted_ >= kMinUnCommittedSeq);
+ if (seq < min_uncommitted_) { // handles seq == 0 as well
+ assert(seq <= max_visible_seq_);
+ return true;
+ } else if (max_visible_seq_ < seq) {
+ assert(seq != 0);
+ return false;
+ } else {
+ assert(seq != 0); // already handled in the first if-then clause
+ return IsVisibleFullCheck(seq);
+ }
+ }
+
+ inline SequenceNumber max_visible_seq() { return max_visible_seq_; }
+
+ // Refresh to a more recent visible seq
+ virtual void Refresh(SequenceNumber seq) { max_visible_seq_ = seq; }
+
+ protected:
+  // The max visible seq; it is usually the snapshot, but it could be larger
+  // if the transaction has its own writes written to the DB.
+ SequenceNumber max_visible_seq_ = kMaxSequenceNumber;
+ // Any seq less than min_uncommitted_ is committed.
+ const SequenceNumber min_uncommitted_ = kMinUnCommittedSeq;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/repair.cc b/src/rocksdb/db/repair.cc
new file mode 100644
index 000000000..383ffe3a4
--- /dev/null
+++ b/src/rocksdb/db/repair.cc
@@ -0,0 +1,691 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// The Repairer makes a best-effort attempt to recover as much data as possible
+// after a disaster without compromising consistency. It does not guarantee
+// bringing the database to a time-consistent state.
+//
+// Repair process is broken into 4 phases:
+// (a) Find files
+// (b) Convert logs to tables
+// (c) Extract metadata
+// (d) Write Descriptor
+//
+// (a) Find files
+//
+// The repairer goes through all the files in the directory, and classifies them
+// based on their file name. Any file that cannot be identified by name will be
+// ignored.
+//
+// (b) Convert logs to tables
+//
+// Every log file that is active is replayed. All sections of the file where the
+// checksum does not match are skipped over. We intentionally give preference to
+// data consistency.
+//
+// (c) Extract metadata
+//
+// We scan every table to compute
+// (1) smallest/largest for the table
+// (2) largest sequence number in the table
+// (3) oldest blob file referred to by the table (if applicable)
+//
+// If we are unable to scan the file, then we ignore the table.
+//
+// (d) Write Descriptor
+//
+// We generate descriptor contents:
+// - log number is set to zero
+// - next-file-number is set to 1 + largest file number we found
+// - last-sequence-number is set to largest sequence# found across
+// all tables (see 2c)
+// - compaction pointers are cleared
+// - every table file is added at level 0
+//
+// Possible optimization 1:
+// (a) Compute total size and use to pick appropriate max-level M
+// (b) Sort tables by largest sequence# in the table
+// (c) For each table: if it overlaps earlier table, place in level-0,
+// else place in level-M.
+// (d) We can provide options for time consistent recovery and unsafe recovery
+// (ignore checksum failure when applicable)
+// Possible optimization 2:
+// Store per-table metadata (smallest, largest, largest-seq#, ...)
+// in the table's meta section to speed up ScanTable.
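+//
+// For illustration only, a caller would typically invoke the repair entry
+// points defined at the bottom of this file roughly as follows (the path is
+// hypothetical):
+//
+//   Options options;
+//   Status s = RepairDB("/path/to/db", options);
+//   if (s.ok()) {
+//     // Reopen the DB normally; some data may have been lost.
+//   }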
+
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+#include "db/builder.h"
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "db/write_batch_internal.h"
+#include "env/composite_env_wrapper.h"
+#include "file/filename.h"
+#include "file/writable_file_writer.h"
+#include "options/cf_options.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/scoped_arena_iterator.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+class Repairer {
+ public:
+ Repairer(const std::string& dbname, const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ const ColumnFamilyOptions& default_cf_opts,
+ const ColumnFamilyOptions& unknown_cf_opts, bool create_unknown_cfs)
+ : dbname_(dbname),
+ env_(db_options.env),
+ env_options_(),
+ db_options_(SanitizeOptions(dbname_, db_options)),
+ immutable_db_options_(ImmutableDBOptions(db_options_)),
+ icmp_(default_cf_opts.comparator),
+ default_cf_opts_(
+ SanitizeOptions(immutable_db_options_, default_cf_opts)),
+ default_cf_iopts_(
+ ImmutableCFOptions(immutable_db_options_, default_cf_opts_)),
+ unknown_cf_opts_(
+ SanitizeOptions(immutable_db_options_, unknown_cf_opts)),
+ create_unknown_cfs_(create_unknown_cfs),
+ raw_table_cache_(
+ // TableCache can be small since we expect each table to be opened
+ // once.
+ NewLRUCache(10, db_options_.table_cache_numshardbits)),
+ table_cache_(new TableCache(default_cf_iopts_, env_options_,
+ raw_table_cache_.get(),
+ /*block_cache_tracer=*/nullptr)),
+ wb_(db_options_.db_write_buffer_size),
+ wc_(db_options_.delayed_write_rate),
+ vset_(dbname_, &immutable_db_options_, env_options_,
+ raw_table_cache_.get(), &wb_, &wc_,
+ /*block_cache_tracer=*/nullptr),
+ next_file_number_(1),
+ db_lock_(nullptr) {
+ for (const auto& cfd : column_families) {
+ cf_name_to_opts_[cfd.name] = cfd.options;
+ }
+ }
+
+ const ColumnFamilyOptions* GetColumnFamilyOptions(
+ const std::string& cf_name) {
+ if (cf_name_to_opts_.find(cf_name) == cf_name_to_opts_.end()) {
+ if (create_unknown_cfs_) {
+ return &unknown_cf_opts_;
+ }
+ return nullptr;
+ }
+ return &cf_name_to_opts_[cf_name];
+ }
+
+ // Adds a column family to the VersionSet with cf_options_ and updates
+ // manifest.
+ Status AddColumnFamily(const std::string& cf_name, uint32_t cf_id) {
+ const auto* cf_opts = GetColumnFamilyOptions(cf_name);
+ if (cf_opts == nullptr) {
+ return Status::Corruption("Encountered unknown column family with name=" +
+ cf_name + ", id=" + ToString(cf_id));
+ }
+ Options opts(db_options_, *cf_opts);
+ MutableCFOptions mut_cf_opts(opts);
+
+ VersionEdit edit;
+ edit.SetComparatorName(opts.comparator->Name());
+ edit.SetLogNumber(0);
+ edit.SetColumnFamily(cf_id);
+ ColumnFamilyData* cfd;
+ cfd = nullptr;
+ edit.AddColumnFamily(cf_name);
+
+ mutex_.Lock();
+ Status status = vset_.LogAndApply(cfd, mut_cf_opts, &edit, &mutex_,
+ nullptr /* db_directory */,
+ false /* new_descriptor_log */, cf_opts);
+ mutex_.Unlock();
+ return status;
+ }
+
+ ~Repairer() {
+ if (db_lock_ != nullptr) {
+ env_->UnlockFile(db_lock_);
+ }
+ delete table_cache_;
+ }
+
+ Status Run() {
+ Status status = env_->LockFile(LockFileName(dbname_), &db_lock_);
+ if (!status.ok()) {
+ return status;
+ }
+ status = FindFiles();
+ if (status.ok()) {
+ // Discard older manifests and start a fresh one
+ for (size_t i = 0; i < manifests_.size(); i++) {
+ ArchiveFile(dbname_ + "/" + manifests_[i]);
+ }
+ // Just create a DBImpl temporarily so we can reuse NewDB()
+ DBImpl* db_impl = new DBImpl(db_options_, dbname_);
+ status = db_impl->NewDB();
+ delete db_impl;
+ }
+
+ if (status.ok()) {
+ // Recover using the fresh manifest created by NewDB()
+ status =
+ vset_.Recover({{kDefaultColumnFamilyName, default_cf_opts_}}, false);
+ }
+ if (status.ok()) {
+ // Need to scan existing SST files first so the column families are
+ // created before we process WAL files
+ ExtractMetaData();
+
+ // ExtractMetaData() uses table_fds_ to know which SST files' metadata to
+ // extract -- we need to clear it here since metadata for existing SST
+ // files has been extracted already
+ table_fds_.clear();
+ ConvertLogFilesToTables();
+ ExtractMetaData();
+ status = AddTables();
+ }
+ if (status.ok()) {
+ uint64_t bytes = 0;
+ for (size_t i = 0; i < tables_.size(); i++) {
+ bytes += tables_[i].meta.fd.GetFileSize();
+ }
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "**** Repaired rocksdb %s; "
+ "recovered %" ROCKSDB_PRIszt " files; %" PRIu64
+ " bytes. "
+ "Some data may have been lost. "
+ "****",
+ dbname_.c_str(), tables_.size(), bytes);
+ }
+ return status;
+ }
+
+ private:
+ struct TableInfo {
+ FileMetaData meta;
+ uint32_t column_family_id;
+ std::string column_family_name;
+ };
+
+ std::string const dbname_;
+ Env* const env_;
+ const EnvOptions env_options_;
+ const DBOptions db_options_;
+ const ImmutableDBOptions immutable_db_options_;
+ const InternalKeyComparator icmp_;
+ const ColumnFamilyOptions default_cf_opts_;
+ const ImmutableCFOptions default_cf_iopts_; // table_cache_ holds reference
+ const ColumnFamilyOptions unknown_cf_opts_;
+ const bool create_unknown_cfs_;
+ std::shared_ptr<Cache> raw_table_cache_;
+ TableCache* table_cache_;
+ WriteBufferManager wb_;
+ WriteController wc_;
+ VersionSet vset_;
+ std::unordered_map<std::string, ColumnFamilyOptions> cf_name_to_opts_;
+ InstrumentedMutex mutex_;
+
+ std::vector<std::string> manifests_;
+ std::vector<FileDescriptor> table_fds_;
+ std::vector<uint64_t> logs_;
+ std::vector<TableInfo> tables_;
+ uint64_t next_file_number_;
+ // Lock over the persistent DB state. Non-nullptr iff successfully
+ // acquired.
+ FileLock* db_lock_;
+
+ Status FindFiles() {
+ std::vector<std::string> filenames;
+ bool found_file = false;
+ std::vector<std::string> to_search_paths;
+
+ for (size_t path_id = 0; path_id < db_options_.db_paths.size(); path_id++) {
+ to_search_paths.push_back(db_options_.db_paths[path_id].path);
+ }
+
+    // Also search wal_dir if the user configured a custom wal_dir.
+ bool same = false;
+ Status status = env_->AreFilesSame(db_options_.wal_dir, dbname_, &same);
+ if (status.IsNotSupported()) {
+ same = db_options_.wal_dir == dbname_;
+ status = Status::OK();
+ } else if (!status.ok()) {
+ return status;
+ }
+
+ if (!same) {
+ to_search_paths.push_back(db_options_.wal_dir);
+ }
+
+ for (size_t path_id = 0; path_id < to_search_paths.size(); path_id++) {
+ status = env_->GetChildren(to_search_paths[path_id], &filenames);
+ if (!status.ok()) {
+ return status;
+ }
+ if (!filenames.empty()) {
+ found_file = true;
+ }
+
+ uint64_t number;
+ FileType type;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ if (ParseFileName(filenames[i], &number, &type)) {
+ if (type == kDescriptorFile) {
+ manifests_.push_back(filenames[i]);
+ } else {
+ if (number + 1 > next_file_number_) {
+ next_file_number_ = number + 1;
+ }
+ if (type == kLogFile) {
+ logs_.push_back(number);
+ } else if (type == kTableFile) {
+ table_fds_.emplace_back(number, static_cast<uint32_t>(path_id),
+ 0);
+ } else {
+ // Ignore other files
+ }
+ }
+ }
+ }
+ }
+ if (!found_file) {
+ return Status::Corruption(dbname_, "repair found no files");
+ }
+ return Status::OK();
+ }
+
+ void ConvertLogFilesToTables() {
+ for (size_t i = 0; i < logs_.size(); i++) {
+      // Use LogFileName(wal_dir, logs_[i]) here because the user might have
+      // set a custom wal_dir option.
+ std::string logname = LogFileName(db_options_.wal_dir, logs_[i]);
+ Status status = ConvertLogToTable(logs_[i]);
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Log #%" PRIu64 ": ignoring conversion error: %s",
+ logs_[i], status.ToString().c_str());
+ }
+ ArchiveFile(logname);
+ }
+ }
+
+ Status ConvertLogToTable(uint64_t log) {
+ struct LogReporter : public log::Reader::Reporter {
+ Env* env;
+ std::shared_ptr<Logger> info_log;
+ uint64_t lognum;
+ void Corruption(size_t bytes, const Status& s) override {
+ // We print error messages for corruption, but continue repairing.
+ ROCKS_LOG_ERROR(info_log, "Log #%" PRIu64 ": dropping %d bytes; %s",
+ lognum, static_cast<int>(bytes), s.ToString().c_str());
+ }
+ };
+
+ // Open the log file
+ std::string logname = LogFileName(db_options_.wal_dir, log);
+ std::unique_ptr<SequentialFile> lfile;
+ Status status = env_->NewSequentialFile(
+ logname, &lfile, env_->OptimizeForLogRead(env_options_));
+ if (!status.ok()) {
+ return status;
+ }
+ std::unique_ptr<SequentialFileReader> lfile_reader(new SequentialFileReader(
+ NewLegacySequentialFileWrapper(lfile), logname));
+
+ // Create the log reader.
+ LogReporter reporter;
+ reporter.env = env_;
+ reporter.info_log = db_options_.info_log;
+ reporter.lognum = log;
+ // We intentionally make log::Reader do checksumming so that
+ // corruptions cause entire commits to be skipped instead of
+ // propagating bad information (like overly large sequence
+ // numbers).
+ log::Reader reader(db_options_.info_log, std::move(lfile_reader), &reporter,
+ true /*enable checksum*/, log);
+
+ // Initialize per-column family memtables
+ for (auto* cfd : *vset_.GetColumnFamilySet()) {
+ cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ kMaxSequenceNumber);
+ }
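+    // ColumnFamilyMemTablesImpl lets WriteBatchInternal::InsertInto route each
+    // record of a batch to the memtable of the column family it targets.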
+ auto cf_mems = new ColumnFamilyMemTablesImpl(vset_.GetColumnFamilySet());
+
+ // Read all the records and add to a memtable
+ std::string scratch;
+ Slice record;
+ WriteBatch batch;
+ int counter = 0;
+ while (reader.ReadRecord(&record, &scratch)) {
+ if (record.size() < WriteBatchInternal::kHeader) {
+ reporter.Corruption(
+ record.size(), Status::Corruption("log record too small"));
+ continue;
+ }
+ WriteBatchInternal::SetContents(&batch, record);
+ status =
+ WriteBatchInternal::InsertInto(&batch, cf_mems, nullptr, nullptr);
+ if (status.ok()) {
+ counter += WriteBatchInternal::Count(&batch);
+ } else {
+ ROCKS_LOG_WARN(db_options_.info_log, "Log #%" PRIu64 ": ignoring %s",
+ log, status.ToString().c_str());
+ status = Status::OK(); // Keep going with rest of file
+ }
+ }
+
+ // Dump a table for each column family with entries in this log file.
+ for (auto* cfd : *vset_.GetColumnFamilySet()) {
+ // Do not record a version edit for this conversion to a Table
+ // since ExtractMetaData() will also generate edits.
+ MemTable* mem = cfd->mem();
+ if (mem->IsEmpty()) {
+ continue;
+ }
+
+ FileMetaData meta;
+ meta.fd = FileDescriptor(next_file_number_++, 0, 0);
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ Arena arena;
+ ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
+ int64_t _current_time = 0;
+ status = env_->GetCurrentTime(&_current_time); // ignore error
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+ SnapshotChecker* snapshot_checker = DisableGCSnapshotChecker::Instance();
+
+ auto write_hint = cfd->CalculateSSTWriteHint(0);
+ std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ range_del_iters;
+ auto range_del_iter =
+ mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber);
+ if (range_del_iter != nullptr) {
+ range_del_iters.emplace_back(range_del_iter);
+ }
+
+ LegacyFileSystemWrapper fs(env_);
+ status = BuildTable(
+ dbname_, env_, &fs, *cfd->ioptions(),
+ *cfd->GetLatestMutableCFOptions(), env_options_, table_cache_,
+ iter.get(), std::move(range_del_iters), &meta,
+ cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(),
+ cfd->GetID(), cfd->GetName(), {}, kMaxSequenceNumber,
+ snapshot_checker, kNoCompression, 0 /* sample_for_compression */,
+ CompressionOptions(), false, nullptr /* internal_stats */,
+ TableFileCreationReason::kRecovery, nullptr /* event_logger */,
+ 0 /* job_id */, Env::IO_HIGH, nullptr /* table_properties */,
+ -1 /* level */, current_time, write_hint);
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s",
+ log, counter, meta.fd.GetNumber(),
+ status.ToString().c_str());
+ if (status.ok()) {
+ if (meta.fd.GetFileSize() > 0) {
+ table_fds_.push_back(meta.fd);
+ }
+ } else {
+ break;
+ }
+ }
+ delete cf_mems;
+ return status;
+ }
+
+ void ExtractMetaData() {
+ for (size_t i = 0; i < table_fds_.size(); i++) {
+ TableInfo t;
+ t.meta.fd = table_fds_[i];
+ Status status = ScanTable(&t);
+ if (!status.ok()) {
+ std::string fname = TableFileName(
+ db_options_.db_paths, t.meta.fd.GetNumber(), t.meta.fd.GetPathId());
+ char file_num_buf[kFormatFileNumberBufSize];
+ FormatFileNumber(t.meta.fd.GetNumber(), t.meta.fd.GetPathId(),
+ file_num_buf, sizeof(file_num_buf));
+ ROCKS_LOG_WARN(db_options_.info_log, "Table #%s: ignoring %s",
+ file_num_buf, status.ToString().c_str());
+ ArchiveFile(fname);
+ } else {
+ tables_.push_back(t);
+ }
+ }
+ }
+
+ Status ScanTable(TableInfo* t) {
+ std::string fname = TableFileName(
+ db_options_.db_paths, t->meta.fd.GetNumber(), t->meta.fd.GetPathId());
+ int counter = 0;
+ uint64_t file_size;
+ Status status = env_->GetFileSize(fname, &file_size);
+ t->meta.fd = FileDescriptor(t->meta.fd.GetNumber(), t->meta.fd.GetPathId(),
+ file_size);
+ std::shared_ptr<const TableProperties> props;
+ if (status.ok()) {
+ status = table_cache_->GetTableProperties(env_options_, icmp_, t->meta.fd,
+ &props);
+ }
+ if (status.ok()) {
+ t->column_family_id = static_cast<uint32_t>(props->column_family_id);
+ if (t->column_family_id ==
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) {
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "Table #%" PRIu64
+ ": column family unknown (probably due to legacy format); "
+ "adding to default column family id 0.",
+ t->meta.fd.GetNumber());
+ t->column_family_id = 0;
+ }
+
+ if (vset_.GetColumnFamilySet()->GetColumnFamily(t->column_family_id) ==
+ nullptr) {
+ status =
+ AddColumnFamily(props->column_family_name, t->column_family_id);
+ }
+ t->meta.oldest_ancester_time = props->creation_time;
+ }
+ ColumnFamilyData* cfd = nullptr;
+ if (status.ok()) {
+ cfd = vset_.GetColumnFamilySet()->GetColumnFamily(t->column_family_id);
+ if (cfd->GetName() != props->column_family_name) {
+ ROCKS_LOG_ERROR(
+ db_options_.info_log,
+ "Table #%" PRIu64
+ ": inconsistent column family name '%s'; expected '%s' for column "
+ "family id %" PRIu32 ".",
+ t->meta.fd.GetNumber(), props->column_family_name.c_str(),
+ cfd->GetName().c_str(), t->column_family_id);
+ status = Status::Corruption(dbname_, "inconsistent column family name");
+ }
+ }
+ if (status.ok()) {
+ ReadOptions ropts;
+ ropts.total_order_seek = true;
+ InternalIterator* iter = table_cache_->NewIterator(
+ ropts, env_options_, cfd->internal_comparator(), t->meta,
+ nullptr /* range_del_agg */,
+ cfd->GetLatestMutableCFOptions()->prefix_extractor.get(),
+ /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr,
+ TableReaderCaller::kRepair, /*arena=*/nullptr, /*skip_filters=*/false,
+ /*level=*/-1, /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr);
+ ParsedInternalKey parsed;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ Slice key = iter->key();
+ if (!ParseInternalKey(key, &parsed)) {
+ ROCKS_LOG_ERROR(db_options_.info_log,
+ "Table #%" PRIu64 ": unparsable key %s",
+ t->meta.fd.GetNumber(), EscapeString(key).c_str());
+ continue;
+ }
+
+ counter++;
+
+ t->meta.UpdateBoundaries(key, iter->value(), parsed.sequence,
+ parsed.type);
+ }
+ if (!iter->status().ok()) {
+ status = iter->status();
+ }
+ delete iter;
+
+ ROCKS_LOG_INFO(db_options_.info_log, "Table #%" PRIu64 ": %d entries %s",
+ t->meta.fd.GetNumber(), counter,
+ status.ToString().c_str());
+ }
+ return status;
+ }
+
+ Status AddTables() {
+ std::unordered_map<uint32_t, std::vector<const TableInfo*>> cf_id_to_tables;
+ SequenceNumber max_sequence = 0;
+ for (size_t i = 0; i < tables_.size(); i++) {
+ cf_id_to_tables[tables_[i].column_family_id].push_back(&tables_[i]);
+ if (max_sequence < tables_[i].meta.fd.largest_seqno) {
+ max_sequence = tables_[i].meta.fd.largest_seqno;
+ }
+ }
+ vset_.SetLastAllocatedSequence(max_sequence);
+ vset_.SetLastPublishedSequence(max_sequence);
+ vset_.SetLastSequence(max_sequence);
+
+ for (const auto& cf_id_and_tables : cf_id_to_tables) {
+ auto* cfd =
+ vset_.GetColumnFamilySet()->GetColumnFamily(cf_id_and_tables.first);
+ VersionEdit edit;
+ edit.SetComparatorName(cfd->user_comparator()->Name());
+ edit.SetLogNumber(0);
+ edit.SetNextFile(next_file_number_);
+ edit.SetColumnFamily(cfd->GetID());
+
+ // TODO(opt): separate out into multiple levels
+ for (const auto* table : cf_id_and_tables.second) {
+ edit.AddFile(
+ 0, table->meta.fd.GetNumber(), table->meta.fd.GetPathId(),
+ table->meta.fd.GetFileSize(), table->meta.smallest,
+ table->meta.largest, table->meta.fd.smallest_seqno,
+ table->meta.fd.largest_seqno, table->meta.marked_for_compaction,
+ table->meta.oldest_blob_file_number,
+ table->meta.oldest_ancester_time, table->meta.file_creation_time,
+ table->meta.file_checksum, table->meta.file_checksum_func_name);
+ }
+ assert(next_file_number_ > 0);
+ vset_.MarkFileNumberUsed(next_file_number_ - 1);
+ mutex_.Lock();
+ Status status = vset_.LogAndApply(
+ cfd, *cfd->GetLatestMutableCFOptions(), &edit, &mutex_,
+ nullptr /* db_directory */, false /* new_descriptor_log */);
+ mutex_.Unlock();
+ if (!status.ok()) {
+ return status;
+ }
+ }
+ return Status::OK();
+ }
+
+ void ArchiveFile(const std::string& fname) {
+ // Move into another directory. E.g., for
+ // dir/foo
+ // rename to
+ // dir/lost/foo
+ const char* slash = strrchr(fname.c_str(), '/');
+ std::string new_dir;
+ if (slash != nullptr) {
+ new_dir.assign(fname.data(), slash - fname.data());
+ }
+ new_dir.append("/lost");
+ env_->CreateDir(new_dir); // Ignore error
+ std::string new_file = new_dir;
+ new_file.append("/");
+ new_file.append((slash == nullptr) ? fname.c_str() : slash + 1);
+ Status s = env_->RenameFile(fname, new_file);
+ ROCKS_LOG_INFO(db_options_.info_log, "Archiving %s: %s\n", fname.c_str(),
+ s.ToString().c_str());
+ }
+};
+
+Status GetDefaultCFOptions(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ ColumnFamilyOptions* res) {
+ assert(res != nullptr);
+ auto iter = std::find_if(column_families.begin(), column_families.end(),
+ [](const ColumnFamilyDescriptor& cfd) {
+ return cfd.name == kDefaultColumnFamilyName;
+ });
+ if (iter == column_families.end()) {
+ return Status::InvalidArgument(
+ "column_families", "Must contain entry for default column family");
+ }
+ *res = iter->options;
+ return Status::OK();
+}
+} // anonymous namespace
+
+Status RepairDB(const std::string& dbname, const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& column_families
+ ) {
+ ColumnFamilyOptions default_cf_opts;
+ Status status = GetDefaultCFOptions(column_families, &default_cf_opts);
+ if (status.ok()) {
+ Repairer repairer(dbname, db_options, column_families,
+ default_cf_opts,
+ ColumnFamilyOptions() /* unknown_cf_opts */,
+ false /* create_unknown_cfs */);
+ status = repairer.Run();
+ }
+ return status;
+}
+
+Status RepairDB(const std::string& dbname, const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ const ColumnFamilyOptions& unknown_cf_opts) {
+ ColumnFamilyOptions default_cf_opts;
+ Status status = GetDefaultCFOptions(column_families, &default_cf_opts);
+ if (status.ok()) {
+ Repairer repairer(dbname, db_options,
+ column_families, default_cf_opts,
+ unknown_cf_opts, true /* create_unknown_cfs */);
+ status = repairer.Run();
+ }
+ return status;
+}
+
+Status RepairDB(const std::string& dbname, const Options& options) {
+ Options opts(options);
+ if (opts.file_system == nullptr) {
+ opts.file_system.reset(new LegacyFileSystemWrapper(opts.env));
+ }
+
+ DBOptions db_options(opts);
+ ColumnFamilyOptions cf_options(opts);
+ Repairer repairer(dbname, db_options,
+ {}, cf_options /* default_cf_opts */,
+ cf_options /* unknown_cf_opts */,
+ true /* create_unknown_cfs */);
+ return repairer.Run();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/repair_test.cc b/src/rocksdb/db/repair_test.cc
new file mode 100644
index 000000000..ba2bae3d0
--- /dev/null
+++ b/src/rocksdb/db/repair_test.cc
@@ -0,0 +1,369 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "file/file_util.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/transaction_log.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+class RepairTest : public DBTestBase {
+ public:
+ RepairTest() : DBTestBase("/repair_test") {}
+
+ std::string GetFirstSstPath() {
+ uint64_t manifest_size;
+ std::vector<std::string> files;
+ db_->GetLiveFiles(files, &manifest_size);
+ auto sst_iter =
+ std::find_if(files.begin(), files.end(), [](const std::string& file) {
+ uint64_t number;
+ FileType type;
+ bool ok = ParseFileName(file, &number, &type);
+ return ok && type == kTableFile;
+ });
+ return sst_iter == files.end() ? "" : dbname_ + *sst_iter;
+ }
+};
+
+TEST_F(RepairTest, LostManifest) {
+ // Add a couple SST files, delete the manifest, and verify RepairDB() saves
+ // the day.
+ Put("key", "val");
+ Flush();
+ Put("key2", "val2");
+ Flush();
+ // Need to get path before Close() deletes db_, but delete it after Close() to
+ // ensure Close() didn't change the manifest.
+ std::string manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+ Close();
+ ASSERT_OK(env_->FileExists(manifest_path));
+ ASSERT_OK(env_->DeleteFile(manifest_path));
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+ Reopen(CurrentOptions());
+
+ ASSERT_EQ(Get("key"), "val");
+ ASSERT_EQ(Get("key2"), "val2");
+}
+
+TEST_F(RepairTest, CorruptManifest) {
+ // Manifest is in an invalid format. Expect a full recovery.
+ Put("key", "val");
+ Flush();
+ Put("key2", "val2");
+ Flush();
+ // Need to get path before Close() deletes db_, but overwrite it after Close()
+ // to ensure Close() didn't change the manifest.
+ std::string manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+ Close();
+ ASSERT_OK(env_->FileExists(manifest_path));
+
+ LegacyFileSystemWrapper fs(env_);
+ CreateFile(&fs, manifest_path, "blah", false /* use_fsync */);
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+ Reopen(CurrentOptions());
+
+ ASSERT_EQ(Get("key"), "val");
+ ASSERT_EQ(Get("key2"), "val2");
+}
+
+TEST_F(RepairTest, IncompleteManifest) {
+ // In this case, the manifest is valid but does not reference all of the SST
+ // files. Expect a full recovery.
+ Put("key", "val");
+ Flush();
+ std::string orig_manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+ CopyFile(orig_manifest_path, orig_manifest_path + ".tmp");
+ Put("key2", "val2");
+ Flush();
+ // Need to get path before Close() deletes db_, but overwrite it after Close()
+ // to ensure Close() didn't change the manifest.
+ std::string new_manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+ Close();
+ ASSERT_OK(env_->FileExists(new_manifest_path));
+ // Replace the manifest with one that is only aware of the first SST file.
+ CopyFile(orig_manifest_path + ".tmp", new_manifest_path);
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+ Reopen(CurrentOptions());
+
+ ASSERT_EQ(Get("key"), "val");
+ ASSERT_EQ(Get("key2"), "val2");
+}
+
+TEST_F(RepairTest, PostRepairSstFileNumbering) {
+ // Verify after a DB is repaired, new files will be assigned higher numbers
+ // than old files.
+ Put("key", "val");
+ Flush();
+ Put("key2", "val2");
+ Flush();
+ uint64_t pre_repair_file_num = dbfull()->TEST_Current_Next_FileNo();
+ Close();
+
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+
+ Reopen(CurrentOptions());
+ uint64_t post_repair_file_num = dbfull()->TEST_Current_Next_FileNo();
+ ASSERT_GE(post_repair_file_num, pre_repair_file_num);
+}
+
+TEST_F(RepairTest, LostSst) {
+ // Delete one of the SST files but preserve the manifest that refers to it,
+ // then verify the DB is still usable for the intact SST.
+ Put("key", "val");
+ Flush();
+ Put("key2", "val2");
+ Flush();
+ auto sst_path = GetFirstSstPath();
+ ASSERT_FALSE(sst_path.empty());
+ ASSERT_OK(env_->DeleteFile(sst_path));
+
+ Close();
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+ Reopen(CurrentOptions());
+
+ // Exactly one of the key-value pairs should be in the DB now.
+ ASSERT_TRUE((Get("key") == "val") != (Get("key2") == "val2"));
+}
+
+TEST_F(RepairTest, CorruptSst) {
+ // Corrupt one of the SST files but preserve the manifest that refers to it,
+ // then verify the DB is still usable for the intact SST.
+ Put("key", "val");
+ Flush();
+ Put("key2", "val2");
+ Flush();
+ auto sst_path = GetFirstSstPath();
+ ASSERT_FALSE(sst_path.empty());
+
+ LegacyFileSystemWrapper fs(env_);
+ CreateFile(&fs, sst_path, "blah", false /* use_fsync */);
+
+ Close();
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+ Reopen(CurrentOptions());
+
+ // Exactly one of the key-value pairs should be in the DB now.
+ ASSERT_TRUE((Get("key") == "val") != (Get("key2") == "val2"));
+}
+
+TEST_F(RepairTest, UnflushedSst) {
+ // This test case invokes repair while some data is unflushed, then verifies
+ // that data is in the db.
+ Put("key", "val");
+ VectorLogPtr wal_files;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+ ASSERT_EQ(wal_files.size(), 1);
+ uint64_t total_ssts_size;
+ GetAllSSTFiles(&total_ssts_size);
+ ASSERT_EQ(total_ssts_size, 0);
+ // Need to get path before Close() deletes db_, but delete it after Close() to
+ // ensure Close() didn't change the manifest.
+ std::string manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+ Close();
+ ASSERT_OK(env_->FileExists(manifest_path));
+ ASSERT_OK(env_->DeleteFile(manifest_path));
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+ Reopen(CurrentOptions());
+
+ ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+ ASSERT_EQ(wal_files.size(), 0);
+ GetAllSSTFiles(&total_ssts_size);
+ ASSERT_GT(total_ssts_size, 0);
+ ASSERT_EQ(Get("key"), "val");
+}
+
+TEST_F(RepairTest, SeparateWalDir) {
+ do {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ Put("key", "val");
+ Put("foo", "bar");
+ VectorLogPtr wal_files;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+ ASSERT_EQ(wal_files.size(), 1);
+ uint64_t total_ssts_size;
+ GetAllSSTFiles(&total_ssts_size);
+ ASSERT_EQ(total_ssts_size, 0);
+ std::string manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+ Close();
+ ASSERT_OK(env_->FileExists(manifest_path));
+ ASSERT_OK(env_->DeleteFile(manifest_path));
+ ASSERT_OK(RepairDB(dbname_, options));
+
+ // make sure that all WALs are converted to SSTables.
+ options.wal_dir = "";
+
+ Reopen(options);
+ ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+ ASSERT_EQ(wal_files.size(), 0);
+ GetAllSSTFiles(&total_ssts_size);
+ ASSERT_GT(total_ssts_size, 0);
+ ASSERT_EQ(Get("key"), "val");
+ ASSERT_EQ(Get("foo"), "bar");
+
+  } while (ChangeWalOptions());
+}
+
+TEST_F(RepairTest, RepairMultipleColumnFamilies) {
+ // Verify repair logic associates SST files with their original column
+ // families.
+ const int kNumCfs = 3;
+ const int kEntriesPerCf = 2;
+ DestroyAndReopen(CurrentOptions());
+ CreateAndReopenWithCF({"pikachu1", "pikachu2"}, CurrentOptions());
+ for (int i = 0; i < kNumCfs; ++i) {
+ for (int j = 0; j < kEntriesPerCf; ++j) {
+ Put(i, "key" + ToString(j), "val" + ToString(j));
+ if (j == kEntriesPerCf - 1 && i == kNumCfs - 1) {
+ // Leave one unflushed so we can verify WAL entries are properly
+ // associated with column families.
+ continue;
+ }
+ Flush(i);
+ }
+ }
+
+ // Need to get path before Close() deletes db_, but delete it after Close() to
+ // ensure Close() doesn't re-create the manifest.
+ std::string manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+ Close();
+ ASSERT_OK(env_->FileExists(manifest_path));
+ ASSERT_OK(env_->DeleteFile(manifest_path));
+
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+
+ ReopenWithColumnFamilies({"default", "pikachu1", "pikachu2"},
+ CurrentOptions());
+ for (int i = 0; i < kNumCfs; ++i) {
+ for (int j = 0; j < kEntriesPerCf; ++j) {
+ ASSERT_EQ(Get(i, "key" + ToString(j)), "val" + ToString(j));
+ }
+ }
+}
+
+TEST_F(RepairTest, RepairColumnFamilyOptions) {
+ // Verify repair logic uses correct ColumnFamilyOptions when repairing a
+ // database with different options for column families.
+ const int kNumCfs = 2;
+ const int kEntriesPerCf = 2;
+
+ Options opts(CurrentOptions()), rev_opts(CurrentOptions());
+ opts.comparator = BytewiseComparator();
+ rev_opts.comparator = ReverseBytewiseComparator();
+
+ DestroyAndReopen(opts);
+ CreateColumnFamilies({"reverse"}, rev_opts);
+ ReopenWithColumnFamilies({"default", "reverse"},
+ std::vector<Options>{opts, rev_opts});
+ for (int i = 0; i < kNumCfs; ++i) {
+ for (int j = 0; j < kEntriesPerCf; ++j) {
+ Put(i, "key" + ToString(j), "val" + ToString(j));
+ if (i == kNumCfs - 1 && j == kEntriesPerCf - 1) {
+ // Leave one unflushed so we can verify RepairDB's flush logic
+ continue;
+ }
+ Flush(i);
+ }
+ }
+ Close();
+
+ // RepairDB() records the comparator in the manifest, and DB::Open would fail
+ // if a different comparator were used.
+ ASSERT_OK(RepairDB(dbname_, opts, {{"default", opts}, {"reverse", rev_opts}},
+ opts /* unknown_cf_opts */));
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "reverse"},
+ std::vector<Options>{opts, rev_opts}));
+ for (int i = 0; i < kNumCfs; ++i) {
+ for (int j = 0; j < kEntriesPerCf; ++j) {
+ ASSERT_EQ(Get(i, "key" + ToString(j)), "val" + ToString(j));
+ }
+ }
+
+ // Examine table properties to verify RepairDB() used the right options when
+ // converting WAL->SST
+ TablePropertiesCollection fname_to_props;
+ db_->GetPropertiesOfAllTables(handles_[1], &fname_to_props);
+ ASSERT_EQ(fname_to_props.size(), 2U);
+ for (const auto& fname_and_props : fname_to_props) {
+    std::string comparator_name(
+        InternalKeyComparator(rev_opts.comparator).Name());
+ comparator_name = comparator_name.substr(comparator_name.find(':') + 1);
+ ASSERT_EQ(comparator_name,
+ fname_and_props.second->comparator_name);
+ }
+ Close();
+
+ // Also check comparator when it's provided via "unknown" CF options
+ ASSERT_OK(RepairDB(dbname_, opts, {{"default", opts}},
+ rev_opts /* unknown_cf_opts */));
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "reverse"},
+ std::vector<Options>{opts, rev_opts}));
+ for (int i = 0; i < kNumCfs; ++i) {
+ for (int j = 0; j < kEntriesPerCf; ++j) {
+ ASSERT_EQ(Get(i, "key" + ToString(j)), "val" + ToString(j));
+ }
+ }
+}
+
+TEST_F(RepairTest, DbNameContainsTrailingSlash) {
+ {
+ bool tmp;
+ if (env_->AreFilesSame("", "", &tmp).IsNotSupported()) {
+ fprintf(stderr,
+ "skipping RepairTest.DbNameContainsTrailingSlash due to "
+ "unsupported Env::AreFilesSame\n");
+ return;
+ }
+ }
+
+ Put("key", "val");
+ Flush();
+ Close();
+
+ ASSERT_OK(RepairDB(dbname_ + "/", CurrentOptions()));
+ Reopen(CurrentOptions());
+ ASSERT_EQ(Get("key"), "val");
+}
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as RepairDB is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/snapshot_checker.h b/src/rocksdb/db/snapshot_checker.h
new file mode 100644
index 000000000..1d2c2c316
--- /dev/null
+++ b/src/rocksdb/db/snapshot_checker.h
@@ -0,0 +1,61 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+enum class SnapshotCheckerResult : int {
+ kInSnapshot = 0,
+ kNotInSnapshot = 1,
+  // Returned when the snapshot has been released and the checker cannot tell
+  // whether the given sequence is visible to the snapshot.
+ kSnapshotReleased = 2,
+};
+
+// Callback class that controls GC of duplicate keys in flush/compaction.
+class SnapshotChecker {
+ public:
+ virtual ~SnapshotChecker() {}
+ virtual SnapshotCheckerResult CheckInSnapshot(
+ SequenceNumber sequence, SequenceNumber snapshot_sequence) const = 0;
+};
+
+class DisableGCSnapshotChecker : public SnapshotChecker {
+ public:
+ virtual ~DisableGCSnapshotChecker() {}
+ virtual SnapshotCheckerResult CheckInSnapshot(
+ SequenceNumber /*sequence*/,
+ SequenceNumber /*snapshot_sequence*/) const override {
+ // By returning kNotInSnapshot, we prevent all the values from being GCed
+ return SnapshotCheckerResult::kNotInSnapshot;
+ }
+ static DisableGCSnapshotChecker* Instance() { return &instance_; }
+
+ protected:
+ static DisableGCSnapshotChecker instance_;
+ explicit DisableGCSnapshotChecker() {}
+};
+
+class WritePreparedTxnDB;
+
+// Callback class created by WritePreparedTxnDB to check whether a key
+// is visible to a snapshot.
+class WritePreparedSnapshotChecker : public SnapshotChecker {
+ public:
+ explicit WritePreparedSnapshotChecker(WritePreparedTxnDB* txn_db);
+ virtual ~WritePreparedSnapshotChecker() {}
+
+ virtual SnapshotCheckerResult CheckInSnapshot(
+ SequenceNumber sequence, SequenceNumber snapshot_sequence) const override;
+
+ private:
+#ifndef ROCKSDB_LITE
+ const WritePreparedTxnDB* const txn_db_;
+#endif // !ROCKSDB_LITE
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/snapshot_impl.cc b/src/rocksdb/db/snapshot_impl.cc
new file mode 100644
index 000000000..b9228c797
--- /dev/null
+++ b/src/rocksdb/db/snapshot_impl.cc
@@ -0,0 +1,26 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/snapshot.h"
+
+#include "rocksdb/db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ManagedSnapshot::ManagedSnapshot(DB* db) : db_(db),
+ snapshot_(db->GetSnapshot()) {}
+
+ManagedSnapshot::ManagedSnapshot(DB* db, const Snapshot* _snapshot)
+ : db_(db), snapshot_(_snapshot) {}
+
+ManagedSnapshot::~ManagedSnapshot() {
+ if (snapshot_) {
+ db_->ReleaseSnapshot(snapshot_);
+ }
+}
+
+const Snapshot* ManagedSnapshot::snapshot() { return snapshot_;}
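+
+// For illustration only (hypothetical caller code): ManagedSnapshot is an RAII
+// wrapper, so the snapshot acquired in the constructor is released when the
+// wrapper goes out of scope.
+//
+//   {
+//     ManagedSnapshot snap(db);
+//     ReadOptions read_options;
+//     read_options.snapshot = snap.snapshot();
+//     // ... reads see a consistent view ...
+//   }  // ReleaseSnapshot() is called here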
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/snapshot_impl.h b/src/rocksdb/db/snapshot_impl.h
new file mode 100644
index 000000000..785f814f8
--- /dev/null
+++ b/src/rocksdb/db/snapshot_impl.h
@@ -0,0 +1,167 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <vector>
+
+#include "rocksdb/db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SnapshotList;
+
+// Snapshots are kept in a doubly-linked list in the DB.
+// Each SnapshotImpl corresponds to a particular sequence number.
+class SnapshotImpl : public Snapshot {
+ public:
+ SequenceNumber number_; // const after creation
+  // Indicates the smallest uncommitted sequence number at the time the
+  // snapshot was taken. This is currently used by WritePrepared transactions
+  // to limit the scope of queries to IsInSnapshot.
+ SequenceNumber min_uncommitted_ = kMinUnCommittedSeq;
+
+ virtual SequenceNumber GetSequenceNumber() const override { return number_; }
+
+ private:
+ friend class SnapshotList;
+
+ // SnapshotImpl is kept in a doubly-linked circular list
+ SnapshotImpl* prev_;
+ SnapshotImpl* next_;
+
+ SnapshotList* list_; // just for sanity checks
+
+ int64_t unix_time_;
+
+ // Will this snapshot be used by a Transaction to do write-conflict checking?
+ bool is_write_conflict_boundary_;
+};
+
+class SnapshotList {
+ public:
+ SnapshotList() {
+ list_.prev_ = &list_;
+ list_.next_ = &list_;
+ list_.number_ = 0xFFFFFFFFL; // placeholder marker, for debugging
+ // Set all the variables to make UBSAN happy.
+ list_.list_ = nullptr;
+ list_.unix_time_ = 0;
+ list_.is_write_conflict_boundary_ = false;
+ count_ = 0;
+ }
+
+ // No copy-construct.
+ SnapshotList(const SnapshotList&) = delete;
+
+ bool empty() const { return list_.next_ == &list_; }
+ SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; }
+ SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; }
+
+ SnapshotImpl* New(SnapshotImpl* s, SequenceNumber seq, uint64_t unix_time,
+ bool is_write_conflict_boundary) {
+ s->number_ = seq;
+ s->unix_time_ = unix_time;
+ s->is_write_conflict_boundary_ = is_write_conflict_boundary;
+ s->list_ = this;
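+    // Link s in just before the dummy head, i.e. at the tail of the circular
+    // list, so it becomes the newest snapshot.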
+ s->next_ = &list_;
+ s->prev_ = list_.prev_;
+ s->prev_->next_ = s;
+ s->next_->prev_ = s;
+ count_++;
+ return s;
+ }
+
+  // Not responsible for freeing the object.
+ void Delete(const SnapshotImpl* s) {
+ assert(s->list_ == this);
+ s->prev_->next_ = s->next_;
+ s->next_->prev_ = s->prev_;
+ count_--;
+ }
+
+ // retrieve all snapshot numbers up until max_seq. They are sorted in
+ // ascending order (with no duplicates).
+ std::vector<SequenceNumber> GetAll(
+ SequenceNumber* oldest_write_conflict_snapshot = nullptr,
+ const SequenceNumber& max_seq = kMaxSequenceNumber) const {
+ std::vector<SequenceNumber> ret;
+ GetAll(&ret, oldest_write_conflict_snapshot, max_seq);
+ return ret;
+ }
+
+ void GetAll(std::vector<SequenceNumber>* snap_vector,
+ SequenceNumber* oldest_write_conflict_snapshot = nullptr,
+ const SequenceNumber& max_seq = kMaxSequenceNumber) const {
+ std::vector<SequenceNumber>& ret = *snap_vector;
+ // So far we have no use case that would pass a non-empty vector
+ assert(ret.size() == 0);
+
+ if (oldest_write_conflict_snapshot != nullptr) {
+ *oldest_write_conflict_snapshot = kMaxSequenceNumber;
+ }
+
+ if (empty()) {
+ return;
+ }
+ const SnapshotImpl* s = &list_;
+ while (s->next_ != &list_) {
+ if (s->next_->number_ > max_seq) {
+ break;
+ }
+ // Avoid duplicates
+ if (ret.empty() || ret.back() != s->next_->number_) {
+ ret.push_back(s->next_->number_);
+ }
+
+ if (oldest_write_conflict_snapshot != nullptr &&
+ *oldest_write_conflict_snapshot == kMaxSequenceNumber &&
+ s->next_->is_write_conflict_boundary_) {
+ // If this is the first write-conflict boundary snapshot in the list,
+ // it is the oldest
+ *oldest_write_conflict_snapshot = s->next_->number_;
+ }
+
+ s = s->next_;
+ }
+ return;
+ }
+
+ // get the sequence number of the most recent snapshot
+ SequenceNumber GetNewest() {
+ if (empty()) {
+ return 0;
+ }
+ return newest()->number_;
+ }
+
+ int64_t GetOldestSnapshotTime() const {
+ if (empty()) {
+ return 0;
+ } else {
+ return oldest()->unix_time_;
+ }
+ }
+
+ int64_t GetOldestSnapshotSequence() const {
+ if (empty()) {
+ return 0;
+ } else {
+ return oldest()->GetSequenceNumber();
+ }
+ }
+
+ uint64_t count() const { return count_; }
+
+ private:
+ // Dummy head of doubly-linked list of snapshots
+ SnapshotImpl list_;
+ uint64_t count_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/table_cache.cc b/src/rocksdb/db/table_cache.cc
new file mode 100644
index 000000000..411959a33
--- /dev/null
+++ b/src/rocksdb/db/table_cache.cc
@@ -0,0 +1,668 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/table_cache.h"
+
+#include "db/dbformat.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/snapshot_impl.h"
+#include "db/version_edit.h"
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/statistics.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+#include "table/multiget_context.h"
+#include "table/table_builder.h"
+#include "table/table_reader.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+template <class T>
+static void DeleteEntry(const Slice& /*key*/, void* value) {
+ T* typed_value = reinterpret_cast<T*>(value);
+ delete typed_value;
+}
+
+static void UnrefEntry(void* arg1, void* arg2) {
+ Cache* cache = reinterpret_cast<Cache*>(arg1);
+ Cache::Handle* h = reinterpret_cast<Cache::Handle*>(arg2);
+ cache->Release(h);
+}
+
+static Slice GetSliceForFileNumber(const uint64_t* file_number) {
+ return Slice(reinterpret_cast<const char*>(file_number),
+ sizeof(*file_number));
+}
+
+#ifndef ROCKSDB_LITE
+
+void AppendVarint64(IterKey* key, uint64_t v) {
+ char buf[10];
+ auto ptr = EncodeVarint64(buf, v);
+ key->TrimAppend(key->Size(), buf, ptr - buf);
+}
+
+#endif // ROCKSDB_LITE
+
+} // namespace
+
+TableCache::TableCache(const ImmutableCFOptions& ioptions,
+ const FileOptions& file_options, Cache* const cache,
+ BlockCacheTracer* const block_cache_tracer)
+ : ioptions_(ioptions),
+ file_options_(file_options),
+ cache_(cache),
+ immortal_tables_(false),
+ block_cache_tracer_(block_cache_tracer) {
+ if (ioptions_.row_cache) {
+ // If the same cache is shared by multiple instances, we need to
+ // disambiguate its entries.
+ PutVarint64(&row_cache_id_, ioptions_.row_cache->NewId());
+ }
+}
+
+TableCache::~TableCache() {
+}
+
+TableReader* TableCache::GetTableReaderFromHandle(Cache::Handle* handle) {
+ return reinterpret_cast<TableReader*>(cache_->Value(handle));
+}
+
+void TableCache::ReleaseHandle(Cache::Handle* handle) {
+ cache_->Release(handle);
+}
+
+Status TableCache::GetTableReader(
+ const FileOptions& file_options,
+ const InternalKeyComparator& internal_comparator, const FileDescriptor& fd,
+ bool sequential_mode, bool record_read_stats, HistogramImpl* file_read_hist,
+ std::unique_ptr<TableReader>* table_reader,
+ const SliceTransform* prefix_extractor, bool skip_filters, int level,
+ bool prefetch_index_and_filter_in_cache) {
+ std::string fname =
+ TableFileName(ioptions_.cf_paths, fd.GetNumber(), fd.GetPathId());
+ std::unique_ptr<FSRandomAccessFile> file;
+ Status s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file,
+ nullptr);
+ RecordTick(ioptions_.statistics, NO_FILE_OPENS);
+ if (s.IsPathNotFound()) {
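+    // Fall back to the LevelDB-style ".ldb" file name used by older releases
+    // before reporting the open failure.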
+ fname = Rocks2LevelTableFileName(fname);
+ s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file, nullptr);
+ RecordTick(ioptions_.statistics, NO_FILE_OPENS);
+ }
+
+ if (s.ok()) {
+ if (!sequential_mode && ioptions_.advise_random_on_open) {
+ file->Hint(FSRandomAccessFile::kRandom);
+ }
+ StopWatch sw(ioptions_.env, ioptions_.statistics, TABLE_OPEN_IO_MICROS);
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ new RandomAccessFileReader(
+ std::move(file), fname, ioptions_.env,
+ record_read_stats ? ioptions_.statistics : nullptr, SST_READ_MICROS,
+ file_read_hist, ioptions_.rate_limiter, ioptions_.listeners));
+ s = ioptions_.table_factory->NewTableReader(
+ TableReaderOptions(ioptions_, prefix_extractor, file_options,
+ internal_comparator, skip_filters, immortal_tables_,
+ level, fd.largest_seqno, block_cache_tracer_),
+ std::move(file_reader), fd.GetFileSize(), table_reader,
+ prefetch_index_and_filter_in_cache);
+ TEST_SYNC_POINT("TableCache::GetTableReader:0");
+ }
+ return s;
+}
+
+void TableCache::EraseHandle(const FileDescriptor& fd, Cache::Handle* handle) {
+ ReleaseHandle(handle);
+ uint64_t number = fd.GetNumber();
+ Slice key = GetSliceForFileNumber(&number);
+ cache_->Erase(key);
+}
+
+Status TableCache::FindTable(const FileOptions& file_options,
+ const InternalKeyComparator& internal_comparator,
+ const FileDescriptor& fd, Cache::Handle** handle,
+ const SliceTransform* prefix_extractor,
+ const bool no_io, bool record_read_stats,
+ HistogramImpl* file_read_hist, bool skip_filters,
+ int level,
+ bool prefetch_index_and_filter_in_cache) {
+ PERF_TIMER_GUARD_WITH_ENV(find_table_nanos, ioptions_.env);
+ Status s;
+ uint64_t number = fd.GetNumber();
+ Slice key = GetSliceForFileNumber(&number);
+ *handle = cache_->Lookup(key);
+ TEST_SYNC_POINT_CALLBACK("TableCache::FindTable:0",
+ const_cast<bool*>(&no_io));
+
+ if (*handle == nullptr) {
+ if (no_io) { // Don't do IO and return a not-found status
+ return Status::Incomplete("Table not found in table_cache, no_io is set");
+ }
+ std::unique_ptr<TableReader> table_reader;
+ s = GetTableReader(file_options, internal_comparator, fd,
+ false /* sequential mode */, record_read_stats,
+ file_read_hist, &table_reader, prefix_extractor,
+ skip_filters, level, prefetch_index_and_filter_in_cache);
+ if (!s.ok()) {
+ assert(table_reader == nullptr);
+ RecordTick(ioptions_.statistics, NO_FILE_ERRORS);
+ // We do not cache error results so that if the error is transient,
+ // or somebody repairs the file, we recover automatically.
+ } else {
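+      // Each table reader is inserted with a charge of 1, so the table cache
+      // capacity effectively bounds the number of concurrently open tables.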
+ s = cache_->Insert(key, table_reader.get(), 1, &DeleteEntry<TableReader>,
+ handle);
+ if (s.ok()) {
+ // Release ownership of table reader.
+ table_reader.release();
+ }
+ }
+ }
+ return s;
+}
+
+InternalIterator* TableCache::NewIterator(
+ const ReadOptions& options, const FileOptions& file_options,
+ const InternalKeyComparator& icomparator, const FileMetaData& file_meta,
+ RangeDelAggregator* range_del_agg, const SliceTransform* prefix_extractor,
+ TableReader** table_reader_ptr, HistogramImpl* file_read_hist,
+ TableReaderCaller caller, Arena* arena, bool skip_filters, int level,
+ const InternalKey* smallest_compaction_key,
+ const InternalKey* largest_compaction_key) {
+ PERF_TIMER_GUARD(new_table_iterator_nanos);
+
+ Status s;
+ TableReader* table_reader = nullptr;
+ Cache::Handle* handle = nullptr;
+ if (table_reader_ptr != nullptr) {
+ *table_reader_ptr = nullptr;
+ }
+ bool for_compaction = caller == TableReaderCaller::kCompaction;
+ auto& fd = file_meta.fd;
+ table_reader = fd.table_reader;
+ if (table_reader == nullptr) {
+ s = FindTable(file_options, icomparator, fd, &handle, prefix_extractor,
+ options.read_tier == kBlockCacheTier /* no_io */,
+ !for_compaction /* record_read_stats */, file_read_hist,
+ skip_filters, level);
+ if (s.ok()) {
+ table_reader = GetTableReaderFromHandle(handle);
+ }
+ }
+ InternalIterator* result = nullptr;
+ if (s.ok()) {
+ if (options.table_filter &&
+ !options.table_filter(*table_reader->GetTableProperties())) {
+ result = NewEmptyInternalIterator<Slice>(arena);
+ } else {
+ result = table_reader->NewIterator(options, prefix_extractor, arena,
+ skip_filters, caller,
+ file_options.compaction_readahead_size);
+ }
+ if (handle != nullptr) {
+ result->RegisterCleanup(&UnrefEntry, cache_, handle);
+      handle = nullptr;  // prevent it from being released below
+ }
+
+ if (for_compaction) {
+ table_reader->SetupForCompaction();
+ }
+ if (table_reader_ptr != nullptr) {
+ *table_reader_ptr = table_reader;
+ }
+ }
+ if (s.ok() && range_del_agg != nullptr && !options.ignore_range_deletions) {
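+    // AddFile() returns true only the first time this file number is seen, so
+    // each SST's range tombstones are added to the aggregator at most once.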
+ if (range_del_agg->AddFile(fd.GetNumber())) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ static_cast<FragmentedRangeTombstoneIterator*>(
+ table_reader->NewRangeTombstoneIterator(options)));
+ if (range_del_iter != nullptr) {
+ s = range_del_iter->status();
+ }
+ if (s.ok()) {
+ const InternalKey* smallest = &file_meta.smallest;
+ const InternalKey* largest = &file_meta.largest;
+ if (smallest_compaction_key != nullptr) {
+ smallest = smallest_compaction_key;
+ }
+ if (largest_compaction_key != nullptr) {
+ largest = largest_compaction_key;
+ }
+ range_del_agg->AddTombstones(std::move(range_del_iter), smallest,
+ largest);
+ }
+ }
+ }
+
+ if (handle != nullptr) {
+ ReleaseHandle(handle);
+ }
+ if (!s.ok()) {
+ assert(result == nullptr);
+ result = NewErrorInternalIterator<Slice>(s, arena);
+ }
+ return result;
+}
+
+Status TableCache::GetRangeTombstoneIterator(
+ const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter) {
+ const FileDescriptor& fd = file_meta.fd;
+ Status s;
+ TableReader* t = fd.table_reader;
+ Cache::Handle* handle = nullptr;
+ if (t == nullptr) {
+ s = FindTable(file_options_, internal_comparator, fd, &handle);
+ if (s.ok()) {
+ t = GetTableReaderFromHandle(handle);
+ }
+ }
+ if (s.ok()) {
+ out_iter->reset(t->NewRangeTombstoneIterator(options));
+ assert(out_iter);
+ }
+ return s;
+}
+
+#ifndef ROCKSDB_LITE
+void TableCache::CreateRowCacheKeyPrefix(const ReadOptions& options,
+ const FileDescriptor& fd,
+ const Slice& internal_key,
+ GetContext* get_context,
+ IterKey& row_cache_key) {
+ uint64_t fd_number = fd.GetNumber();
+  // We use the user key as the cache key instead of the internal key,
+  // otherwise the whole cache would be invalidated every time the
+  // sequence number increases. However, to support caching snapshot
+  // reads, we append the sequence number (incremented by 1 to
+  // distinguish it from 0), but only in that case.
+ // If the snapshot is larger than the largest seqno in the file,
+ // all data should be exposed to the snapshot, so we treat it
+ // the same as there is no snapshot. The exception is that if
+ // a seq-checking callback is registered, some internal keys
+ // may still be filtered out.
+ uint64_t seq_no = 0;
+  // Maybe we can include the whole file if snapshot == fd.largest_seqno.
+ if (options.snapshot != nullptr &&
+ (get_context->has_callback() ||
+ static_cast_with_check<const SnapshotImpl, const Snapshot>(
+ options.snapshot)
+ ->GetSequenceNumber() <= fd.largest_seqno)) {
+      // We should consider using options.snapshot->GetSequenceNumber()
+      // instead of GetInternalKeySeqno(k), which would make the code
+      // easier to understand.
+ seq_no = 1 + GetInternalKeySeqno(internal_key);
+ }
+
+ // Compute row cache key.
+ row_cache_key.TrimAppend(row_cache_key.Size(), row_cache_id_.data(),
+ row_cache_id_.size());
+ AppendVarint64(&row_cache_key, fd_number);
+ AppendVarint64(&row_cache_key, seq_no);
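+  // GetFromRowCache() later appends the user key, so the complete lookup key
+  // is [row_cache_id_][file_number][seq_no][user_key].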
+}
+
+bool TableCache::GetFromRowCache(const Slice& user_key, IterKey& row_cache_key,
+ size_t prefix_size, GetContext* get_context) {
+ bool found = false;
+
+ row_cache_key.TrimAppend(prefix_size, user_key.data(), user_key.size());
+ if (auto row_handle =
+ ioptions_.row_cache->Lookup(row_cache_key.GetUserKey())) {
+ // Cleanable routine to release the cache entry
+ Cleanable value_pinner;
+ auto release_cache_entry_func = [](void* cache_to_clean,
+ void* cache_handle) {
+ ((Cache*)cache_to_clean)->Release((Cache::Handle*)cache_handle);
+ };
+ auto found_row_cache_entry =
+ static_cast<const std::string*>(ioptions_.row_cache->Value(row_handle));
+    // If we get here, the value is located in the cache.
+ // found_row_cache_entry points to the value on cache,
+ // and value_pinner has cleanup procedure for the cached entry.
+ // After replayGetContextLog() returns, get_context.pinnable_slice_
+ // will point to cache entry buffer (or a copy based on that) and
+ // cleanup routine under value_pinner will be delegated to
+ // get_context.pinnable_slice_. Cache entry is released when
+ // get_context.pinnable_slice_ is reset.
+ value_pinner.RegisterCleanup(release_cache_entry_func,
+ ioptions_.row_cache.get(), row_handle);
+ replayGetContextLog(*found_row_cache_entry, user_key, get_context,
+ &value_pinner);
+ RecordTick(ioptions_.statistics, ROW_CACHE_HIT);
+ found = true;
+ } else {
+ RecordTick(ioptions_.statistics, ROW_CACHE_MISS);
+ }
+ return found;
+}
+#endif // ROCKSDB_LITE
+
+Status TableCache::Get(const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, const Slice& k,
+ GetContext* get_context,
+ const SliceTransform* prefix_extractor,
+ HistogramImpl* file_read_hist, bool skip_filters,
+ int level) {
+ auto& fd = file_meta.fd;
+ std::string* row_cache_entry = nullptr;
+ bool done = false;
+#ifndef ROCKSDB_LITE
+ IterKey row_cache_key;
+ std::string row_cache_entry_buffer;
+
+ // Check row cache if enabled. Since row cache does not currently store
+ // sequence numbers, we cannot use it if we need to fetch the sequence.
+ if (ioptions_.row_cache && !get_context->NeedToReadSequence()) {
+ auto user_key = ExtractUserKey(k);
+ CreateRowCacheKeyPrefix(options, fd, k, get_context, row_cache_key);
+ done = GetFromRowCache(user_key, row_cache_key, row_cache_key.Size(),
+ get_context);
+ if (!done) {
+ row_cache_entry = &row_cache_entry_buffer;
+ }
+ }
+#endif // ROCKSDB_LITE
+ Status s;
+ TableReader* t = fd.table_reader;
+ Cache::Handle* handle = nullptr;
+ if (!done && s.ok()) {
+ if (t == nullptr) {
+ s = FindTable(
+ file_options_, internal_comparator, fd, &handle, prefix_extractor,
+ options.read_tier == kBlockCacheTier /* no_io */,
+ true /* record_read_stats */, file_read_hist, skip_filters, level);
+ if (s.ok()) {
+ t = GetTableReaderFromHandle(handle);
+ }
+ }
+ SequenceNumber* max_covering_tombstone_seq =
+ get_context->max_covering_tombstone_seq();
+ if (s.ok() && max_covering_tombstone_seq != nullptr &&
+ !options.ignore_range_deletions) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ t->NewRangeTombstoneIterator(options));
+ if (range_del_iter != nullptr) {
+ *max_covering_tombstone_seq = std::max(
+ *max_covering_tombstone_seq,
+ range_del_iter->MaxCoveringTombstoneSeqnum(ExtractUserKey(k)));
+ }
+ }
+ if (s.ok()) {
+ get_context->SetReplayLog(row_cache_entry); // nullptr if no cache.
+ s = t->Get(options, k, get_context, prefix_extractor, skip_filters);
+ get_context->SetReplayLog(nullptr);
+ } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) {
+ // Couldn't find Table in cache but treat as kFound if no_io set
+ get_context->MarkKeyMayExist();
+ s = Status::OK();
+ done = true;
+ }
+ }
+
+#ifndef ROCKSDB_LITE
+ // Put the replay log in row cache only if something was found.
+ if (!done && s.ok() && row_cache_entry && !row_cache_entry->empty()) {
+ size_t charge =
+ row_cache_key.Size() + row_cache_entry->size() + sizeof(std::string);
+ void* row_ptr = new std::string(std::move(*row_cache_entry));
+ ioptions_.row_cache->Insert(row_cache_key.GetUserKey(), row_ptr, charge,
+ &DeleteEntry<std::string>);
+ }
+#endif // ROCKSDB_LITE
+
+ if (handle != nullptr) {
+ ReleaseHandle(handle);
+ }
+ return s;
+}
+
+// Batched version of TableCache::MultiGet.
+Status TableCache::MultiGet(const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ const MultiGetContext::Range* mget_range,
+ const SliceTransform* prefix_extractor,
+ HistogramImpl* file_read_hist, bool skip_filters,
+ int level) {
+ auto& fd = file_meta.fd;
+ Status s;
+ TableReader* t = fd.table_reader;
+ Cache::Handle* handle = nullptr;
+ MultiGetRange table_range(*mget_range, mget_range->begin(),
+ mget_range->end());
+#ifndef ROCKSDB_LITE
+ autovector<std::string, MultiGetContext::MAX_BATCH_SIZE> row_cache_entries;
+ IterKey row_cache_key;
+ size_t row_cache_key_prefix_size = 0;
+ KeyContext& first_key = *table_range.begin();
+ bool lookup_row_cache =
+ ioptions_.row_cache && !first_key.get_context->NeedToReadSequence();
+
+ // Check row cache if enabled. Since row cache does not currently store
+ // sequence numbers, we cannot use it if we need to fetch the sequence.
+ if (lookup_row_cache) {
+ GetContext* first_context = first_key.get_context;
+ CreateRowCacheKeyPrefix(options, fd, first_key.ikey, first_context,
+ row_cache_key);
+ row_cache_key_prefix_size = row_cache_key.Size();
+
+ for (auto miter = table_range.begin(); miter != table_range.end();
+ ++miter) {
+      const Slice& user_key = miter->ukey;
+ GetContext* get_context = miter->get_context;
+
+ if (GetFromRowCache(user_key, row_cache_key, row_cache_key_prefix_size,
+ get_context)) {
+ table_range.SkipKey(miter);
+ } else {
+ row_cache_entries.emplace_back();
+ get_context->SetReplayLog(&(row_cache_entries.back()));
+ }
+ }
+ }
+#endif // ROCKSDB_LITE
+
+  // Check that table_range is not empty. It's possible that all keys were
+  // found in the row cache, and thus the range may now be empty.
+ if (s.ok() && !table_range.empty()) {
+ if (t == nullptr) {
+ s = FindTable(
+ file_options_, internal_comparator, fd, &handle, prefix_extractor,
+ options.read_tier == kBlockCacheTier /* no_io */,
+ true /* record_read_stats */, file_read_hist, skip_filters, level);
+ TEST_SYNC_POINT_CALLBACK("TableCache::MultiGet:FindTable", &s);
+ if (s.ok()) {
+ t = GetTableReaderFromHandle(handle);
+ assert(t);
+ }
+ }
+ if (s.ok() && !options.ignore_range_deletions) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ t->NewRangeTombstoneIterator(options));
+ if (range_del_iter != nullptr) {
+ for (auto iter = table_range.begin(); iter != table_range.end();
+ ++iter) {
+ SequenceNumber* max_covering_tombstone_seq =
+ iter->get_context->max_covering_tombstone_seq();
+ *max_covering_tombstone_seq =
+ std::max(*max_covering_tombstone_seq,
+ range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey));
+ }
+ }
+ }
+ if (s.ok()) {
+ t->MultiGet(options, &table_range, prefix_extractor, skip_filters);
+ } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) {
+ for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) {
+ Status* status = iter->s;
+ if (status->IsIncomplete()) {
+ // Couldn't find Table in cache but treat as kFound if no_io set
+ iter->get_context->MarkKeyMayExist();
+ s = Status::OK();
+ }
+ }
+ }
+ }
+
+#ifndef ROCKSDB_LITE
+ if (lookup_row_cache) {
+ size_t row_idx = 0;
+
+ for (auto miter = table_range.begin(); miter != table_range.end();
+ ++miter) {
+ std::string& row_cache_entry = row_cache_entries[row_idx++];
+      const Slice& user_key = miter->ukey;
+ GetContext* get_context = miter->get_context;
+
+ get_context->SetReplayLog(nullptr);
+ // Compute row cache key.
+ row_cache_key.TrimAppend(row_cache_key_prefix_size, user_key.data(),
+ user_key.size());
+ // Put the replay log in row cache only if something was found.
+ if (s.ok() && !row_cache_entry.empty()) {
+ size_t charge =
+ row_cache_key.Size() + row_cache_entry.size() + sizeof(std::string);
+ void* row_ptr = new std::string(std::move(row_cache_entry));
+ ioptions_.row_cache->Insert(row_cache_key.GetUserKey(), row_ptr, charge,
+ &DeleteEntry<std::string>);
+ }
+ }
+ }
+#endif // ROCKSDB_LITE
+
+ if (handle != nullptr) {
+ ReleaseHandle(handle);
+ }
+ return s;
+}
+
+Status TableCache::GetTableProperties(
+ const FileOptions& file_options,
+ const InternalKeyComparator& internal_comparator, const FileDescriptor& fd,
+ std::shared_ptr<const TableProperties>* properties,
+ const SliceTransform* prefix_extractor, bool no_io) {
+ Status s;
+ auto table_reader = fd.table_reader;
+  // Has the table reader already been pre-loaded?
+ if (table_reader) {
+ *properties = table_reader->GetTableProperties();
+
+ return s;
+ }
+
+ Cache::Handle* table_handle = nullptr;
+ s = FindTable(file_options, internal_comparator, fd, &table_handle,
+ prefix_extractor, no_io);
+ if (!s.ok()) {
+ return s;
+ }
+ assert(table_handle);
+ auto table = GetTableReaderFromHandle(table_handle);
+ *properties = table->GetTableProperties();
+ ReleaseHandle(table_handle);
+ return s;
+}
+
+size_t TableCache::GetMemoryUsageByTableReader(
+ const FileOptions& file_options,
+ const InternalKeyComparator& internal_comparator, const FileDescriptor& fd,
+ const SliceTransform* prefix_extractor) {
+ Status s;
+ auto table_reader = fd.table_reader;
+  // Has the table reader already been pre-loaded?
+ if (table_reader) {
+ return table_reader->ApproximateMemoryUsage();
+ }
+
+ Cache::Handle* table_handle = nullptr;
+ s = FindTable(file_options, internal_comparator, fd, &table_handle,
+ prefix_extractor, true);
+ if (!s.ok()) {
+ return 0;
+ }
+ assert(table_handle);
+ auto table = GetTableReaderFromHandle(table_handle);
+ auto ret = table->ApproximateMemoryUsage();
+ ReleaseHandle(table_handle);
+ return ret;
+}
+
+void TableCache::Evict(Cache* cache, uint64_t file_number) {
+ cache->Erase(GetSliceForFileNumber(&file_number));
+}
+
+uint64_t TableCache::ApproximateOffsetOf(
+ const Slice& key, const FileDescriptor& fd, TableReaderCaller caller,
+ const InternalKeyComparator& internal_comparator,
+ const SliceTransform* prefix_extractor) {
+ uint64_t result = 0;
+ TableReader* table_reader = fd.table_reader;
+ Cache::Handle* table_handle = nullptr;
+ if (table_reader == nullptr) {
+ const bool for_compaction = (caller == TableReaderCaller::kCompaction);
+ Status s = FindTable(file_options_, internal_comparator, fd, &table_handle,
+ prefix_extractor, false /* no_io */,
+ !for_compaction /* record_read_stats */);
+ if (s.ok()) {
+ table_reader = GetTableReaderFromHandle(table_handle);
+ }
+ }
+
+ if (table_reader != nullptr) {
+ result = table_reader->ApproximateOffsetOf(key, caller);
+ }
+ if (table_handle != nullptr) {
+ ReleaseHandle(table_handle);
+ }
+
+ return result;
+}
+
+uint64_t TableCache::ApproximateSize(
+ const Slice& start, const Slice& end, const FileDescriptor& fd,
+ TableReaderCaller caller, const InternalKeyComparator& internal_comparator,
+ const SliceTransform* prefix_extractor) {
+ uint64_t result = 0;
+ TableReader* table_reader = fd.table_reader;
+ Cache::Handle* table_handle = nullptr;
+ if (table_reader == nullptr) {
+ const bool for_compaction = (caller == TableReaderCaller::kCompaction);
+ Status s = FindTable(file_options_, internal_comparator, fd, &table_handle,
+ prefix_extractor, false /* no_io */,
+ !for_compaction /* record_read_stats */);
+ if (s.ok()) {
+ table_reader = GetTableReaderFromHandle(table_handle);
+ }
+ }
+
+ if (table_reader != nullptr) {
+ result = table_reader->ApproximateSize(start, end, caller);
+ }
+ if (table_handle != nullptr) {
+ ReleaseHandle(table_handle);
+ }
+
+ return result;
+}
+} // namespace ROCKSDB_NAMESPACE
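The row cache branches in Get() and MultiGet() above are only taken when the DB is configured with a row cache. A usage sketch against the public API, with an illustrative path and sizes, showing how the ROW_CACHE_HIT / ROW_CACHE_MISS tickers recorded above can be observed:

#include <cassert>
#include <cstdint>
#include <string>
#include "rocksdb/cache.h"
#include "rocksdb/db.h"
#include "rocksdb/statistics.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.row_cache = rocksdb::NewLRUCache(64 << 20);   // 64 MB row cache
  options.statistics = rocksdb::CreateDBStatistics();   // to observe hits

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/rowcache_demo", &db);
  assert(s.ok());

  s = db->Put(rocksdb::WriteOptions(), "key", "value");
  assert(s.ok());
  s = db->Flush(rocksdb::FlushOptions());               // key now lives in an SST
  assert(s.ok());

  std::string value;
  db->Get(rocksdb::ReadOptions(), "key", &value);       // miss: populates row cache
  db->Get(rocksdb::ReadOptions(), "key", &value);       // hit: served from row cache

  uint64_t hits = options.statistics->getTickerCount(rocksdb::ROW_CACHE_HIT);
  uint64_t misses = options.statistics->getTickerCount(rocksdb::ROW_CACHE_MISS);
  (void)hits;
  (void)misses;

  delete db;
  return 0;
}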
diff --git a/src/rocksdb/db/table_cache.h b/src/rocksdb/db/table_cache.h
new file mode 100644
index 000000000..b9de824ee
--- /dev/null
+++ b/src/rocksdb/db/table_cache.h
@@ -0,0 +1,226 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread-safe (provides internal synchronization)
+
+#pragma once
+#include <string>
+#include <vector>
+#include <stdint.h>
+
+#include "db/dbformat.h"
+#include "db/range_del_aggregator.h"
+#include "options/cf_options.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/table_reader.h"
+#include "trace_replay/block_cache_tracer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Env;
+class Arena;
+struct FileDescriptor;
+class GetContext;
+class HistogramImpl;
+
+// Manages caching for TableReader objects for a column family. The actual
+// cache is allocated separately and passed to the constructor. TableCache
+// wraps around the underlying SST file readers by providing Get(),
+// MultiGet() and NewIterator() methods that hide the instantiation,
+// caching and access to the TableReader. The main purpose of this is
+// performance - by caching the TableReader, it avoids unnecessary file opens
+// and object allocation and instantiation. One exception is compaction, where
+// a new TableReader may be instantiated - see NewIterator() comments
+//
+// Another service provided by TableCache is managing the row cache - if the
+// DB is configured with a row cache, and the lookup key is present in the row
+// cache, lookup is very fast. The row cache is obtained from
+// ioptions.row_cache
+class TableCache {
+ public:
+ TableCache(const ImmutableCFOptions& ioptions,
+ const FileOptions& storage_options, Cache* cache,
+ BlockCacheTracer* const block_cache_tracer);
+ ~TableCache();
+
+ // Return an iterator for the specified file number (the corresponding
+ // file length must be exactly "file_size" bytes). If "table_reader_ptr"
+ // is non-nullptr, also sets "*table_reader_ptr" to point to the Table object
+ // underlying the returned iterator, or nullptr if no Table object underlies
+ // the returned iterator. The returned "*table_reader_ptr" object is owned
+ // by the cache and should not be deleted, and is valid for as long as the
+ // returned iterator is live.
+ // @param range_del_agg If non-nullptr, adds range deletions to the
+ // aggregator. If an error occurs, returns it in a NewErrorInternalIterator
+ // @param for_compaction If true, a new TableReader may be allocated (but
+ // not cached), depending on the CF options
+ // @param skip_filters Disables loading/accessing the filter block
+ // @param level The level this table is at, -1 for "not set / don't know"
+ InternalIterator* NewIterator(
+ const ReadOptions& options, const FileOptions& toptions,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, RangeDelAggregator* range_del_agg,
+ const SliceTransform* prefix_extractor, TableReader** table_reader_ptr,
+ HistogramImpl* file_read_hist, TableReaderCaller caller, Arena* arena,
+ bool skip_filters, int level, const InternalKey* smallest_compaction_key,
+ const InternalKey* largest_compaction_key);
+
+ // If a seek to internal key "k" in specified file finds an entry,
+ // call get_context->SaveValue() repeatedly until
+ // it returns false. As a side effect, it will insert the TableReader
+ // into the cache and potentially evict another entry
+ // @param get_context Context for get operation. The result of the lookup
+ // can be retrieved by calling get_context->State()
+ // @param file_read_hist If non-nullptr, the file reader statistics are
+ // recorded
+ // @param skip_filters Disables loading/accessing the filter block
+ // @param level The level this table is at, -1 for "not set / don't know"
+ Status Get(const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, const Slice& k,
+ GetContext* get_context,
+ const SliceTransform* prefix_extractor = nullptr,
+ HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
+ int level = -1);
+
+ // Return the range delete tombstone iterator of the file specified by
+ // `file_meta`.
+ Status GetRangeTombstoneIterator(
+ const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter);
+
+ // If a seek to internal key "k" in specified file finds an entry,
+ // call get_context->SaveValue() repeatedly until
+ // it returns false. As a side effect, it will insert the TableReader
+ // into the cache and potentially evict another entry
+ // @param mget_range Pointer to the structure describing a batch of keys to
+ // be looked up in this table file. The result is stored
+ // in the embedded GetContext
+ // @param skip_filters Disables loading/accessing the filter block
+ // @param level The level this table is at, -1 for "not set / don't know"
+ Status MultiGet(const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ const MultiGetContext::Range* mget_range,
+ const SliceTransform* prefix_extractor = nullptr,
+ HistogramImpl* file_read_hist = nullptr,
+ bool skip_filters = false, int level = -1);
+
+ // Evict any entry for the specified file number
+ static void Evict(Cache* cache, uint64_t file_number);
+
+  // Clean the table handle and erase it from the table cache.
+  // Used in DB close, or when the file is no longer live.
+ void EraseHandle(const FileDescriptor& fd, Cache::Handle* handle);
+
+ // Find table reader
+ // @param skip_filters Disables loading/accessing the filter block
+ // @param level == -1 means not specified
+ Status FindTable(const FileOptions& toptions,
+ const InternalKeyComparator& internal_comparator,
+ const FileDescriptor& file_fd, Cache::Handle**,
+ const SliceTransform* prefix_extractor = nullptr,
+ const bool no_io = false, bool record_read_stats = true,
+ HistogramImpl* file_read_hist = nullptr,
+ bool skip_filters = false, int level = -1,
+ bool prefetch_index_and_filter_in_cache = true);
+
+ // Get TableReader from a cache handle.
+ TableReader* GetTableReaderFromHandle(Cache::Handle* handle);
+
+ // Get the table properties of a given table.
+  // @no_io: if true, do not load the table into the cache when it is not
+  // present in the table cache yet.
+ // @returns: `properties` will be reset on success. Please note that we will
+ // return Status::Incomplete() if table is not present in cache and
+ // we set `no_io` to be true.
+ Status GetTableProperties(const FileOptions& toptions,
+ const InternalKeyComparator& internal_comparator,
+ const FileDescriptor& file_meta,
+ std::shared_ptr<const TableProperties>* properties,
+ const SliceTransform* prefix_extractor = nullptr,
+ bool no_io = false);
+
+ // Return total memory usage of the table reader of the file.
+ // 0 if table reader of the file is not loaded.
+ size_t GetMemoryUsageByTableReader(
+ const FileOptions& toptions,
+ const InternalKeyComparator& internal_comparator,
+ const FileDescriptor& fd,
+ const SliceTransform* prefix_extractor = nullptr);
+
+ // Returns approximated offset of a key in a file represented by fd.
+ uint64_t ApproximateOffsetOf(
+ const Slice& key, const FileDescriptor& fd, TableReaderCaller caller,
+ const InternalKeyComparator& internal_comparator,
+ const SliceTransform* prefix_extractor = nullptr);
+
+ // Returns approximated data size between start and end keys in a file
+ // represented by fd (the start key must not be greater than the end key).
+ uint64_t ApproximateSize(const Slice& start, const Slice& end,
+ const FileDescriptor& fd, TableReaderCaller caller,
+ const InternalKeyComparator& internal_comparator,
+ const SliceTransform* prefix_extractor = nullptr);
+
+ // Release the handle from a cache
+ void ReleaseHandle(Cache::Handle* handle);
+
+ Cache* get_cache() const { return cache_; }
+
+  // Capacity of the backing Cache that indicates infinite TableCache capacity.
+ // For example when max_open_files is -1 we set the backing Cache to this.
+ static const int kInfiniteCapacity = 0x400000;
+
+ // The tables opened with this TableCache will be immortal, i.e., their
+ // lifetime is as long as that of the DB.
+ void SetTablesAreImmortal() {
+ if (cache_->GetCapacity() >= kInfiniteCapacity) {
+ immortal_tables_ = true;
+ }
+ }
+
+ private:
+ // Build a table reader
+ Status GetTableReader(const FileOptions& file_options,
+ const InternalKeyComparator& internal_comparator,
+ const FileDescriptor& fd, bool sequential_mode,
+ bool record_read_stats, HistogramImpl* file_read_hist,
+ std::unique_ptr<TableReader>* table_reader,
+ const SliceTransform* prefix_extractor = nullptr,
+ bool skip_filters = false, int level = -1,
+ bool prefetch_index_and_filter_in_cache = true);
+
+ // Create a key prefix for looking up the row cache. The prefix is of the
+ // format row_cache_id + fd_number + seq_no. Later, the user key can be
+ // appended to form the full key
+ void CreateRowCacheKeyPrefix(const ReadOptions& options,
+ const FileDescriptor& fd,
+ const Slice& internal_key,
+ GetContext* get_context, IterKey& row_cache_key);
+
+  // Helper function to look up the row cache for a key. It appends the
+  // user key to row_cache_key at offset prefix_size.
+ bool GetFromRowCache(const Slice& user_key, IterKey& row_cache_key,
+ size_t prefix_size, GetContext* get_context);
+
+ const ImmutableCFOptions& ioptions_;
+ const FileOptions& file_options_;
+ Cache* const cache_;
+ std::string row_cache_id_;
+ bool immortal_tables_;
+ BlockCacheTracer* const block_cache_tracer_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
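kInfiniteCapacity and SetTablesAreImmortal() correspond to the user-facing max_open_files option: with max_open_files = -1, the backing cache is effectively unbounded and table readers live as long as the DB. A small sketch of the two configurations (the numeric limit is illustrative only):

#include "rocksdb/options.h"

rocksdb::Options MakeTableCacheOptions(bool cache_all_table_readers) {
  rocksdb::Options options;
  if (cache_all_table_readers) {
    // Backing cache gets kInfiniteCapacity and table readers become
    // "immortal": opened once and kept for the lifetime of the DB.
    options.max_open_files = -1;
  } else {
    // Table readers are opened on demand and evicted LRU-style once the
    // limit is approached (a few descriptors are reserved for other uses).
    options.max_open_files = 500;
  }
  return options;
}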
diff --git a/src/rocksdb/db/table_properties_collector.cc b/src/rocksdb/db/table_properties_collector.cc
new file mode 100644
index 000000000..d98ff5e9b
--- /dev/null
+++ b/src/rocksdb/db/table_properties_collector.cc
@@ -0,0 +1,74 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/table_properties_collector.h"
+
+#include "db/dbformat.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+uint64_t GetUint64Property(const UserCollectedProperties& props,
+ const std::string& property_name,
+ bool* property_present) {
+ auto pos = props.find(property_name);
+ if (pos == props.end()) {
+ *property_present = false;
+ return 0;
+ }
+ Slice raw = pos->second;
+ uint64_t val = 0;
+ *property_present = true;
+ return GetVarint64(&raw, &val) ? val : 0;
+}
+
+} // namespace
+
+Status UserKeyTablePropertiesCollector::InternalAdd(const Slice& key,
+ const Slice& value,
+ uint64_t file_size) {
+ ParsedInternalKey ikey;
+ if (!ParseInternalKey(key, &ikey)) {
+ return Status::InvalidArgument("Invalid internal key");
+ }
+
+ return collector_->AddUserKey(ikey.user_key, value, GetEntryType(ikey.type),
+ ikey.sequence, file_size);
+}
+
+void UserKeyTablePropertiesCollector::BlockAdd(
+ uint64_t bLockRawBytes, uint64_t blockCompressedBytesFast,
+ uint64_t blockCompressedBytesSlow) {
+ return collector_->BlockAdd(bLockRawBytes, blockCompressedBytesFast,
+ blockCompressedBytesSlow);
+}
+
+Status UserKeyTablePropertiesCollector::Finish(
+ UserCollectedProperties* properties) {
+ return collector_->Finish(properties);
+}
+
+UserCollectedProperties
+UserKeyTablePropertiesCollector::GetReadableProperties() const {
+ return collector_->GetReadableProperties();
+}
+
+uint64_t GetDeletedKeys(
+ const UserCollectedProperties& props) {
+ bool property_present_ignored;
+ return GetUint64Property(props, TablePropertiesNames::kDeletedKeys,
+ &property_present_ignored);
+}
+
+uint64_t GetMergeOperands(const UserCollectedProperties& props,
+ bool* property_present) {
+ return GetUint64Property(
+ props, TablePropertiesNames::kMergeOperands, property_present);
+}
+
+} // namespace ROCKSDB_NAMESPACE
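GetDeletedKeys() and GetMergeOperands() above read varint-encoded counters out of the user-collected properties. A sketch of consuming them through the public API, assuming an already opened rocksdb::DB* and eliding error handling:

#include <cstdint>
#include "rocksdb/db.h"
#include "rocksdb/table_properties.h"

// Sum the deleted-key and merge-operand counters across all live SST files.
void DumpDeletionStats(rocksdb::DB* db) {
  rocksdb::TablePropertiesCollection files;
  rocksdb::Status s = db->GetPropertiesOfAllTables(&files);
  if (!s.ok()) {
    return;
  }
  uint64_t total_deleted = 0;
  uint64_t total_merges = 0;
  for (const auto& file : files) {
    const auto& user_props = file.second->user_collected_properties;
    total_deleted += rocksdb::GetDeletedKeys(user_props);
    bool has_merge_count = false;
    uint64_t merges = rocksdb::GetMergeOperands(user_props, &has_merge_count);
    if (has_merge_count) {
      total_merges += merges;
    }
  }
  (void)total_deleted;
  (void)total_merges;
}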
diff --git a/src/rocksdb/db/table_properties_collector.h b/src/rocksdb/db/table_properties_collector.h
new file mode 100644
index 000000000..130eb64d4
--- /dev/null
+++ b/src/rocksdb/db/table_properties_collector.h
@@ -0,0 +1,107 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file defines a collection of statistics collectors.
+#pragma once
+
+#include "rocksdb/table_properties.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace ROCKSDB_NAMESPACE {
+
+// Base class for internal table properties collector.
+class IntTblPropCollector {
+ public:
+ virtual ~IntTblPropCollector() {}
+ virtual Status Finish(UserCollectedProperties* properties) = 0;
+
+ virtual const char* Name() const = 0;
+
+  // @param key the user key that is inserted into the table.
+  // @param value the value that is inserted into the table.
+ virtual Status InternalAdd(const Slice& key, const Slice& value,
+ uint64_t file_size) = 0;
+
+ virtual void BlockAdd(uint64_t blockRawBytes,
+ uint64_t blockCompressedBytesFast,
+ uint64_t blockCompressedBytesSlow) = 0;
+
+ virtual UserCollectedProperties GetReadableProperties() const = 0;
+
+ virtual bool NeedCompact() const { return false; }
+};
+
+// Factory for internal table properties collector.
+class IntTblPropCollectorFactory {
+ public:
+ virtual ~IntTblPropCollectorFactory() {}
+ // has to be thread-safe
+ virtual IntTblPropCollector* CreateIntTblPropCollector(
+ uint32_t column_family_id) = 0;
+
+ // The name of the properties collector can be used for debugging purpose.
+ virtual const char* Name() const = 0;
+};
+
+// When rocksdb creates a new table, it will encode all "user keys" into
+// "internal keys", which contains meta information of a given entry.
+//
+// This class extracts user key from the encoded internal key when Add() is
+// invoked.
+class UserKeyTablePropertiesCollector : public IntTblPropCollector {
+ public:
+ // transfer of ownership
+ explicit UserKeyTablePropertiesCollector(TablePropertiesCollector* collector)
+ : collector_(collector) {}
+
+ virtual ~UserKeyTablePropertiesCollector() {}
+
+ virtual Status InternalAdd(const Slice& key, const Slice& value,
+ uint64_t file_size) override;
+
+ virtual void BlockAdd(uint64_t blockRawBytes,
+ uint64_t blockCompressedBytesFast,
+ uint64_t blockCompressedBytesSlow) override;
+
+ virtual Status Finish(UserCollectedProperties* properties) override;
+
+ virtual const char* Name() const override { return collector_->Name(); }
+
+ UserCollectedProperties GetReadableProperties() const override;
+
+ virtual bool NeedCompact() const override {
+ return collector_->NeedCompact();
+ }
+
+ protected:
+ std::unique_ptr<TablePropertiesCollector> collector_;
+};
+
+class UserKeyTablePropertiesCollectorFactory
+ : public IntTblPropCollectorFactory {
+ public:
+ explicit UserKeyTablePropertiesCollectorFactory(
+ std::shared_ptr<TablePropertiesCollectorFactory> user_collector_factory)
+ : user_collector_factory_(user_collector_factory) {}
+ virtual IntTblPropCollector* CreateIntTblPropCollector(
+ uint32_t column_family_id) override {
+ TablePropertiesCollectorFactory::Context context;
+ context.column_family_id = column_family_id;
+ return new UserKeyTablePropertiesCollector(
+ user_collector_factory_->CreateTablePropertiesCollector(context));
+ }
+
+ virtual const char* Name() const override {
+ return user_collector_factory_->Name();
+ }
+
+ private:
+ std::shared_ptr<TablePropertiesCollectorFactory> user_collector_factory_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
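UserKeyTablePropertiesCollector is the adapter that lets a user-supplied TablePropertiesCollector observe plain user keys. A minimal user-side collector and factory, as a sketch only; the property name "put.count" and both class names are invented for illustration:

#include <cstdint>
#include <memory>
#include <string>
#include "rocksdb/options.h"
#include "rocksdb/table_properties.h"
#include "util/coding.h"  // PutVarint32 (internal helper, used for the encoding)

// Counts Put entries and publishes the count as a custom table property.
class PutCountCollector : public rocksdb::TablePropertiesCollector {
 public:
  const char* Name() const override { return "PutCountCollector"; }

  rocksdb::Status AddUserKey(const rocksdb::Slice& /*key*/,
                             const rocksdb::Slice& /*value*/,
                             rocksdb::EntryType type,
                             rocksdb::SequenceNumber /*seq*/,
                             uint64_t /*file_size*/) override {
    if (type == rocksdb::kEntryPut) {
      ++num_puts_;
    }
    return rocksdb::Status::OK();
  }

  rocksdb::Status Finish(rocksdb::UserCollectedProperties* props) override {
    std::string encoded;
    rocksdb::PutVarint32(&encoded, num_puts_);
    props->insert({"put.count", encoded});
    return rocksdb::Status::OK();
  }

  rocksdb::UserCollectedProperties GetReadableProperties() const override {
    return {{"put.count", std::to_string(num_puts_)}};
  }

 private:
  uint32_t num_puts_ = 0;
};

class PutCountCollectorFactory : public rocksdb::TablePropertiesCollectorFactory {
 public:
  rocksdb::TablePropertiesCollector* CreateTablePropertiesCollector(
      rocksdb::TablePropertiesCollectorFactory::Context /*context*/) override {
    return new PutCountCollector();
  }
  const char* Name() const override { return "PutCountCollectorFactory"; }
};

// Registration sketch:
//   options.table_properties_collector_factories.emplace_back(
//       std::make_shared<PutCountCollectorFactory>());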
diff --git a/src/rocksdb/db/table_properties_collector_test.cc b/src/rocksdb/db/table_properties_collector_test.cc
new file mode 100644
index 000000000..5c202de81
--- /dev/null
+++ b/src/rocksdb/db/table_properties_collector_test.cc
@@ -0,0 +1,515 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/table_properties_collector.h"
+#include "env/composite_env_wrapper.h"
+#include "file/sequence_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "options/cf_options.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/meta_blocks.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/table_builder.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TablePropertiesTest : public testing::Test,
+ public testing::WithParamInterface<bool> {
+ public:
+ void SetUp() override { backward_mode_ = GetParam(); }
+
+ bool backward_mode_;
+};
+
+// Utilities test functions
+namespace {
+static const uint32_t kTestColumnFamilyId = 66;
+static const std::string kTestColumnFamilyName = "test_column_fam";
+
+void MakeBuilder(const Options& options, const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& moptions,
+ const InternalKeyComparator& internal_comparator,
+ const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+ int_tbl_prop_collector_factories,
+ std::unique_ptr<WritableFileWriter>* writable,
+ std::unique_ptr<TableBuilder>* builder) {
+ std::unique_ptr<WritableFile> wf(new test::StringSink);
+ writable->reset(
+ new WritableFileWriter(NewLegacyWritableFileWrapper(std::move(wf)),
+ "" /* don't care */, EnvOptions()));
+ int unknown_level = -1;
+ builder->reset(NewTableBuilder(
+ ioptions, moptions, internal_comparator, int_tbl_prop_collector_factories,
+ kTestColumnFamilyId, kTestColumnFamilyName, writable->get(),
+ options.compression, options.sample_for_compression,
+ options.compression_opts, unknown_level));
+}
+} // namespace
+
+// Collects keys that start with "A" in a table.
+class RegularKeysStartWithA: public TablePropertiesCollector {
+ public:
+ const char* Name() const override { return "RegularKeysStartWithA"; }
+
+ Status Finish(UserCollectedProperties* properties) override {
+ std::string encoded;
+ std::string encoded_num_puts;
+ std::string encoded_num_deletes;
+ std::string encoded_num_single_deletes;
+ std::string encoded_num_size_changes;
+ PutVarint32(&encoded, count_);
+ PutVarint32(&encoded_num_puts, num_puts_);
+ PutVarint32(&encoded_num_deletes, num_deletes_);
+ PutVarint32(&encoded_num_single_deletes, num_single_deletes_);
+ PutVarint32(&encoded_num_size_changes, num_size_changes_);
+ *properties = UserCollectedProperties{
+ {"TablePropertiesTest", message_},
+ {"Count", encoded},
+ {"NumPuts", encoded_num_puts},
+ {"NumDeletes", encoded_num_deletes},
+ {"NumSingleDeletes", encoded_num_single_deletes},
+ {"NumSizeChanges", encoded_num_size_changes},
+ };
+ return Status::OK();
+ }
+
+ Status AddUserKey(const Slice& user_key, const Slice& /*value*/,
+ EntryType type, SequenceNumber /*seq*/,
+ uint64_t file_size) override {
+    // Simply assume all user keys are non-empty.
+ if (user_key.data()[0] == 'A') {
+ ++count_;
+ }
+ if (type == kEntryPut) {
+ num_puts_++;
+ } else if (type == kEntryDelete) {
+ num_deletes_++;
+ } else if (type == kEntrySingleDelete) {
+ num_single_deletes_++;
+ }
+ if (file_size < file_size_) {
+ message_ = "File size should not decrease.";
+ } else if (file_size != file_size_) {
+ num_size_changes_++;
+ }
+
+ return Status::OK();
+ }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties{};
+ }
+
+ private:
+ std::string message_ = "Rocksdb";
+ uint32_t count_ = 0;
+ uint32_t num_puts_ = 0;
+ uint32_t num_deletes_ = 0;
+ uint32_t num_single_deletes_ = 0;
+ uint32_t num_size_changes_ = 0;
+ uint64_t file_size_ = 0;
+};
+
+// Collects keys that start with "A" in a table. Backward-compatible mode.
+// It is also used to test internal key table property collector
+class RegularKeysStartWithABackwardCompatible
+ : public TablePropertiesCollector {
+ public:
+ const char* Name() const override { return "RegularKeysStartWithA"; }
+
+ Status Finish(UserCollectedProperties* properties) override {
+ std::string encoded;
+ PutVarint32(&encoded, count_);
+ *properties = UserCollectedProperties{{"TablePropertiesTest", "Rocksdb"},
+ {"Count", encoded}};
+ return Status::OK();
+ }
+
+ Status Add(const Slice& user_key, const Slice& /*value*/) override {
+    // Simply assume all user keys are non-empty.
+ if (user_key.data()[0] == 'A') {
+ ++count_;
+ }
+ return Status::OK();
+ }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties{};
+ }
+
+ private:
+ uint32_t count_ = 0;
+};
+
+class RegularKeysStartWithAInternal : public IntTblPropCollector {
+ public:
+ const char* Name() const override { return "RegularKeysStartWithA"; }
+
+ Status Finish(UserCollectedProperties* properties) override {
+ std::string encoded;
+ PutVarint32(&encoded, count_);
+ *properties = UserCollectedProperties{{"TablePropertiesTest", "Rocksdb"},
+ {"Count", encoded}};
+ return Status::OK();
+ }
+
+ Status InternalAdd(const Slice& user_key, const Slice& /*value*/,
+ uint64_t /*file_size*/) override {
+    // Simply assume all user keys are non-empty.
+ if (user_key.data()[0] == 'A') {
+ ++count_;
+ }
+ return Status::OK();
+ }
+
+ void BlockAdd(uint64_t /* blockRawBytes */,
+ uint64_t /* blockCompressedBytesFast */,
+ uint64_t /* blockCompressedBytesSlow */) override {
+ // Nothing to do.
+ return;
+ }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties{};
+ }
+
+ private:
+ uint32_t count_ = 0;
+};
+
+class RegularKeysStartWithAFactory : public IntTblPropCollectorFactory,
+ public TablePropertiesCollectorFactory {
+ public:
+ explicit RegularKeysStartWithAFactory(bool backward_mode)
+ : backward_mode_(backward_mode) {}
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context context) override {
+ EXPECT_EQ(kTestColumnFamilyId, context.column_family_id);
+ if (!backward_mode_) {
+ return new RegularKeysStartWithA();
+ } else {
+ return new RegularKeysStartWithABackwardCompatible();
+ }
+ }
+ IntTblPropCollector* CreateIntTblPropCollector(
+ uint32_t /*column_family_id*/) override {
+ return new RegularKeysStartWithAInternal();
+ }
+ const char* Name() const override { return "RegularKeysStartWithA"; }
+
+ bool backward_mode_;
+};
+
+class FlushBlockEveryThreePolicy : public FlushBlockPolicy {
+ public:
+ bool Update(const Slice& /*key*/, const Slice& /*value*/) override {
+ return (++count_ % 3U == 0);
+ }
+
+ private:
+ uint64_t count_ = 0;
+};
+
+class FlushBlockEveryThreePolicyFactory : public FlushBlockPolicyFactory {
+ public:
+ explicit FlushBlockEveryThreePolicyFactory() {}
+
+ const char* Name() const override {
+ return "FlushBlockEveryThreePolicyFactory";
+ }
+
+ FlushBlockPolicy* NewFlushBlockPolicy(
+ const BlockBasedTableOptions& /*table_options*/,
+ const BlockBuilder& /*data_block_builder*/) const override {
+ return new FlushBlockEveryThreePolicy;
+ }
+};
+
+extern const uint64_t kBlockBasedTableMagicNumber;
+extern const uint64_t kPlainTableMagicNumber;
+namespace {
+void TestCustomizedTablePropertiesCollector(
+ bool backward_mode, uint64_t magic_number, bool test_int_tbl_prop_collector,
+ const Options& options, const InternalKeyComparator& internal_comparator) {
+  // Make sure the entries will be inserted in order.
+ std::map<std::pair<std::string, ValueType>, std::string> kvs = {
+ {{"About ", kTypeValue}, "val5"}, // starts with 'A'
+ {{"Abstract", kTypeValue}, "val2"}, // starts with 'A'
+ {{"Around ", kTypeValue}, "val7"}, // starts with 'A'
+ {{"Beyond ", kTypeValue}, "val3"},
+ {{"Builder ", kTypeValue}, "val1"},
+ {{"Love ", kTypeDeletion}, ""},
+ {{"Cancel ", kTypeValue}, "val4"},
+ {{"Find ", kTypeValue}, "val6"},
+ {{"Rocks ", kTypeDeletion}, ""},
+ {{"Foo ", kTypeSingleDeletion}, ""},
+ };
+
+ // -- Step 1: build table
+ std::unique_ptr<TableBuilder> builder;
+ std::unique_ptr<WritableFileWriter> writer;
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+ std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
+ int_tbl_prop_collector_factories;
+ if (test_int_tbl_prop_collector) {
+ int_tbl_prop_collector_factories.emplace_back(
+ new RegularKeysStartWithAFactory(backward_mode));
+ } else {
+ GetIntTblPropCollectorFactory(ioptions, &int_tbl_prop_collector_factories);
+ }
+ MakeBuilder(options, ioptions, moptions, internal_comparator,
+ &int_tbl_prop_collector_factories, &writer, &builder);
+
+ SequenceNumber seqNum = 0U;
+ for (const auto& kv : kvs) {
+ InternalKey ikey(kv.first.first, seqNum++, kv.first.second);
+ builder->Add(ikey.Encode(), kv.second);
+ }
+ ASSERT_OK(builder->Finish());
+ writer->Flush();
+
+ // -- Step 2: Read properties
+ LegacyWritableFileWrapper* file =
+ static_cast<LegacyWritableFileWrapper*>(writer->writable_file());
+ test::StringSink* fwf = static_cast<test::StringSink*>(file->target());
+ std::unique_ptr<RandomAccessFileReader> fake_file_reader(
+ test::GetRandomAccessFileReader(
+ new test::StringSource(fwf->contents())));
+ TableProperties* props;
+ Status s = ReadTableProperties(fake_file_reader.get(), fwf->contents().size(),
+ magic_number, ioptions, &props,
+ true /* compression_type_missing */);
+ std::unique_ptr<TableProperties> props_guard(props);
+ ASSERT_OK(s);
+
+ auto user_collected = props->user_collected_properties;
+
+ ASSERT_NE(user_collected.find("TablePropertiesTest"), user_collected.end());
+ ASSERT_EQ("Rocksdb", user_collected.at("TablePropertiesTest"));
+
+ uint32_t starts_with_A = 0;
+ ASSERT_NE(user_collected.find("Count"), user_collected.end());
+ Slice key(user_collected.at("Count"));
+ ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
+ ASSERT_EQ(3u, starts_with_A);
+
+ if (!backward_mode && !test_int_tbl_prop_collector) {
+ uint32_t num_puts;
+ ASSERT_NE(user_collected.find("NumPuts"), user_collected.end());
+ Slice key_puts(user_collected.at("NumPuts"));
+ ASSERT_TRUE(GetVarint32(&key_puts, &num_puts));
+ ASSERT_EQ(7u, num_puts);
+
+ uint32_t num_deletes;
+ ASSERT_NE(user_collected.find("NumDeletes"), user_collected.end());
+ Slice key_deletes(user_collected.at("NumDeletes"));
+ ASSERT_TRUE(GetVarint32(&key_deletes, &num_deletes));
+ ASSERT_EQ(2u, num_deletes);
+
+ uint32_t num_single_deletes;
+ ASSERT_NE(user_collected.find("NumSingleDeletes"), user_collected.end());
+ Slice key_single_deletes(user_collected.at("NumSingleDeletes"));
+ ASSERT_TRUE(GetVarint32(&key_single_deletes, &num_single_deletes));
+ ASSERT_EQ(1u, num_single_deletes);
+
+ uint32_t num_size_changes;
+ ASSERT_NE(user_collected.find("NumSizeChanges"), user_collected.end());
+ Slice key_size_changes(user_collected.at("NumSizeChanges"));
+ ASSERT_TRUE(GetVarint32(&key_size_changes, &num_size_changes));
+ ASSERT_GE(num_size_changes, 2u);
+ }
+}
+} // namespace
+
+TEST_P(TablePropertiesTest, CustomizedTablePropertiesCollector) {
+ // Test properties collectors with internal keys or regular keys
+ // for block based table
+ for (bool encode_as_internal : { true, false }) {
+ Options options;
+ BlockBasedTableOptions table_options;
+ table_options.flush_block_policy_factory =
+ std::make_shared<FlushBlockEveryThreePolicyFactory>();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ test::PlainInternalKeyComparator ikc(options.comparator);
+ std::shared_ptr<TablePropertiesCollectorFactory> collector_factory(
+ new RegularKeysStartWithAFactory(backward_mode_));
+ options.table_properties_collector_factories.resize(1);
+ options.table_properties_collector_factories[0] = collector_factory;
+
+ TestCustomizedTablePropertiesCollector(backward_mode_,
+ kBlockBasedTableMagicNumber,
+ encode_as_internal, options, ikc);
+
+#ifndef ROCKSDB_LITE // PlainTable is not supported in Lite
+ // test plain table
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 8;
+ plain_table_options.bloom_bits_per_key = 8;
+ plain_table_options.hash_table_ratio = 0;
+
+ options.table_factory =
+ std::make_shared<PlainTableFactory>(plain_table_options);
+ TestCustomizedTablePropertiesCollector(backward_mode_,
+ kPlainTableMagicNumber,
+ encode_as_internal, options, ikc);
+#endif // !ROCKSDB_LITE
+ }
+}
+
+namespace {
+void TestInternalKeyPropertiesCollector(
+ bool backward_mode, uint64_t magic_number, bool sanitized,
+ std::shared_ptr<TableFactory> table_factory) {
+ InternalKey keys[] = {
+ InternalKey("A ", 0, ValueType::kTypeValue),
+ InternalKey("B ", 1, ValueType::kTypeValue),
+ InternalKey("C ", 2, ValueType::kTypeValue),
+ InternalKey("W ", 3, ValueType::kTypeDeletion),
+ InternalKey("X ", 4, ValueType::kTypeDeletion),
+ InternalKey("Y ", 5, ValueType::kTypeDeletion),
+ InternalKey("Z ", 6, ValueType::kTypeDeletion),
+ InternalKey("a ", 7, ValueType::kTypeSingleDeletion),
+ InternalKey("b ", 8, ValueType::kTypeMerge),
+ InternalKey("c ", 9, ValueType::kTypeMerge),
+ };
+
+ std::unique_ptr<TableBuilder> builder;
+ std::unique_ptr<WritableFileWriter> writable;
+ Options options;
+ test::PlainInternalKeyComparator pikc(options.comparator);
+
+ std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
+ int_tbl_prop_collector_factories;
+ options.table_factory = table_factory;
+ if (sanitized) {
+ options.table_properties_collector_factories.emplace_back(
+ new RegularKeysStartWithAFactory(backward_mode));
+    // With sanitization, even a regular properties collector will be able to
+    // handle internal keys.
+ auto comparator = options.comparator;
+ // HACK: Set options.info_log to avoid writing log in
+ // SanitizeOptions().
+ options.info_log = std::make_shared<test::NullLogger>();
+    options = SanitizeOptions("db", // just a placeholder
+ options);
+ ImmutableCFOptions ioptions(options);
+ GetIntTblPropCollectorFactory(ioptions, &int_tbl_prop_collector_factories);
+ options.comparator = comparator;
+ }
+ const ImmutableCFOptions ioptions(options);
+ MutableCFOptions moptions(options);
+
+ for (int iter = 0; iter < 2; ++iter) {
+ MakeBuilder(options, ioptions, moptions, pikc,
+ &int_tbl_prop_collector_factories, &writable, &builder);
+ for (const auto& k : keys) {
+ builder->Add(k.Encode(), "val");
+ }
+
+ ASSERT_OK(builder->Finish());
+ writable->Flush();
+
+ LegacyWritableFileWrapper* file =
+ static_cast<LegacyWritableFileWrapper*>(writable->writable_file());
+ test::StringSink* fwf = static_cast<test::StringSink*>(file->target());
+ std::unique_ptr<RandomAccessFileReader> reader(
+ test::GetRandomAccessFileReader(
+ new test::StringSource(fwf->contents())));
+ TableProperties* props;
+ Status s =
+ ReadTableProperties(reader.get(), fwf->contents().size(), magic_number,
+ ioptions, &props, true /* compression_type_missing */);
+ ASSERT_OK(s);
+
+ std::unique_ptr<TableProperties> props_guard(props);
+ auto user_collected = props->user_collected_properties;
+ uint64_t deleted = GetDeletedKeys(user_collected);
+ ASSERT_EQ(5u, deleted); // deletes + single-deletes
+
+ bool property_present;
+ uint64_t merges = GetMergeOperands(user_collected, &property_present);
+ ASSERT_TRUE(property_present);
+ ASSERT_EQ(2u, merges);
+
+ if (sanitized) {
+ uint32_t starts_with_A = 0;
+ ASSERT_NE(user_collected.find("Count"), user_collected.end());
+ Slice key(user_collected.at("Count"));
+ ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
+ ASSERT_EQ(1u, starts_with_A);
+
+ if (!backward_mode) {
+ uint32_t num_puts;
+ ASSERT_NE(user_collected.find("NumPuts"), user_collected.end());
+ Slice key_puts(user_collected.at("NumPuts"));
+ ASSERT_TRUE(GetVarint32(&key_puts, &num_puts));
+ ASSERT_EQ(3u, num_puts);
+
+ uint32_t num_deletes;
+ ASSERT_NE(user_collected.find("NumDeletes"), user_collected.end());
+ Slice key_deletes(user_collected.at("NumDeletes"));
+ ASSERT_TRUE(GetVarint32(&key_deletes, &num_deletes));
+ ASSERT_EQ(4u, num_deletes);
+
+ uint32_t num_single_deletes;
+ ASSERT_NE(user_collected.find("NumSingleDeletes"),
+ user_collected.end());
+ Slice key_single_deletes(user_collected.at("NumSingleDeletes"));
+ ASSERT_TRUE(GetVarint32(&key_single_deletes, &num_single_deletes));
+ ASSERT_EQ(1u, num_single_deletes);
+ }
+ }
+ }
+}
+} // namespace
+
+TEST_P(TablePropertiesTest, InternalKeyPropertiesCollector) {
+ TestInternalKeyPropertiesCollector(
+ backward_mode_, kBlockBasedTableMagicNumber, true /* sanitize */,
+ std::make_shared<BlockBasedTableFactory>());
+ if (backward_mode_) {
+ TestInternalKeyPropertiesCollector(
+ backward_mode_, kBlockBasedTableMagicNumber, false /* not sanitize */,
+ std::make_shared<BlockBasedTableFactory>());
+ }
+
+#ifndef ROCKSDB_LITE // PlainTable is not supported in Lite
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 8;
+ plain_table_options.bloom_bits_per_key = 8;
+ plain_table_options.hash_table_ratio = 0;
+
+ TestInternalKeyPropertiesCollector(
+ backward_mode_, kPlainTableMagicNumber, false /* not sanitize */,
+ std::make_shared<PlainTableFactory>(plain_table_options));
+#endif // !ROCKSDB_LITE
+}
+
+INSTANTIATE_TEST_CASE_P(InternalKeyPropertiesCollector, TablePropertiesTest,
+ ::testing::Bool());
+
+INSTANTIATE_TEST_CASE_P(CustomizedTablePropertiesCollector, TablePropertiesTest,
+ ::testing::Bool());
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/transaction_log_impl.cc b/src/rocksdb/db/transaction_log_impl.cc
new file mode 100644
index 000000000..56bc161a3
--- /dev/null
+++ b/src/rocksdb/db/transaction_log_impl.cc
@@ -0,0 +1,315 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/transaction_log_impl.h"
+#include <cinttypes>
+#include "db/write_batch_internal.h"
+#include "file/sequence_file_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+TransactionLogIteratorImpl::TransactionLogIteratorImpl(
+ const std::string& dir, const ImmutableDBOptions* options,
+ const TransactionLogIterator::ReadOptions& read_options,
+ const EnvOptions& soptions, const SequenceNumber seq,
+ std::unique_ptr<VectorLogPtr> files, VersionSet const* const versions,
+ const bool seq_per_batch)
+ : dir_(dir),
+ options_(options),
+ read_options_(read_options),
+ soptions_(soptions),
+ starting_sequence_number_(seq),
+ files_(std::move(files)),
+ started_(false),
+ is_valid_(false),
+ current_file_index_(0),
+ current_batch_seq_(0),
+ current_last_seq_(0),
+ versions_(versions),
+ seq_per_batch_(seq_per_batch) {
+ assert(files_ != nullptr);
+ assert(versions_ != nullptr);
+
+ reporter_.env = options_->env;
+ reporter_.info_log = options_->info_log.get();
+ SeekToStartSequence(); // Seek till starting sequence
+}
+
+Status TransactionLogIteratorImpl::OpenLogFile(
+ const LogFile* log_file,
+ std::unique_ptr<SequentialFileReader>* file_reader) {
+ FileSystem* fs = options_->fs.get();
+ std::unique_ptr<FSSequentialFile> file;
+ std::string fname;
+ Status s;
+ EnvOptions optimized_env_options = fs->OptimizeForLogRead(soptions_);
+ if (log_file->Type() == kArchivedLogFile) {
+ fname = ArchivedLogFileName(dir_, log_file->LogNumber());
+ s = fs->NewSequentialFile(fname, optimized_env_options, &file, nullptr);
+ } else {
+ fname = LogFileName(dir_, log_file->LogNumber());
+ s = fs->NewSequentialFile(fname, optimized_env_options, &file, nullptr);
+ if (!s.ok()) {
+ // If cannot open file in DB directory.
+      // Try the archive dir, as it could have moved in the meantime.
+ fname = ArchivedLogFileName(dir_, log_file->LogNumber());
+ s = fs->NewSequentialFile(fname, optimized_env_options,
+ &file, nullptr);
+ }
+ }
+ if (s.ok()) {
+ file_reader->reset(new SequentialFileReader(std::move(file), fname));
+ }
+ return s;
+}
+
+BatchResult TransactionLogIteratorImpl::GetBatch() {
+  assert(is_valid_); // cannot call in an invalid state.
+ BatchResult result;
+ result.sequence = current_batch_seq_;
+ result.writeBatchPtr = std::move(current_batch_);
+ return result;
+}
+
+Status TransactionLogIteratorImpl::status() { return current_status_; }
+
+bool TransactionLogIteratorImpl::Valid() { return started_ && is_valid_; }
+
+bool TransactionLogIteratorImpl::RestrictedRead(Slice* record) {
+ // Don't read if no more complete entries to read from logs
+ if (current_last_seq_ >= versions_->LastSequence()) {
+ return false;
+ }
+ return current_log_reader_->ReadRecord(record, &scratch_);
+}
+
+void TransactionLogIteratorImpl::SeekToStartSequence(uint64_t start_file_index,
+ bool strict) {
+ Slice record;
+ started_ = false;
+ is_valid_ = false;
+ if (files_->size() <= start_file_index) {
+ return;
+ }
+ Status s =
+ OpenLogReader(files_->at(static_cast<size_t>(start_file_index)).get());
+ if (!s.ok()) {
+ current_status_ = s;
+ reporter_.Info(current_status_.ToString().c_str());
+ return;
+ }
+ while (RestrictedRead(&record)) {
+ if (record.size() < WriteBatchInternal::kHeader) {
+ reporter_.Corruption(
+ record.size(), Status::Corruption("very small log record"));
+ continue;
+ }
+ UpdateCurrentWriteBatch(record);
+ if (current_last_seq_ >= starting_sequence_number_) {
+ if (strict && current_batch_seq_ != starting_sequence_number_) {
+ current_status_ = Status::Corruption(
+ "Gap in sequence number. Could not "
+ "seek to required sequence number");
+ reporter_.Info(current_status_.ToString().c_str());
+ return;
+ } else if (strict) {
+ reporter_.Info("Could seek required sequence number. Iterator will "
+ "continue.");
+ }
+ is_valid_ = true;
+ started_ = true; // set started_ as we could seek till starting sequence
+ return;
+ } else {
+ is_valid_ = false;
+ }
+ }
+
+  // Could not find the start sequence in the first file. Normally this must
+  // be the only file; otherwise log the error and let the iterator return the
+  // next entry. If strict is set, we want to seek exactly to the start
+  // sequence, and it should have been present in the file we scanned above.
+ if (strict) {
+ current_status_ = Status::Corruption(
+ "Gap in sequence number. Could not "
+ "seek to required sequence number");
+ reporter_.Info(current_status_.ToString().c_str());
+ } else if (files_->size() != 1) {
+ current_status_ = Status::Corruption(
+ "Start sequence was not found, "
+ "skipping to the next available");
+ reporter_.Info(current_status_.ToString().c_str());
+ // Let NextImpl find the next available entry. started_ remains false
+ // because we don't want to check for gaps while moving to start sequence
+ NextImpl(true);
+ }
+}
+
+void TransactionLogIteratorImpl::Next() {
+ return NextImpl(false);
+}
+
+void TransactionLogIteratorImpl::NextImpl(bool internal) {
+ Slice record;
+ is_valid_ = false;
+ if (!internal && !started_) {
+ // Runs every time until we can seek to the start sequence
+ return SeekToStartSequence();
+ }
+  while (true) {
+ assert(current_log_reader_);
+ if (current_log_reader_->IsEOF()) {
+ current_log_reader_->UnmarkEOF();
+ }
+ while (RestrictedRead(&record)) {
+ if (record.size() < WriteBatchInternal::kHeader) {
+ reporter_.Corruption(
+ record.size(), Status::Corruption("very small log record"));
+ continue;
+ } else {
+ // started_ should be true if called by application
+ assert(internal || started_);
+ // started_ should be false if called internally
+ assert(!internal || !started_);
+ UpdateCurrentWriteBatch(record);
+ if (internal && !started_) {
+ started_ = true;
+ }
+ return;
+ }
+ }
+
+ // Open the next file
+ if (current_file_index_ < files_->size() - 1) {
+ ++current_file_index_;
+ Status s = OpenLogReader(files_->at(current_file_index_).get());
+ if (!s.ok()) {
+ is_valid_ = false;
+ current_status_ = s;
+ return;
+ }
+ } else {
+ is_valid_ = false;
+ if (current_last_seq_ == versions_->LastSequence()) {
+ current_status_ = Status::OK();
+ } else {
+ const char* msg = "Create a new iterator to fetch the new tail.";
+ current_status_ = Status::TryAgain(msg);
+ }
+ return;
+ }
+ }
+}
+
+bool TransactionLogIteratorImpl::IsBatchExpected(
+ const WriteBatch* batch, const SequenceNumber expected_seq) {
+ assert(batch);
+ SequenceNumber batchSeq = WriteBatchInternal::Sequence(batch);
+ if (batchSeq != expected_seq) {
+ char buf[200];
+ snprintf(buf, sizeof(buf),
+ "Discontinuity in log records. Got seq=%" PRIu64
+ ", Expected seq=%" PRIu64 ", Last flushed seq=%" PRIu64
+ ".Log iterator will reseek the correct batch.",
+ batchSeq, expected_seq, versions_->LastSequence());
+ reporter_.Info(buf);
+ return false;
+ }
+ return true;
+}
+
+void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) {
+ std::unique_ptr<WriteBatch> batch(new WriteBatch());
+ WriteBatchInternal::SetContents(batch.get(), record);
+
+ SequenceNumber expected_seq = current_last_seq_ + 1;
+ // If the iterator has started, then confirm that we get continuous batches
+ if (started_ && !IsBatchExpected(batch.get(), expected_seq)) {
+ // Seek to the batch having expected sequence number
+ if (expected_seq < files_->at(current_file_index_)->StartSequence()) {
+ // Expected batch must lie in the previous log file
+ // Avoid underflow.
+ if (current_file_index_ != 0) {
+ current_file_index_--;
+ }
+ }
+ starting_sequence_number_ = expected_seq;
+    // current_status_ will be set to OK if the reseek succeeds.
+    // Note: this is still ok in seq_per_batch_ && two_write_queues_ mode
+ // that allows gaps in the WAL since it will still skip over the gap.
+ current_status_ = Status::NotFound("Gap in sequence numbers");
+ // In seq_per_batch_ mode, gaps in the seq are possible so the strict mode
+ // should be disabled
+ return SeekToStartSequence(current_file_index_, !seq_per_batch_);
+ }
+
+ struct BatchCounter : public WriteBatch::Handler {
+ SequenceNumber sequence_;
+ BatchCounter(SequenceNumber sequence) : sequence_(sequence) {}
+ Status MarkNoop(bool empty_batch) override {
+ if (!empty_batch) {
+ sequence_++;
+ }
+ return Status::OK();
+ }
+ Status MarkEndPrepare(const Slice&) override {
+ sequence_++;
+ return Status::OK();
+ }
+ Status MarkCommit(const Slice&) override {
+ sequence_++;
+ return Status::OK();
+ }
+
+ Status PutCF(uint32_t /*cf*/, const Slice& /*key*/,
+ const Slice& /*val*/) override {
+ return Status::OK();
+ }
+ Status DeleteCF(uint32_t /*cf*/, const Slice& /*key*/) override {
+ return Status::OK();
+ }
+ Status SingleDeleteCF(uint32_t /*cf*/, const Slice& /*key*/) override {
+ return Status::OK();
+ }
+ Status MergeCF(uint32_t /*cf*/, const Slice& /*key*/,
+ const Slice& /*val*/) override {
+ return Status::OK();
+ }
+ Status MarkBeginPrepare(bool) override { return Status::OK(); }
+ Status MarkRollback(const Slice&) override { return Status::OK(); }
+ };
+
+ current_batch_seq_ = WriteBatchInternal::Sequence(batch.get());
+ if (seq_per_batch_) {
+ BatchCounter counter(current_batch_seq_);
+ batch->Iterate(&counter);
+ current_last_seq_ = counter.sequence_;
+ } else {
+ current_last_seq_ =
+ current_batch_seq_ + WriteBatchInternal::Count(batch.get()) - 1;
+ }
+  // current_batch_seq_ can only change here
+ assert(current_last_seq_ <= versions_->LastSequence());
+
+ current_batch_ = std::move(batch);
+ is_valid_ = true;
+ current_status_ = Status::OK();
+}
+
+Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* log_file) {
+ std::unique_ptr<SequentialFileReader> file;
+ Status s = OpenLogFile(log_file, &file);
+ if (!s.ok()) {
+ return s;
+ }
+ assert(file);
+ current_log_reader_.reset(
+ new log::Reader(options_->info_log, std::move(file), &reporter_,
+ read_options_.verify_checksums_, log_file->LogNumber()));
+ return Status::OK();
+}
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
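TransactionLogIteratorImpl is the object returned by DB::GetUpdatesSince(). A tailing sketch against the public API; WAL retention (for example via DBOptions::WAL_ttl_seconds) is assumed so older batches remain readable, and error handling is abbreviated:

#include <memory>
#include "rocksdb/db.h"
#include "rocksdb/transaction_log.h"

// Replay every write batch at or after `since`, assuming `db` is open.
void TailWal(rocksdb::DB* db, rocksdb::SequenceNumber since) {
  std::unique_ptr<rocksdb::TransactionLogIterator> iter;
  rocksdb::Status s = db->GetUpdatesSince(since, &iter);
  if (!s.ok()) {
    return;
  }
  for (; iter->Valid(); iter->Next()) {
    rocksdb::BatchResult batch = iter->GetBatch();
    // batch.sequence is the first sequence number in the batch;
    // batch.writeBatchPtr owns the WriteBatch read from the WAL.
    (void)batch;
  }
  // status() distinguishes a clean end-of-log from corruption or TryAgain.
  s = iter->status();
  (void)s;
}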
diff --git a/src/rocksdb/db/transaction_log_impl.h b/src/rocksdb/db/transaction_log_impl.h
new file mode 100644
index 000000000..eb53daf2b
--- /dev/null
+++ b/src/rocksdb/db/transaction_log_impl.h
@@ -0,0 +1,127 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#ifndef ROCKSDB_LITE
+#include <vector>
+
+#include "db/log_reader.h"
+#include "db/version_set.h"
+#include "file/filename.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class LogFileImpl : public LogFile {
+ public:
+ LogFileImpl(uint64_t logNum, WalFileType logType, SequenceNumber startSeq,
+ uint64_t sizeBytes) :
+ logNumber_(logNum),
+ type_(logType),
+ startSequence_(startSeq),
+ sizeFileBytes_(sizeBytes) {
+ }
+
+ std::string PathName() const override {
+ if (type_ == kArchivedLogFile) {
+ return ArchivedLogFileName("", logNumber_);
+ }
+ return LogFileName("", logNumber_);
+ }
+
+ uint64_t LogNumber() const override { return logNumber_; }
+
+ WalFileType Type() const override { return type_; }
+
+ SequenceNumber StartSequence() const override { return startSequence_; }
+
+ uint64_t SizeFileBytes() const override { return sizeFileBytes_; }
+
+ bool operator < (const LogFile& that) const {
+ return LogNumber() < that.LogNumber();
+ }
+
+ private:
+ uint64_t logNumber_;
+ WalFileType type_;
+ SequenceNumber startSequence_;
+ uint64_t sizeFileBytes_;
+
+};
+
+class TransactionLogIteratorImpl : public TransactionLogIterator {
+ public:
+ TransactionLogIteratorImpl(
+ const std::string& dir, const ImmutableDBOptions* options,
+ const TransactionLogIterator::ReadOptions& read_options,
+ const EnvOptions& soptions, const SequenceNumber seqNum,
+ std::unique_ptr<VectorLogPtr> files, VersionSet const* const versions,
+ const bool seq_per_batch);
+
+ virtual bool Valid() override;
+
+ virtual void Next() override;
+
+ virtual Status status() override;
+
+ virtual BatchResult GetBatch() override;
+
+ private:
+ const std::string& dir_;
+ const ImmutableDBOptions* options_;
+ const TransactionLogIterator::ReadOptions read_options_;
+ const EnvOptions& soptions_;
+ SequenceNumber starting_sequence_number_;
+ std::unique_ptr<VectorLogPtr> files_;
+ bool started_;
+  bool is_valid_; // not valid when it starts off.
+ Status current_status_;
+ size_t current_file_index_;
+ std::unique_ptr<WriteBatch> current_batch_;
+ std::unique_ptr<log::Reader> current_log_reader_;
+ std::string scratch_;
+ Status OpenLogFile(const LogFile* log_file,
+ std::unique_ptr<SequentialFileReader>* file);
+
+ struct LogReporter : public log::Reader::Reporter {
+ Env* env;
+ Logger* info_log;
+ virtual void Corruption(size_t bytes, const Status& s) override {
+ ROCKS_LOG_ERROR(info_log, "dropping %" ROCKSDB_PRIszt " bytes; %s", bytes,
+ s.ToString().c_str());
+ }
+ virtual void Info(const char* s) { ROCKS_LOG_INFO(info_log, "%s", s); }
+ } reporter_;
+
+ SequenceNumber
+ current_batch_seq_; // sequence number at start of current batch
+ SequenceNumber current_last_seq_; // last sequence in the current batch
+ // Used only to get latest seq. num
+ // TODO(icanadi) can this be just a callback?
+ VersionSet const* const versions_;
+ const bool seq_per_batch_;
+ // Reads from transaction log only if the writebatch record has been written
+ bool RestrictedRead(Slice* record);
+  // Seeks to starting_sequence_number_ reading from start_file_index in
+  // files_. If strict is set, then it must get a batch starting with
+  // starting_sequence_number_.
+ void SeekToStartSequence(uint64_t start_file_index = 0, bool strict = false);
+  // Implementation of Next. SeekToStartSequence calls it internally with
+  // internal=true to let it find the next entry even if it has to jump gaps,
+  // because the iterator may start off from the first available entry but
+  // promises to be continuous after that.
+ void NextImpl(bool internal = false);
+  // Check if the batch has the expected sequence number; return false if not.
+  bool IsBatchExpected(const WriteBatch* batch, SequenceNumber expected_seq);
+  // Update the current batch if a continuous batch is found.
+  void UpdateCurrentWriteBatch(const Slice& record);
+ Status OpenLogReader(const LogFile* file);
+};
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
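For context: this iterator is the concrete type that DB::GetUpdatesSince() hands back to callers. A minimal consumer sketch, assuming an already-open rocksdb::DB* and a non-ROCKSDB_LITE build (the helper name TailWal is made up for illustration):

    #include <memory>

    #include "rocksdb/db.h"
    #include "rocksdb/transaction_log.h"

    // Tails the WAL starting at from_seq, using the iterator declared above.
    void TailWal(rocksdb::DB* db, rocksdb::SequenceNumber from_seq) {
      std::unique_ptr<rocksdb::TransactionLogIterator> iter;
      rocksdb::Status s = db->GetUpdatesSince(from_seq, &iter);
      if (!s.ok()) {
        return;  // e.g. the requested sequence number is no longer in the WAL
      }
      for (; iter->Valid(); iter->Next()) {
        rocksdb::BatchResult batch = iter->GetBatch();
        // batch.sequence is the sequence number at the start of the batch;
        // batch.writeBatchPtr holds the recovered WriteBatch.
        (void)batch;
      }
    }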
diff --git a/src/rocksdb/db/trim_history_scheduler.cc b/src/rocksdb/db/trim_history_scheduler.cc
new file mode 100644
index 000000000..d7ca0899f
--- /dev/null
+++ b/src/rocksdb/db/trim_history_scheduler.cc
@@ -0,0 +1,54 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/trim_history_scheduler.h"
+
+#include <cassert>
+
+#include "db/column_family.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void TrimHistoryScheduler::ScheduleWork(ColumnFamilyData* cfd) {
+ std::lock_guard<std::mutex> lock(checking_mutex_);
+ cfd->Ref();
+ cfds_.push_back(cfd);
+ is_empty_.store(false, std::memory_order_relaxed);
+}
+
+ColumnFamilyData* TrimHistoryScheduler::TakeNextColumnFamily() {
+ std::lock_guard<std::mutex> lock(checking_mutex_);
+ while (true) {
+ if (cfds_.empty()) {
+ return nullptr;
+ }
+ ColumnFamilyData* cfd = cfds_.back();
+ cfds_.pop_back();
+ if (cfds_.empty()) {
+ is_empty_.store(true, std::memory_order_relaxed);
+ }
+
+ if (!cfd->IsDropped()) {
+ // success
+ return cfd;
+ }
+ cfd->UnrefAndTryDelete();
+ }
+}
+
+bool TrimHistoryScheduler::Empty() {
+ bool is_empty = is_empty_.load(std::memory_order_relaxed);
+ return is_empty;
+}
+
+void TrimHistoryScheduler::Clear() {
+ ColumnFamilyData* cfd;
+ while ((cfd = TakeNextColumnFamily()) != nullptr) {
+ cfd->UnrefAndTryDelete();
+ }
+ assert(Empty());
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/trim_history_scheduler.h b/src/rocksdb/db/trim_history_scheduler.h
new file mode 100644
index 000000000..b17f6170f
--- /dev/null
+++ b/src/rocksdb/db/trim_history_scheduler.h
@@ -0,0 +1,44 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+#include <atomic>
+#include <mutex>
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyData;
+
+// Similar to FlushScheduler, TrimHistoryScheduler is a FIFO queue that keeps
+// track of column families whose flushed immutable memtables may need to be
+// removed (aka trimmed). The actual trimming may be slightly delayed. Due to
+// the use of the mutex and atomic variable, ScheduleWork,
+// TakeNextColumnFamily, and Empty can be called concurrently.
+class TrimHistoryScheduler {
+ public:
+ TrimHistoryScheduler() : is_empty_(true) {}
+
+ // When a column family needs history trimming, add cfd to the FIFO queue
+ void ScheduleWork(ColumnFamilyData* cfd);
+
+  // Remove the column family from the queue; the caller is responsible for
+  // calling `MemtableList::TrimHistory`.
+ ColumnFamilyData* TakeNextColumnFamily();
+
+ bool Empty();
+
+ void Clear();
+
+  // Not on the critical path; a mutex is used to ensure thread safety.
+ private:
+ std::atomic<bool> is_empty_;
+ autovector<ColumnFamilyData*> cfds_;
+ std::mutex checking_mutex_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
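The drain side implied by the comments above (and by Clear() in the .cc file) looks roughly like the sketch below. TrimFlushedMemtables() is a hypothetical stand-in for the real trimming step, which in the actual code path goes through the column family's MemtableList::TrimHistory:

    // Rough sketch of a TrimHistoryScheduler consumer. ScheduleWork() takes a
    // reference on each ColumnFamilyData, so the drainer must release it with
    // UnrefAndTryDelete() once the trimming work is done.
    void TrimFlushedMemtables(ROCKSDB_NAMESPACE::ColumnFamilyData* cfd);  // hypothetical helper

    void DrainTrimQueue(ROCKSDB_NAMESPACE::TrimHistoryScheduler* scheduler) {
      while (!scheduler->Empty()) {
        ROCKSDB_NAMESPACE::ColumnFamilyData* cfd =
            scheduler->TakeNextColumnFamily();
        if (cfd == nullptr) {
          break;  // another thread drained the queue first
        }
        TrimFlushedMemtables(cfd);  // hypothetical; see note above
        cfd->UnrefAndTryDelete();   // balances the Ref() taken in ScheduleWork()
      }
    }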
diff --git a/src/rocksdb/db/version_builder.cc b/src/rocksdb/db/version_builder.cc
new file mode 100644
index 000000000..4694218a1
--- /dev/null
+++ b/src/rocksdb/db/version_builder.cc
@@ -0,0 +1,545 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_builder.h"
+
+#include <algorithm>
+#include <atomic>
+#include <cinttypes>
+#include <functional>
+#include <map>
+#include <set>
+#include <thread>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/internal_stats.h"
+#include "db/table_cache.h"
+#include "db/version_set.h"
+#include "port/port.h"
+#include "table/table_reader.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) {
+ if (a->fd.largest_seqno != b->fd.largest_seqno) {
+ return a->fd.largest_seqno > b->fd.largest_seqno;
+ }
+ if (a->fd.smallest_seqno != b->fd.smallest_seqno) {
+ return a->fd.smallest_seqno > b->fd.smallest_seqno;
+ }
+ // Break ties by file number
+ return a->fd.GetNumber() > b->fd.GetNumber();
+}
+
+namespace {
+bool BySmallestKey(FileMetaData* a, FileMetaData* b,
+ const InternalKeyComparator* cmp) {
+ int r = cmp->Compare(a->smallest, b->smallest);
+ if (r != 0) {
+ return (r < 0);
+ }
+ // Break ties by file number
+ return (a->fd.GetNumber() < b->fd.GetNumber());
+}
+} // namespace
+
+class VersionBuilder::Rep {
+ private:
+ // Helper to sort files_ in v
+ // kLevel0 -- NewestFirstBySeqNo
+ // kLevelNon0 -- BySmallestKey
+ struct FileComparator {
+ enum SortMethod { kLevel0 = 0, kLevelNon0 = 1, } sort_method;
+ const InternalKeyComparator* internal_comparator;
+
+ FileComparator() : internal_comparator(nullptr) {}
+
+ bool operator()(FileMetaData* f1, FileMetaData* f2) const {
+ switch (sort_method) {
+ case kLevel0:
+ return NewestFirstBySeqNo(f1, f2);
+ case kLevelNon0:
+ return BySmallestKey(f1, f2, internal_comparator);
+ }
+ assert(false);
+ return false;
+ }
+ };
+
+ struct LevelState {
+ std::unordered_set<uint64_t> deleted_files;
+ // Map from file number to file meta data.
+ std::unordered_map<uint64_t, FileMetaData*> added_files;
+ };
+
+ const FileOptions& file_options_;
+ Logger* info_log_;
+ TableCache* table_cache_;
+ VersionStorageInfo* base_vstorage_;
+ int num_levels_;
+ LevelState* levels_;
+ // Store states of levels larger than num_levels_. We do this instead of
+ // storing them in levels_ to avoid regression in case there are no files
+ // on invalid levels. The version is not consistent if in the end the files
+ // on invalid levels don't cancel out.
+ std::map<int, std::unordered_set<uint64_t>> invalid_levels_;
+ // Whether there are invalid new files or invalid deletion on levels larger
+ // than num_levels_.
+ bool has_invalid_levels_;
+ FileComparator level_zero_cmp_;
+ FileComparator level_nonzero_cmp_;
+
+ public:
+ Rep(const FileOptions& file_options, Logger* info_log,
+ TableCache* table_cache,
+ VersionStorageInfo* base_vstorage)
+ : file_options_(file_options),
+ info_log_(info_log),
+ table_cache_(table_cache),
+ base_vstorage_(base_vstorage),
+ num_levels_(base_vstorage->num_levels()),
+ has_invalid_levels_(false) {
+ levels_ = new LevelState[num_levels_];
+ level_zero_cmp_.sort_method = FileComparator::kLevel0;
+ level_nonzero_cmp_.sort_method = FileComparator::kLevelNon0;
+ level_nonzero_cmp_.internal_comparator =
+ base_vstorage_->InternalComparator();
+ }
+
+ ~Rep() {
+ for (int level = 0; level < num_levels_; level++) {
+ const auto& added = levels_[level].added_files;
+ for (auto& pair : added) {
+ UnrefFile(pair.second);
+ }
+ }
+
+ delete[] levels_;
+ }
+
+ void UnrefFile(FileMetaData* f) {
+ f->refs--;
+ if (f->refs <= 0) {
+ if (f->table_reader_handle) {
+ assert(table_cache_ != nullptr);
+ table_cache_->ReleaseHandle(f->table_reader_handle);
+ f->table_reader_handle = nullptr;
+ }
+ delete f;
+ }
+ }
+
+ Status CheckConsistency(VersionStorageInfo* vstorage) {
+#ifdef NDEBUG
+ if (!vstorage->force_consistency_checks()) {
+      // Don't run consistency checks in release mode except if
+      // explicitly asked to
+ return Status::OK();
+ }
+#endif
+ // make sure the files are sorted correctly
+ for (int level = 0; level < num_levels_; level++) {
+ auto& level_files = vstorage->LevelFiles(level);
+ for (size_t i = 1; i < level_files.size(); i++) {
+ auto f1 = level_files[i - 1];
+ auto f2 = level_files[i];
+#ifndef NDEBUG
+ auto pair = std::make_pair(&f1, &f2);
+ TEST_SYNC_POINT_CALLBACK("VersionBuilder::CheckConsistency", &pair);
+#endif
+ if (level == 0) {
+ if (!level_zero_cmp_(f1, f2)) {
+ fprintf(stderr, "L0 files are not sorted properly");
+ return Status::Corruption("L0 files are not sorted properly");
+ }
+
+ if (f2->fd.smallest_seqno == f2->fd.largest_seqno) {
+ // This is an external file that we ingested
+ SequenceNumber external_file_seqno = f2->fd.smallest_seqno;
+ if (!(external_file_seqno < f1->fd.largest_seqno ||
+ external_file_seqno == 0)) {
+ fprintf(stderr,
+ "L0 file with seqno %" PRIu64 " %" PRIu64
+ " vs. file with global_seqno %" PRIu64 "\n",
+ f1->fd.smallest_seqno, f1->fd.largest_seqno,
+ external_file_seqno);
+ return Status::Corruption(
+ "L0 file with seqno " +
+ NumberToString(f1->fd.smallest_seqno) + " " +
+ NumberToString(f1->fd.largest_seqno) +
+ " vs. file with global_seqno" +
+ NumberToString(external_file_seqno) + " with fileNumber " +
+ NumberToString(f1->fd.GetNumber()));
+ }
+ } else if (f1->fd.smallest_seqno <= f2->fd.smallest_seqno) {
+ fprintf(stderr,
+ "L0 files seqno %" PRIu64 " %" PRIu64 " vs. %" PRIu64
+ " %" PRIu64 "\n",
+ f1->fd.smallest_seqno, f1->fd.largest_seqno,
+ f2->fd.smallest_seqno, f2->fd.largest_seqno);
+ return Status::Corruption(
+ "L0 files seqno " + NumberToString(f1->fd.smallest_seqno) +
+ " " + NumberToString(f1->fd.largest_seqno) + " " +
+ NumberToString(f1->fd.GetNumber()) + " vs. " +
+ NumberToString(f2->fd.smallest_seqno) + " " +
+ NumberToString(f2->fd.largest_seqno) + " " +
+ NumberToString(f2->fd.GetNumber()));
+ }
+ } else {
+ if (!level_nonzero_cmp_(f1, f2)) {
+ fprintf(stderr, "L%d files are not sorted properly", level);
+ return Status::Corruption("L" + NumberToString(level) +
+ " files are not sorted properly");
+ }
+
+ // Make sure there is no overlap in levels > 0
+ if (vstorage->InternalComparator()->Compare(f1->largest,
+ f2->smallest) >= 0) {
+ fprintf(stderr, "L%d have overlapping ranges %s vs. %s\n", level,
+ (f1->largest).DebugString(true).c_str(),
+ (f2->smallest).DebugString(true).c_str());
+ return Status::Corruption(
+ "L" + NumberToString(level) + " have overlapping ranges " +
+ (f1->largest).DebugString(true) + " vs. " +
+ (f2->smallest).DebugString(true));
+ }
+ }
+ }
+ }
+ return Status::OK();
+ }
+
+ Status CheckConsistencyForDeletes(VersionEdit* /*edit*/, uint64_t number,
+ int level) {
+#ifdef NDEBUG
+ if (!base_vstorage_->force_consistency_checks()) {
+    // Don't run consistency checks in release mode except if
+    // explicitly asked to
+ return Status::OK();
+ }
+#endif
+    // A file to be deleted must exist in the previous version.
+ bool found = false;
+ for (int l = 0; !found && l < num_levels_; l++) {
+ const std::vector<FileMetaData*>& base_files =
+ base_vstorage_->LevelFiles(l);
+ for (size_t i = 0; i < base_files.size(); i++) {
+ FileMetaData* f = base_files[i];
+ if (f->fd.GetNumber() == number) {
+ found = true;
+ break;
+ }
+ }
+ }
+    // If the file did not exist in the previous version, then it
+    // may have been moved from a lower level to a higher level in the
+    // current version.
+ for (int l = level + 1; !found && l < num_levels_; l++) {
+ auto& level_added = levels_[l].added_files;
+ auto got = level_added.find(number);
+ if (got != level_added.end()) {
+ found = true;
+ break;
+ }
+ }
+
+    // Maybe this file was added in a previous edit that was already applied.
+ if (!found) {
+ auto& level_added = levels_[level].added_files;
+ auto got = level_added.find(number);
+ if (got != level_added.end()) {
+ found = true;
+ }
+ }
+ if (!found) {
+ fprintf(stderr, "not found %" PRIu64 "\n", number);
+ return Status::Corruption("not found " + NumberToString(number));
+ }
+ return Status::OK();
+ }
+
+ bool CheckConsistencyForNumLevels() {
+ // Make sure there are no files on or beyond num_levels().
+ if (has_invalid_levels_) {
+ return false;
+ }
+ for (auto& level : invalid_levels_) {
+ if (level.second.size() > 0) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ // Apply all of the edits in *edit to the current state.
+ Status Apply(VersionEdit* edit) {
+ Status s = CheckConsistency(base_vstorage_);
+ if (!s.ok()) {
+ return s;
+ }
+
+ // Delete files
+ const auto& del = edit->GetDeletedFiles();
+ for (const auto& del_file : del) {
+ const auto level = del_file.first;
+ const auto number = del_file.second;
+ if (level < num_levels_) {
+ levels_[level].deleted_files.insert(number);
+ CheckConsistencyForDeletes(edit, number, level);
+
+        auto existing = levels_[level].added_files.find(number);
+        if (existing != levels_[level].added_files.end()) {
+          UnrefFile(existing->second);
+          levels_[level].added_files.erase(existing);
+ }
+ } else {
+ if (invalid_levels_[level].erase(number) == 0) {
+          // Deleting a non-existent file on an invalid level.
+ has_invalid_levels_ = true;
+ }
+ }
+ }
+
+ // Add new files
+ for (const auto& new_file : edit->GetNewFiles()) {
+ const int level = new_file.first;
+ if (level < num_levels_) {
+ FileMetaData* f = new FileMetaData(new_file.second);
+ f->refs = 1;
+
+ assert(levels_[level].added_files.find(f->fd.GetNumber()) ==
+ levels_[level].added_files.end());
+ levels_[level].deleted_files.erase(f->fd.GetNumber());
+ levels_[level].added_files[f->fd.GetNumber()] = f;
+ } else {
+ uint64_t number = new_file.second.fd.GetNumber();
+ auto& lvls = invalid_levels_[level];
+ if (lvls.count(number) == 0) {
+ lvls.insert(number);
+ } else {
+          // Adding an already existing file on an invalid level.
+ has_invalid_levels_ = true;
+ }
+ }
+ }
+ return s;
+ }
+
+  // Save the current state in *vstorage.
+ Status SaveTo(VersionStorageInfo* vstorage) {
+ Status s = CheckConsistency(base_vstorage_);
+ if (!s.ok()) {
+ return s;
+ }
+
+ s = CheckConsistency(vstorage);
+ if (!s.ok()) {
+ return s;
+ }
+
+ for (int level = 0; level < num_levels_; level++) {
+ const auto& cmp = (level == 0) ? level_zero_cmp_ : level_nonzero_cmp_;
+ // Merge the set of added files with the set of pre-existing files.
+      // Drop any deleted files. Store the result in *vstorage.
+ const auto& base_files = base_vstorage_->LevelFiles(level);
+ const auto& unordered_added_files = levels_[level].added_files;
+ vstorage->Reserve(level,
+ base_files.size() + unordered_added_files.size());
+
+ // Sort added files for the level.
+ std::vector<FileMetaData*> added_files;
+ added_files.reserve(unordered_added_files.size());
+ for (const auto& pair : unordered_added_files) {
+ added_files.push_back(pair.second);
+ }
+ std::sort(added_files.begin(), added_files.end(), cmp);
+
+#ifndef NDEBUG
+ FileMetaData* prev_added_file = nullptr;
+ for (const auto& added : added_files) {
+ if (level > 0 && prev_added_file != nullptr) {
+ assert(base_vstorage_->InternalComparator()->Compare(
+ prev_added_file->smallest, added->smallest) <= 0);
+ }
+ prev_added_file = added;
+ }
+#endif
+
+ auto base_iter = base_files.begin();
+ auto base_end = base_files.end();
+ auto added_iter = added_files.begin();
+ auto added_end = added_files.end();
+ while (added_iter != added_end || base_iter != base_end) {
+ if (base_iter == base_end ||
+ (added_iter != added_end && cmp(*added_iter, *base_iter))) {
+ MaybeAddFile(vstorage, level, *added_iter++);
+ } else {
+ MaybeAddFile(vstorage, level, *base_iter++);
+ }
+ }
+ }
+
+ s = CheckConsistency(vstorage);
+ return s;
+ }
+
+ Status LoadTableHandlers(InternalStats* internal_stats, int max_threads,
+ bool prefetch_index_and_filter_in_cache,
+ bool is_initial_load,
+ const SliceTransform* prefix_extractor) {
+ assert(table_cache_ != nullptr);
+
+ size_t table_cache_capacity = table_cache_->get_cache()->GetCapacity();
+ bool always_load = (table_cache_capacity == TableCache::kInfiniteCapacity);
+ size_t max_load = port::kMaxSizet;
+
+ if (!always_load) {
+      // If this is the initial load and we are not set to always load all the
+      // files, we only load up to kInitialLoadLimit files, to limit the time
+      // spent reopening the DB.
+ const size_t kInitialLoadLimit = 16;
+ size_t load_limit;
+ // If the table cache is not 1/4 full, we pin the table handle to
+ // file metadata to avoid the cache read costs when reading the file.
+ // The downside of pinning those files is that LRU won't be followed
+      // for those files. This doesn't matter much because if the number of
+      // files in the DB exceeds the table cache capacity, eventually no table
+      // reader will be pinned and LRU will be followed.
+ if (is_initial_load) {
+ load_limit = std::min(kInitialLoadLimit, table_cache_capacity / 4);
+ } else {
+ load_limit = table_cache_capacity / 4;
+ }
+
+ size_t table_cache_usage = table_cache_->get_cache()->GetUsage();
+ if (table_cache_usage >= load_limit) {
+ // TODO (yanqin) find a suitable status code.
+ return Status::OK();
+ } else {
+ max_load = load_limit - table_cache_usage;
+ }
+ }
+
+ // <file metadata, level>
+ std::vector<std::pair<FileMetaData*, int>> files_meta;
+ std::vector<Status> statuses;
+ for (int level = 0; level < num_levels_; level++) {
+ for (auto& file_meta_pair : levels_[level].added_files) {
+ auto* file_meta = file_meta_pair.second;
+ // If the file has been opened before, just skip it.
+ if (!file_meta->table_reader_handle) {
+ files_meta.emplace_back(file_meta, level);
+ statuses.emplace_back(Status::OK());
+ }
+ if (files_meta.size() >= max_load) {
+ break;
+ }
+ }
+ if (files_meta.size() >= max_load) {
+ break;
+ }
+ }
+
+ std::atomic<size_t> next_file_meta_idx(0);
+ std::function<void()> load_handlers_func([&]() {
+ while (true) {
+ size_t file_idx = next_file_meta_idx.fetch_add(1);
+ if (file_idx >= files_meta.size()) {
+ break;
+ }
+
+ auto* file_meta = files_meta[file_idx].first;
+ int level = files_meta[file_idx].second;
+ statuses[file_idx] = table_cache_->FindTable(
+ file_options_, *(base_vstorage_->InternalComparator()),
+ file_meta->fd, &file_meta->table_reader_handle, prefix_extractor,
+ false /*no_io */, true /* record_read_stats */,
+ internal_stats->GetFileReadHist(level), false, level,
+ prefetch_index_and_filter_in_cache);
+ if (file_meta->table_reader_handle != nullptr) {
+ // Load table_reader
+ file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle(
+ file_meta->table_reader_handle);
+ }
+ }
+ });
+
+ std::vector<port::Thread> threads;
+ for (int i = 1; i < max_threads; i++) {
+ threads.emplace_back(load_handlers_func);
+ }
+ load_handlers_func();
+ for (auto& t : threads) {
+ t.join();
+ }
+ for (const auto& s : statuses) {
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ return Status::OK();
+ }
+
+ void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f) {
+ if (levels_[level].deleted_files.count(f->fd.GetNumber()) > 0) {
+      // f is a to-be-deleted table file
+ vstorage->RemoveCurrentStats(f);
+ } else {
+ vstorage->AddFile(level, f, info_log_);
+ }
+ }
+};
+
+VersionBuilder::VersionBuilder(const FileOptions& file_options,
+ TableCache* table_cache,
+ VersionStorageInfo* base_vstorage,
+ Logger* info_log)
+ : rep_(new Rep(file_options, info_log, table_cache, base_vstorage)) {}
+
+VersionBuilder::~VersionBuilder() { delete rep_; }
+
+Status VersionBuilder::CheckConsistency(VersionStorageInfo* vstorage) {
+ return rep_->CheckConsistency(vstorage);
+}
+
+Status VersionBuilder::CheckConsistencyForDeletes(VersionEdit* edit,
+ uint64_t number, int level) {
+ return rep_->CheckConsistencyForDeletes(edit, number, level);
+}
+
+bool VersionBuilder::CheckConsistencyForNumLevels() {
+ return rep_->CheckConsistencyForNumLevels();
+}
+
+Status VersionBuilder::Apply(VersionEdit* edit) { return rep_->Apply(edit); }
+
+Status VersionBuilder::SaveTo(VersionStorageInfo* vstorage) {
+ return rep_->SaveTo(vstorage);
+}
+
+Status VersionBuilder::LoadTableHandlers(
+ InternalStats* internal_stats, int max_threads,
+ bool prefetch_index_and_filter_in_cache, bool is_initial_load,
+ const SliceTransform* prefix_extractor) {
+ return rep_->LoadTableHandlers(internal_stats, max_threads,
+ prefetch_index_and_filter_in_cache,
+ is_initial_load, prefix_extractor);
+}
+
+void VersionBuilder::MaybeAddFile(VersionStorageInfo* vstorage, int level,
+ FileMetaData* f) {
+ rep_->MaybeAddFile(vstorage, level, f);
+}
+
+} // namespace ROCKSDB_NAMESPACE
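The heart of SaveTo() above is the two-pointer merge of the sorted base files with the sorted added files, with deleted files filtered out by MaybeAddFile(). A simplified, self-contained model of that loop, using plain ints in place of FileMetaData* purely for illustration:

    #include <set>
    #include <vector>

    // base and added are each sorted under the same ordering; anything listed
    // in deleted is dropped, mirroring MaybeAddFile().
    std::vector<int> MergeLevel(const std::vector<int>& base,
                                const std::vector<int>& added,
                                const std::set<int>& deleted) {
      std::vector<int> out;
      auto base_iter = base.begin();
      auto added_iter = added.begin();
      while (added_iter != added.end() || base_iter != base.end()) {
        int next;
        if (base_iter == base.end() ||
            (added_iter != added.end() && *added_iter < *base_iter)) {
          next = *added_iter++;  // take the smaller added file
        } else {
          next = *base_iter++;   // take the smaller pre-existing file
        }
        if (deleted.count(next) == 0) {
          out.push_back(next);
        }
      }
      return out;
    }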
diff --git a/src/rocksdb/db/version_builder.h b/src/rocksdb/db/version_builder.h
new file mode 100644
index 000000000..87415ed55
--- /dev/null
+++ b/src/rocksdb/db/version_builder.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+#pragma once
+#include "rocksdb/file_system.h"
+#include "rocksdb/slice_transform.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TableCache;
+class VersionStorageInfo;
+class VersionEdit;
+struct FileMetaData;
+class InternalStats;
+
+// A helper class so we can efficiently apply a whole sequence
+// of edits to a particular state without creating intermediate
+// Versions that contain full copies of the intermediate state.
+class VersionBuilder {
+ public:
+ VersionBuilder(const FileOptions& file_options, TableCache* table_cache,
+ VersionStorageInfo* base_vstorage, Logger* info_log = nullptr);
+ ~VersionBuilder();
+ Status CheckConsistency(VersionStorageInfo* vstorage);
+ Status CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number,
+ int level);
+ bool CheckConsistencyForNumLevels();
+ Status Apply(VersionEdit* edit);
+ Status SaveTo(VersionStorageInfo* vstorage);
+ Status LoadTableHandlers(InternalStats* internal_stats, int max_threads,
+ bool prefetch_index_and_filter_in_cache,
+ bool is_initial_load,
+ const SliceTransform* prefix_extractor);
+ void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f);
+
+ private:
+ class Rep;
+ Rep* rep_;
+};
+
+extern bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b);
+} // namespace ROCKSDB_NAMESPACE
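The class is typically driven as in the tests that follow: build against the current VersionStorageInfo, Apply() one or more edits, then SaveTo() a fresh VersionStorageInfo. A condensed sketch, assuming the ROCKSDB_NAMESPACE types are in scope and with error handling trimmed (the table cache is left null, as in version_builder_test.cc below):

    Status BuildNewVersion(const FileOptions& file_options,
                           VersionStorageInfo* base_vstorage, VersionEdit* edit,
                           VersionStorageInfo* new_vstorage) {
      VersionBuilder builder(file_options, /*table_cache=*/nullptr,
                             base_vstorage);
      Status s = builder.Apply(edit);      // accumulate added/deleted files
      if (s.ok()) {
        s = builder.SaveTo(new_vstorage);  // merge base state + edits
      }
      return s;
    }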
diff --git a/src/rocksdb/db/version_builder_test.cc b/src/rocksdb/db/version_builder_test.cc
new file mode 100644
index 000000000..2dda03f31
--- /dev/null
+++ b/src/rocksdb/db/version_builder_test.cc
@@ -0,0 +1,349 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <string>
+#include "db/version_edit.h"
+#include "db/version_set.h"
+#include "logging/logging.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class VersionBuilderTest : public testing::Test {
+ public:
+ const Comparator* ucmp_;
+ InternalKeyComparator icmp_;
+ Options options_;
+ ImmutableCFOptions ioptions_;
+ MutableCFOptions mutable_cf_options_;
+ VersionStorageInfo vstorage_;
+ uint32_t file_num_;
+ CompactionOptionsFIFO fifo_options_;
+ std::vector<uint64_t> size_being_compacted_;
+
+ VersionBuilderTest()
+ : ucmp_(BytewiseComparator()),
+ icmp_(ucmp_),
+ ioptions_(options_),
+ mutable_cf_options_(options_),
+ vstorage_(&icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel,
+ nullptr, false),
+ file_num_(1) {
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+ size_being_compacted_.resize(options_.num_levels);
+ }
+
+ ~VersionBuilderTest() override {
+ for (int i = 0; i < vstorage_.num_levels(); i++) {
+ for (auto* f : vstorage_.LevelFiles(i)) {
+ if (--f->refs == 0) {
+ delete f;
+ }
+ }
+ }
+ }
+
+ InternalKey GetInternalKey(const char* ukey,
+ SequenceNumber smallest_seq = 100) {
+ return InternalKey(ukey, smallest_seq, kTypeValue);
+ }
+
+ void Add(int level, uint32_t file_number, const char* smallest,
+ const char* largest, uint64_t file_size = 0, uint32_t path_id = 0,
+ SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100,
+ uint64_t num_entries = 0, uint64_t num_deletions = 0,
+ bool sampled = false, SequenceNumber smallest_seqno = 0,
+ SequenceNumber largest_seqno = 0) {
+ assert(level < vstorage_.num_levels());
+ FileMetaData* f = new FileMetaData(
+ file_number, path_id, file_size, GetInternalKey(smallest, smallest_seq),
+ GetInternalKey(largest, largest_seq), smallest_seqno, largest_seqno,
+ /* marked_for_compact */ false, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+ f->compensated_file_size = file_size;
+ f->num_entries = num_entries;
+ f->num_deletions = num_deletions;
+ vstorage_.AddFile(level, f);
+ if (sampled) {
+ f->init_stats_from_file = true;
+ vstorage_.UpdateAccumulatedStats(f);
+ }
+ }
+
+ void UpdateVersionStorageInfo() {
+ vstorage_.UpdateFilesByCompactionPri(ioptions_.compaction_pri);
+ vstorage_.UpdateNumNonEmptyLevels();
+ vstorage_.GenerateFileIndexer();
+ vstorage_.GenerateLevelFilesBrief();
+ vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ vstorage_.GenerateLevel0NonOverlapping();
+ vstorage_.SetFinalized();
+ }
+};
+
+void UnrefFilesInVersion(VersionStorageInfo* new_vstorage) {
+ for (int i = 0; i < new_vstorage->num_levels(); i++) {
+ for (auto* f : new_vstorage->LevelFiles(i)) {
+ if (--f->refs == 0) {
+ delete f;
+ }
+ }
+ }
+}
+
+TEST_F(VersionBuilderTest, ApplyAndSaveTo) {
+ Add(0, 1U, "150", "200", 100U);
+
+ Add(1, 66U, "150", "200", 100U);
+ Add(1, 88U, "201", "300", 100U);
+
+ Add(2, 6U, "150", "179", 100U);
+ Add(2, 7U, "180", "220", 100U);
+ Add(2, 8U, "221", "300", 100U);
+
+ Add(3, 26U, "150", "170", 100U);
+ Add(3, 27U, "171", "179", 100U);
+ Add(3, 28U, "191", "220", 100U);
+ Add(3, 29U, "221", "300", 100U);
+ UpdateVersionStorageInfo();
+
+ VersionEdit version_edit;
+ version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
+ GetInternalKey("350"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ version_edit.DeleteFile(3, 27U);
+
+ EnvOptions env_options;
+
+ VersionBuilder version_builder(env_options, nullptr, &vstorage_);
+
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, nullptr, false);
+ version_builder.Apply(&version_edit);
+ version_builder.SaveTo(&new_vstorage);
+
+ ASSERT_EQ(400U, new_vstorage.NumLevelBytes(2));
+ ASSERT_EQ(300U, new_vstorage.NumLevelBytes(3));
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+
+ Add(0, 1U, "150", "200", 100U, 0, 200U, 200U, 0, 0, false, 200U, 200U);
+ Add(0, 88U, "201", "300", 100U, 0, 100U, 100U, 0, 0, false, 100U, 100U);
+
+ Add(4, 6U, "150", "179", 100U);
+ Add(4, 7U, "180", "220", 100U);
+ Add(4, 8U, "221", "300", 100U);
+
+ Add(5, 26U, "150", "170", 100U);
+ Add(5, 27U, "171", "179", 100U);
+ UpdateVersionStorageInfo();
+
+ VersionEdit version_edit;
+ version_edit.AddFile(3, 666, 0, 100U, GetInternalKey("301"),
+ GetInternalKey("350"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ version_edit.DeleteFile(0, 1U);
+ version_edit.DeleteFile(0, 88U);
+
+ EnvOptions env_options;
+
+ VersionBuilder version_builder(env_options, nullptr, &vstorage_);
+
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, nullptr, false);
+ version_builder.Apply(&version_edit);
+ version_builder.SaveTo(&new_vstorage);
+
+ ASSERT_EQ(0U, new_vstorage.NumLevelBytes(0));
+ ASSERT_EQ(100U, new_vstorage.NumLevelBytes(3));
+ ASSERT_EQ(300U, new_vstorage.NumLevelBytes(4));
+ ASSERT_EQ(200U, new_vstorage.NumLevelBytes(5));
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic2) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+
+ Add(0, 1U, "150", "200", 100U, 0, 200U, 200U, 0, 0, false, 200U, 200U);
+ Add(0, 88U, "201", "300", 100U, 0, 100U, 100U, 0, 0, false, 100U, 100U);
+
+ Add(4, 6U, "150", "179", 100U);
+ Add(4, 7U, "180", "220", 100U);
+ Add(4, 8U, "221", "300", 100U);
+
+ Add(5, 26U, "150", "170", 100U);
+ Add(5, 27U, "171", "179", 100U);
+ UpdateVersionStorageInfo();
+
+ VersionEdit version_edit;
+ version_edit.AddFile(4, 666, 0, 100U, GetInternalKey("301"),
+ GetInternalKey("350"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ version_edit.DeleteFile(0, 1U);
+ version_edit.DeleteFile(0, 88U);
+ version_edit.DeleteFile(4, 6U);
+ version_edit.DeleteFile(4, 7U);
+ version_edit.DeleteFile(4, 8U);
+
+ EnvOptions env_options;
+
+ VersionBuilder version_builder(env_options, nullptr, &vstorage_);
+
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, nullptr, false);
+ version_builder.Apply(&version_edit);
+ version_builder.SaveTo(&new_vstorage);
+
+ ASSERT_EQ(0U, new_vstorage.NumLevelBytes(0));
+ ASSERT_EQ(100U, new_vstorage.NumLevelBytes(4));
+ ASSERT_EQ(200U, new_vstorage.NumLevelBytes(5));
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyMultipleAndSaveTo) {
+ UpdateVersionStorageInfo();
+
+ VersionEdit version_edit;
+ version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
+ GetInternalKey("350"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"),
+ GetInternalKey("450"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"),
+ GetInternalKey("650"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"),
+ GetInternalKey("550"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"),
+ GetInternalKey("750"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+
+ EnvOptions env_options;
+
+ VersionBuilder version_builder(env_options, nullptr, &vstorage_);
+
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, nullptr, false);
+ version_builder.Apply(&version_edit);
+ version_builder.SaveTo(&new_vstorage);
+
+ ASSERT_EQ(500U, new_vstorage.NumLevelBytes(2));
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyDeleteAndSaveTo) {
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ VersionBuilder version_builder(env_options, nullptr, &vstorage_);
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, nullptr, false);
+
+ VersionEdit version_edit;
+ version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
+ GetInternalKey("350"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"),
+ GetInternalKey("450"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"),
+ GetInternalKey("650"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"),
+ GetInternalKey("550"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"),
+ GetInternalKey("750"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ version_builder.Apply(&version_edit);
+
+ VersionEdit version_edit2;
+ version_edit.AddFile(2, 808, 0, 100U, GetInternalKey("901"),
+ GetInternalKey("950"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ version_edit2.DeleteFile(2, 616);
+ version_edit2.DeleteFile(2, 636);
+ version_edit.AddFile(2, 806, 0, 100U, GetInternalKey("801"),
+ GetInternalKey("850"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ version_builder.Apply(&version_edit2);
+
+ version_builder.SaveTo(&new_vstorage);
+
+ ASSERT_EQ(300U, new_vstorage.NumLevelBytes(2));
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, EstimatedActiveKeys) {
+ const uint32_t kTotalSamples = 20;
+ const uint32_t kNumLevels = 5;
+ const uint32_t kFilesPerLevel = 8;
+ const uint32_t kNumFiles = kNumLevels * kFilesPerLevel;
+ const uint32_t kEntriesPerFile = 1000;
+ const uint32_t kDeletionsPerFile = 100;
+ for (uint32_t i = 0; i < kNumFiles; ++i) {
+ Add(static_cast<int>(i / kFilesPerLevel), i + 1,
+ ToString((i + 100) * 1000).c_str(),
+ ToString((i + 100) * 1000 + 999).c_str(),
+ 100U, 0, 100, 100,
+ kEntriesPerFile, kDeletionsPerFile,
+ (i < kTotalSamples));
+ }
+  // Subtract 2x the number of deletion entries because:
+  // 1x: a deletion entry does not count as a data entry.
+  // 1x: each deletion entry will actually remove one data entry.
+  // With kEntriesPerFile = 1000, kDeletionsPerFile = 100 and kNumFiles = 40,
+  // the expected estimate is (1000 - 2 * 100) * 40 = 32000.
+ ASSERT_EQ(vstorage_.GetEstimatedActiveKeys(),
+ (kEntriesPerFile - 2 * kDeletionsPerFile) * kNumFiles);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/version_edit.cc b/src/rocksdb/db/version_edit.cc
new file mode 100644
index 000000000..e45e82656
--- /dev/null
+++ b/src/rocksdb/db/version_edit.cc
@@ -0,0 +1,826 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+
+#include "db/blob_index.h"
+#include "db/version_set.h"
+#include "logging/event_logger.h"
+#include "rocksdb/slice.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+// The unknown file checksum.
+const std::string kUnknownFileChecksum("");
+// The unknown sst file checksum function name.
+const std::string kUnknownFileChecksumFuncName("Unknown");
+// Mask for an identified tag from the future which can be safely ignored.
+const uint32_t kTagSafeIgnoreMask = 1 << 13;
+
+// Tag numbers for serialized VersionEdit. These numbers are written to
+// disk and should not be changed. The number should be forward compatible so
+// users can downgrade RocksDB safely. A future tag can be recognized as
+// safely ignorable by AND-ing it with kTagSafeIgnoreMask.
+enum Tag : uint32_t {
+ kComparator = 1,
+ kLogNumber = 2,
+ kNextFileNumber = 3,
+ kLastSequence = 4,
+ kCompactPointer = 5,
+ kDeletedFile = 6,
+ kNewFile = 7,
+ // 8 was used for large value refs
+ kPrevLogNumber = 9,
+ kMinLogNumberToKeep = 10,
+ // Ignore-able field
+ kDbId = kTagSafeIgnoreMask + 1,
+
+ // these are new formats divergent from open source leveldb
+ kNewFile2 = 100,
+ kNewFile3 = 102,
+ kNewFile4 = 103, // 4th (the latest) format version of adding files
+ kColumnFamily = 200, // specify column family for version edit
+ kColumnFamilyAdd = 201,
+ kColumnFamilyDrop = 202,
+ kMaxColumnFamily = 203,
+
+ kInAtomicGroup = 300,
+};
+
+enum CustomTag : uint32_t {
+ kTerminate = 1, // The end of customized fields
+ kNeedCompaction = 2,
+  // Since the manifest is currently not entirely forward-compatible, and the
+  // only forward-compatible part is the CustomTag of kNewFile, we currently
+  // encode kMinLogNumberToKeep as part of a CustomTag as a hack. This should
+  // be removed when the manifest becomes forward-compatible.
+ kMinLogNumberToKeepHack = 3,
+ kOldestBlobFileNumber = 4,
+ kOldestAncesterTime = 5,
+ kFileCreationTime = 6,
+ kFileChecksum = 7,
+ kFileChecksumFuncName = 8,
+ kPathId = 65,
+};
+// If this bit for the custom tag is set, opening DB should fail if
+// we don't know this field.
+uint32_t kCustomTagNonSafeIgnoreMask = 1 << 6;
+
+uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id) {
+ assert(number <= kFileNumberMask);
+ return number | (path_id * (kFileNumberMask + 1));
+}
+
+void FileMetaData::UpdateBoundaries(const Slice& key, const Slice& value,
+ SequenceNumber seqno,
+ ValueType value_type) {
+ if (smallest.size() == 0) {
+ smallest.DecodeFrom(key);
+ }
+ largest.DecodeFrom(key);
+ fd.smallest_seqno = std::min(fd.smallest_seqno, seqno);
+ fd.largest_seqno = std::max(fd.largest_seqno, seqno);
+
+#ifndef ROCKSDB_LITE
+ if (value_type == kTypeBlobIndex) {
+ BlobIndex blob_index;
+ const Status s = blob_index.DecodeFrom(value);
+ if (!s.ok()) {
+ return;
+ }
+
+ if (blob_index.IsInlined()) {
+ return;
+ }
+
+ if (blob_index.HasTTL()) {
+ return;
+ }
+
+ // Paranoid check: this should not happen because BlobDB numbers the blob
+ // files starting from 1.
+ if (blob_index.file_number() == kInvalidBlobFileNumber) {
+ return;
+ }
+
+ if (oldest_blob_file_number == kInvalidBlobFileNumber ||
+ oldest_blob_file_number > blob_index.file_number()) {
+ oldest_blob_file_number = blob_index.file_number();
+ }
+ }
+#else
+ (void)value;
+ (void)value_type;
+#endif
+}
+
+void VersionEdit::Clear() {
+ max_level_ = 0;
+ db_id_.clear();
+ comparator_.clear();
+ log_number_ = 0;
+ prev_log_number_ = 0;
+ next_file_number_ = 0;
+ max_column_family_ = 0;
+ min_log_number_to_keep_ = 0;
+ last_sequence_ = 0;
+ has_db_id_ = false;
+ has_comparator_ = false;
+ has_log_number_ = false;
+ has_prev_log_number_ = false;
+ has_next_file_number_ = false;
+ has_max_column_family_ = false;
+ has_min_log_number_to_keep_ = false;
+ has_last_sequence_ = false;
+ deleted_files_.clear();
+ new_files_.clear();
+ column_family_ = 0;
+ is_column_family_add_ = false;
+ is_column_family_drop_ = false;
+ column_family_name_.clear();
+ is_in_atomic_group_ = false;
+ remaining_entries_ = 0;
+}
+
+bool VersionEdit::EncodeTo(std::string* dst) const {
+ if (has_db_id_) {
+ PutVarint32(dst, kDbId);
+ PutLengthPrefixedSlice(dst, db_id_);
+ }
+ if (has_comparator_) {
+ PutVarint32(dst, kComparator);
+ PutLengthPrefixedSlice(dst, comparator_);
+ }
+ if (has_log_number_) {
+ PutVarint32Varint64(dst, kLogNumber, log_number_);
+ }
+ if (has_prev_log_number_) {
+ PutVarint32Varint64(dst, kPrevLogNumber, prev_log_number_);
+ }
+ if (has_next_file_number_) {
+ PutVarint32Varint64(dst, kNextFileNumber, next_file_number_);
+ }
+ if (has_max_column_family_) {
+ PutVarint32Varint32(dst, kMaxColumnFamily, max_column_family_);
+ }
+ if (has_last_sequence_) {
+ PutVarint32Varint64(dst, kLastSequence, last_sequence_);
+ }
+ for (const auto& deleted : deleted_files_) {
+ PutVarint32Varint32Varint64(dst, kDeletedFile, deleted.first /* level */,
+ deleted.second /* file number */);
+ }
+
+ bool min_log_num_written = false;
+ for (size_t i = 0; i < new_files_.size(); i++) {
+ const FileMetaData& f = new_files_[i].second;
+ if (!f.smallest.Valid() || !f.largest.Valid()) {
+ return false;
+ }
+ PutVarint32(dst, kNewFile4);
+ PutVarint32Varint64(dst, new_files_[i].first /* level */, f.fd.GetNumber());
+ PutVarint64(dst, f.fd.GetFileSize());
+ PutLengthPrefixedSlice(dst, f.smallest.Encode());
+ PutLengthPrefixedSlice(dst, f.largest.Encode());
+ PutVarint64Varint64(dst, f.fd.smallest_seqno, f.fd.largest_seqno);
+ // Customized fields' format:
+ // +-----------------------------+
+ // | 1st field's tag (varint32) |
+ // +-----------------------------+
+ // | 1st field's size (varint32) |
+ // +-----------------------------+
+ // | bytes for 1st field |
+ // | (based on size decoded) |
+ // +-----------------------------+
+ // | |
+ // | ...... |
+ // | |
+    // +-----------------------------+
+    // | last field's tag (varint32) |
+    // +-----------------------------+
+    // | last field's size (varint32)|
+ // +-----------------------------+
+ // | bytes for last field |
+ // | (based on size decoded) |
+ // +-----------------------------+
+ // | terminating tag (varint32) |
+ // +-----------------------------+
+ //
+ // Customized encoding for fields:
+ // tag kPathId: 1 byte as path_id
+ // tag kNeedCompaction:
+ // now only can take one char value 1 indicating need-compaction
+ //
+ PutVarint32(dst, CustomTag::kOldestAncesterTime);
+ std::string varint_oldest_ancester_time;
+ PutVarint64(&varint_oldest_ancester_time, f.oldest_ancester_time);
+ TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintOldestAncesterTime",
+ &varint_oldest_ancester_time);
+ PutLengthPrefixedSlice(dst, Slice(varint_oldest_ancester_time));
+
+ PutVarint32(dst, CustomTag::kFileCreationTime);
+ std::string varint_file_creation_time;
+ PutVarint64(&varint_file_creation_time, f.file_creation_time);
+ TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintFileCreationTime",
+ &varint_file_creation_time);
+ PutLengthPrefixedSlice(dst, Slice(varint_file_creation_time));
+
+ PutVarint32(dst, CustomTag::kFileChecksum);
+ PutLengthPrefixedSlice(dst, Slice(f.file_checksum));
+
+ PutVarint32(dst, CustomTag::kFileChecksumFuncName);
+ PutLengthPrefixedSlice(dst, Slice(f.file_checksum_func_name));
+
+ if (f.fd.GetPathId() != 0) {
+ PutVarint32(dst, CustomTag::kPathId);
+ char p = static_cast<char>(f.fd.GetPathId());
+ PutLengthPrefixedSlice(dst, Slice(&p, 1));
+ }
+ if (f.marked_for_compaction) {
+ PutVarint32(dst, CustomTag::kNeedCompaction);
+ char p = static_cast<char>(1);
+ PutLengthPrefixedSlice(dst, Slice(&p, 1));
+ }
+ if (has_min_log_number_to_keep_ && !min_log_num_written) {
+ PutVarint32(dst, CustomTag::kMinLogNumberToKeepHack);
+ std::string varint_log_number;
+ PutFixed64(&varint_log_number, min_log_number_to_keep_);
+ PutLengthPrefixedSlice(dst, Slice(varint_log_number));
+ min_log_num_written = true;
+ }
+ if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
+ PutVarint32(dst, CustomTag::kOldestBlobFileNumber);
+ std::string oldest_blob_file_number;
+ PutVarint64(&oldest_blob_file_number, f.oldest_blob_file_number);
+ PutLengthPrefixedSlice(dst, Slice(oldest_blob_file_number));
+ }
+ TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields",
+ dst);
+
+ PutVarint32(dst, CustomTag::kTerminate);
+ }
+
+ // 0 is default and does not need to be explicitly written
+ if (column_family_ != 0) {
+ PutVarint32Varint32(dst, kColumnFamily, column_family_);
+ }
+
+ if (is_column_family_add_) {
+ PutVarint32(dst, kColumnFamilyAdd);
+ PutLengthPrefixedSlice(dst, Slice(column_family_name_));
+ }
+
+ if (is_column_family_drop_) {
+ PutVarint32(dst, kColumnFamilyDrop);
+ }
+
+ if (is_in_atomic_group_) {
+ PutVarint32(dst, kInAtomicGroup);
+ PutVarint32(dst, remaining_entries_);
+ }
+ return true;
+}
+
+static bool GetInternalKey(Slice* input, InternalKey* dst) {
+ Slice str;
+ if (GetLengthPrefixedSlice(input, &str)) {
+ dst->DecodeFrom(str);
+ return dst->Valid();
+ } else {
+ return false;
+ }
+}
+
+bool VersionEdit::GetLevel(Slice* input, int* level, const char** /*msg*/) {
+ uint32_t v = 0;
+ if (GetVarint32(input, &v)) {
+ *level = v;
+ if (max_level_ < *level) {
+ max_level_ = *level;
+ }
+ return true;
+ } else {
+ return false;
+ }
+}
+
+static bool is_pseudo_new_file_record_pr3488(
+ const int level,
+ const uint64_t number,
+ const uint64_t file_size,
+ InternalKey& smallest,
+ InternalKey& largest,
+ const bool has_min_log_number_to_keep_) {
+
+ if (level == 0 && number == 0 && file_size == 0 &&
+ has_min_log_number_to_keep_) {
+ InternalKey dummy_key(Slice("dummy_key"), 0ull, ValueType::kTypeValue);
+ return (*smallest.rep() == *dummy_key.rep() &&
+ *largest.rep() == *dummy_key.rep());
+ } else {
+ return false;
+ }
+}
+
+const char* VersionEdit::DecodeNewFile4From(Slice* input) {
+ const char* msg = nullptr;
+ int level = 0;
+ FileMetaData f;
+ uint64_t number = 0;
+ uint32_t path_id = 0;
+ uint64_t file_size = 0;
+ SequenceNumber smallest_seqno = 0;
+ SequenceNumber largest_seqno = kMaxSequenceNumber;
+  // Since this is the only forward-compatible part of the code, we hack new
+  // extensions into this record. When we do, we set this boolean to distinguish
+ // the record from the normal NewFile records.
+ if (GetLevel(input, &level, &msg) && GetVarint64(input, &number) &&
+ GetVarint64(input, &file_size) && GetInternalKey(input, &f.smallest) &&
+ GetInternalKey(input, &f.largest) &&
+ GetVarint64(input, &smallest_seqno) &&
+ GetVarint64(input, &largest_seqno)) {
+ // See comments in VersionEdit::EncodeTo() for format of customized fields
+ while (true) {
+ uint32_t custom_tag = 0;
+ Slice field;
+ if (!GetVarint32(input, &custom_tag)) {
+ return "new-file4 custom field";
+ }
+ if (custom_tag == kTerminate) {
+ break;
+ }
+ if (!GetLengthPrefixedSlice(input, &field)) {
+ return "new-file4 custom field length prefixed slice error";
+ }
+ switch (custom_tag) {
+ case kPathId:
+ if (field.size() != 1) {
+ return "path_id field wrong size";
+ }
+ path_id = field[0];
+ if (path_id > 3) {
+ return "path_id wrong vaue";
+ }
+ break;
+ case kOldestAncesterTime:
+ if (!GetVarint64(&field, &f.oldest_ancester_time)) {
+ return "invalid oldest ancester time";
+ }
+ break;
+ case kFileCreationTime:
+ if (!GetVarint64(&field, &f.file_creation_time)) {
+ return "invalid file creation time";
+ }
+ break;
+ case kFileChecksum:
+ f.file_checksum = field.ToString();
+ break;
+ case kFileChecksumFuncName:
+ f.file_checksum_func_name = field.ToString();
+ break;
+ case kNeedCompaction:
+ if (field.size() != 1) {
+ return "need_compaction field wrong size";
+ }
+ f.marked_for_compaction = (field[0] == 1);
+ break;
+ case kMinLogNumberToKeepHack:
+ // This is a hack to encode kMinLogNumberToKeep in a
+ // forward-compatible fashion.
+ if (!GetFixed64(&field, &min_log_number_to_keep_)) {
+ return "deleted log number malformatted";
+ }
+ has_min_log_number_to_keep_ = true;
+ break;
+ case kOldestBlobFileNumber:
+ if (!GetVarint64(&field, &f.oldest_blob_file_number)) {
+ return "invalid oldest blob file number";
+ }
+ break;
+ default:
+ if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) {
+ // Should not proceed if cannot understand it
+ return "new-file4 custom field not supported";
+ }
+ break;
+ }
+ }
+ } else {
+ return "new-file4 entry";
+ }
+ if (is_pseudo_new_file_record_pr3488(level, number, file_size,
+ f.smallest, f.largest,
+ has_min_log_number_to_keep_)) {
+ // Since this has nothing to do with NewFile, return immediately.
+ return nullptr;
+ }
+ f.fd =
+ FileDescriptor(number, path_id, file_size, smallest_seqno, largest_seqno);
+ new_files_.push_back(std::make_pair(level, f));
+ return nullptr;
+}
+
+Status VersionEdit::DecodeFrom(const Slice& src) {
+ Clear();
+ Slice input = src;
+ const char* msg = nullptr;
+ uint32_t tag = 0;
+
+ // Temporary storage for parsing
+ int level = 0;
+ FileMetaData f;
+ Slice str;
+ InternalKey key;
+ while (msg == nullptr && GetVarint32(&input, &tag)) {
+ switch (tag) {
+ case kDbId:
+ if (GetLengthPrefixedSlice(&input, &str)) {
+ db_id_ = str.ToString();
+ has_db_id_ = true;
+ } else {
+ msg = "db id";
+ }
+ break;
+ case kComparator:
+ if (GetLengthPrefixedSlice(&input, &str)) {
+ comparator_ = str.ToString();
+ has_comparator_ = true;
+ } else {
+ msg = "comparator name";
+ }
+ break;
+
+ case kLogNumber:
+ if (GetVarint64(&input, &log_number_)) {
+ has_log_number_ = true;
+ } else {
+ msg = "log number";
+ }
+ break;
+
+ case kPrevLogNumber:
+ if (GetVarint64(&input, &prev_log_number_)) {
+ has_prev_log_number_ = true;
+ } else {
+ msg = "previous log number";
+ }
+ break;
+
+ case kNextFileNumber:
+ if (GetVarint64(&input, &next_file_number_)) {
+ has_next_file_number_ = true;
+ } else {
+ msg = "next file number";
+ }
+ break;
+
+ case kMaxColumnFamily:
+ if (GetVarint32(&input, &max_column_family_)) {
+ has_max_column_family_ = true;
+ } else {
+ msg = "max column family";
+ }
+ break;
+
+ case kMinLogNumberToKeep:
+ if (GetVarint64(&input, &min_log_number_to_keep_)) {
+ has_min_log_number_to_keep_ = true;
+ } else {
+ msg = "min log number to kee";
+ }
+ break;
+
+ case kLastSequence:
+ if (GetVarint64(&input, &last_sequence_)) {
+ has_last_sequence_ = true;
+ } else {
+ msg = "last sequence number";
+ }
+ break;
+
+ case kCompactPointer:
+ if (GetLevel(&input, &level, &msg) &&
+ GetInternalKey(&input, &key)) {
+          // We don't use compact pointers anymore,
+          // but we should not fail if they are still
+          // in the manifest.
+ } else {
+ if (!msg) {
+ msg = "compaction pointer";
+ }
+ }
+ break;
+
+ case kDeletedFile: {
+ uint64_t number = 0;
+ if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number)) {
+ deleted_files_.insert(std::make_pair(level, number));
+ } else {
+ if (!msg) {
+ msg = "deleted file";
+ }
+ }
+ break;
+ }
+
+ case kNewFile: {
+ uint64_t number = 0;
+ uint64_t file_size = 0;
+ if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
+ GetVarint64(&input, &file_size) &&
+ GetInternalKey(&input, &f.smallest) &&
+ GetInternalKey(&input, &f.largest)) {
+ f.fd = FileDescriptor(number, 0, file_size);
+ new_files_.push_back(std::make_pair(level, f));
+ } else {
+ if (!msg) {
+ msg = "new-file entry";
+ }
+ }
+ break;
+ }
+ case kNewFile2: {
+ uint64_t number = 0;
+ uint64_t file_size = 0;
+ SequenceNumber smallest_seqno = 0;
+ SequenceNumber largest_seqno = kMaxSequenceNumber;
+ if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
+ GetVarint64(&input, &file_size) &&
+ GetInternalKey(&input, &f.smallest) &&
+ GetInternalKey(&input, &f.largest) &&
+ GetVarint64(&input, &smallest_seqno) &&
+ GetVarint64(&input, &largest_seqno)) {
+ f.fd = FileDescriptor(number, 0, file_size, smallest_seqno,
+ largest_seqno);
+ new_files_.push_back(std::make_pair(level, f));
+ } else {
+ if (!msg) {
+ msg = "new-file2 entry";
+ }
+ }
+ break;
+ }
+
+ case kNewFile3: {
+ uint64_t number = 0;
+ uint32_t path_id = 0;
+ uint64_t file_size = 0;
+ SequenceNumber smallest_seqno = 0;
+ SequenceNumber largest_seqno = kMaxSequenceNumber;
+ if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
+ GetVarint32(&input, &path_id) && GetVarint64(&input, &file_size) &&
+ GetInternalKey(&input, &f.smallest) &&
+ GetInternalKey(&input, &f.largest) &&
+ GetVarint64(&input, &smallest_seqno) &&
+ GetVarint64(&input, &largest_seqno)) {
+ f.fd = FileDescriptor(number, path_id, file_size, smallest_seqno,
+ largest_seqno);
+ new_files_.push_back(std::make_pair(level, f));
+ } else {
+ if (!msg) {
+ msg = "new-file3 entry";
+ }
+ }
+ break;
+ }
+
+ case kNewFile4: {
+ msg = DecodeNewFile4From(&input);
+ break;
+ }
+
+ case kColumnFamily:
+ if (!GetVarint32(&input, &column_family_)) {
+ if (!msg) {
+ msg = "set column family id";
+ }
+ }
+ break;
+
+ case kColumnFamilyAdd:
+ if (GetLengthPrefixedSlice(&input, &str)) {
+ is_column_family_add_ = true;
+ column_family_name_ = str.ToString();
+ } else {
+ if (!msg) {
+ msg = "column family add";
+ }
+ }
+ break;
+
+ case kColumnFamilyDrop:
+ is_column_family_drop_ = true;
+ break;
+
+ case kInAtomicGroup:
+ is_in_atomic_group_ = true;
+ if (!GetVarint32(&input, &remaining_entries_)) {
+ if (!msg) {
+ msg = "remaining entries";
+ }
+ }
+ break;
+
+ default:
+ if (tag & kTagSafeIgnoreMask) {
+          // Tag from the future which can be safely ignored.
+ // The next field must be the length of the entry.
+ uint32_t field_len;
+ if (!GetVarint32(&input, &field_len) ||
+ static_cast<size_t>(field_len) > input.size()) {
+ if (!msg) {
+ msg = "safely ignoreable tag length error";
+ }
+ } else {
+ input.remove_prefix(static_cast<size_t>(field_len));
+ }
+ } else {
+ msg = "unknown tag";
+ }
+ break;
+ }
+ }
+
+ if (msg == nullptr && !input.empty()) {
+ msg = "invalid tag";
+ }
+
+ Status result;
+ if (msg != nullptr) {
+ result = Status::Corruption("VersionEdit", msg);
+ }
+ return result;
+}
+
+std::string VersionEdit::DebugString(bool hex_key) const {
+ std::string r;
+ r.append("VersionEdit {");
+ if (has_db_id_) {
+ r.append("\n DB ID: ");
+ r.append(db_id_);
+ }
+ if (has_comparator_) {
+ r.append("\n Comparator: ");
+ r.append(comparator_);
+ }
+ if (has_log_number_) {
+ r.append("\n LogNumber: ");
+ AppendNumberTo(&r, log_number_);
+ }
+ if (has_prev_log_number_) {
+ r.append("\n PrevLogNumber: ");
+ AppendNumberTo(&r, prev_log_number_);
+ }
+ if (has_next_file_number_) {
+ r.append("\n NextFileNumber: ");
+ AppendNumberTo(&r, next_file_number_);
+ }
+ if (has_max_column_family_) {
+ r.append("\n MaxColumnFamily: ");
+ AppendNumberTo(&r, max_column_family_);
+ }
+ if (has_min_log_number_to_keep_) {
+ r.append("\n MinLogNumberToKeep: ");
+ AppendNumberTo(&r, min_log_number_to_keep_);
+ }
+ if (has_last_sequence_) {
+ r.append("\n LastSeq: ");
+ AppendNumberTo(&r, last_sequence_);
+ }
+ for (const auto& deleted_file : deleted_files_) {
+ r.append("\n DeleteFile: ");
+ AppendNumberTo(&r, deleted_file.first);
+ r.append(" ");
+ AppendNumberTo(&r, deleted_file.second);
+ }
+ for (size_t i = 0; i < new_files_.size(); i++) {
+ const FileMetaData& f = new_files_[i].second;
+ r.append("\n AddFile: ");
+ AppendNumberTo(&r, new_files_[i].first);
+ r.append(" ");
+ AppendNumberTo(&r, f.fd.GetNumber());
+ r.append(" ");
+ AppendNumberTo(&r, f.fd.GetFileSize());
+ r.append(" ");
+ r.append(f.smallest.DebugString(hex_key));
+ r.append(" .. ");
+ r.append(f.largest.DebugString(hex_key));
+ if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
+ r.append(" blob_file:");
+ AppendNumberTo(&r, f.oldest_blob_file_number);
+ }
+ r.append(" oldest_ancester_time:");
+ AppendNumberTo(&r, f.oldest_ancester_time);
+ r.append(" file_creation_time:");
+ AppendNumberTo(&r, f.file_creation_time);
+ r.append(" file_checksum:");
+ r.append(f.file_checksum);
+ r.append(" file_checksum_func_name: ");
+ r.append(f.file_checksum_func_name);
+ }
+ r.append("\n ColumnFamily: ");
+ AppendNumberTo(&r, column_family_);
+ if (is_column_family_add_) {
+ r.append("\n ColumnFamilyAdd: ");
+ r.append(column_family_name_);
+ }
+ if (is_column_family_drop_) {
+ r.append("\n ColumnFamilyDrop");
+ }
+ if (is_in_atomic_group_) {
+ r.append("\n AtomicGroup: ");
+ AppendNumberTo(&r, remaining_entries_);
+ r.append(" entries remains");
+ }
+ r.append("\n}\n");
+ return r;
+}
+
+std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const {
+ JSONWriter jw;
+ jw << "EditNumber" << edit_num;
+
+ if (has_db_id_) {
+ jw << "DB ID" << db_id_;
+ }
+ if (has_comparator_) {
+ jw << "Comparator" << comparator_;
+ }
+ if (has_log_number_) {
+ jw << "LogNumber" << log_number_;
+ }
+ if (has_prev_log_number_) {
+ jw << "PrevLogNumber" << prev_log_number_;
+ }
+ if (has_next_file_number_) {
+ jw << "NextFileNumber" << next_file_number_;
+ }
+ if (has_max_column_family_) {
+ jw << "MaxColumnFamily" << max_column_family_;
+ }
+ if (has_min_log_number_to_keep_) {
+ jw << "MinLogNumberToKeep" << min_log_number_to_keep_;
+ }
+ if (has_last_sequence_) {
+ jw << "LastSeq" << last_sequence_;
+ }
+
+ if (!deleted_files_.empty()) {
+ jw << "DeletedFiles";
+ jw.StartArray();
+
+ for (const auto& deleted_file : deleted_files_) {
+ jw.StartArrayedObject();
+ jw << "Level" << deleted_file.first;
+ jw << "FileNumber" << deleted_file.second;
+ jw.EndArrayedObject();
+ }
+
+ jw.EndArray();
+ }
+
+ if (!new_files_.empty()) {
+ jw << "AddedFiles";
+ jw.StartArray();
+
+ for (size_t i = 0; i < new_files_.size(); i++) {
+ jw.StartArrayedObject();
+ jw << "Level" << new_files_[i].first;
+ const FileMetaData& f = new_files_[i].second;
+ jw << "FileNumber" << f.fd.GetNumber();
+ jw << "FileSize" << f.fd.GetFileSize();
+ jw << "SmallestIKey" << f.smallest.DebugString(hex_key);
+ jw << "LargestIKey" << f.largest.DebugString(hex_key);
+ if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
+ jw << "OldestBlobFile" << f.oldest_blob_file_number;
+ }
+ jw.EndArrayedObject();
+ }
+
+ jw.EndArray();
+ }
+
+ jw << "ColumnFamily" << column_family_;
+
+ if (is_column_family_add_) {
+ jw << "ColumnFamilyAdd" << column_family_name_;
+ }
+ if (is_column_family_drop_) {
+ jw << "ColumnFamilyDrop" << column_family_name_;
+ }
+ if (is_in_atomic_group_) {
+ jw << "AtomicGroup" << remaining_entries_;
+ }
+
+ jw.EndObject();
+
+ return jw.Get();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_edit.h b/src/rocksdb/db/version_edit.h
new file mode 100644
index 000000000..6d1893f2a
--- /dev/null
+++ b/src/rocksdb/db/version_edit.h
@@ -0,0 +1,438 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <algorithm>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+#include "db/dbformat.h"
+#include "memory/arena.h"
+#include "rocksdb/cache.h"
+#include "table/table_reader.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class VersionSet;
+
+constexpr uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF;
+constexpr uint64_t kInvalidBlobFileNumber = 0;
+constexpr uint64_t kUnknownOldestAncesterTime = 0;
+constexpr uint64_t kUnknownFileCreationTime = 0;
+
+extern const std::string kUnknownFileChecksum;
+extern const std::string kUnknownFileChecksumFuncName;
+
+extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id);
+
+// A copyable structure contains information needed to read data from an SST
+// file. It can contain a pointer to a table reader opened for the file, or
+// file number and size, which can be used to create a new table reader for it.
+// The behavior is undefined when a copy of the structure is used after the
+// file is no longer part of any live version.
+struct FileDescriptor {
+ // Table reader in table_reader_handle
+ TableReader* table_reader;
+ uint64_t packed_number_and_path_id;
+ uint64_t file_size; // File size in bytes
+ SequenceNumber smallest_seqno; // The smallest seqno in this file
+ SequenceNumber largest_seqno; // The largest seqno in this file
+
+ FileDescriptor() : FileDescriptor(0, 0, 0) {}
+
+ FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size)
+ : FileDescriptor(number, path_id, _file_size, kMaxSequenceNumber, 0) {}
+
+ FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size,
+ SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno)
+ : table_reader(nullptr),
+ packed_number_and_path_id(PackFileNumberAndPathId(number, path_id)),
+ file_size(_file_size),
+ smallest_seqno(_smallest_seqno),
+ largest_seqno(_largest_seqno) {}
+
+ FileDescriptor(const FileDescriptor& fd) { *this = fd; }
+
+ FileDescriptor& operator=(const FileDescriptor& fd) {
+ table_reader = fd.table_reader;
+ packed_number_and_path_id = fd.packed_number_and_path_id;
+ file_size = fd.file_size;
+ smallest_seqno = fd.smallest_seqno;
+ largest_seqno = fd.largest_seqno;
+ return *this;
+ }
+
+ uint64_t GetNumber() const {
+ return packed_number_and_path_id & kFileNumberMask;
+ }
+ uint32_t GetPathId() const {
+ return static_cast<uint32_t>(
+ packed_number_and_path_id / (kFileNumberMask + 1));
+ }
+ uint64_t GetFileSize() const { return file_size; }
+};
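
For intuition, here is a minimal standalone sketch (not part of the patch; the Pack() helper below is a hypothetical stand-in for the PackFileNumberAndPathId() declared above) of a packing scheme that is consistent with the GetNumber() and GetPathId() accessors:

#include <cassert>
#include <cstdint>

// Mirrors kFileNumberMask above: the low 62 bits hold the file number and the
// remaining high bits hold the path ID.
constexpr uint64_t kMask = 0x3FFFFFFFFFFFFFFF;

// Hypothetical packing helper, written so that `packed & kMask` recovers the
// file number and `packed / (kMask + 1)` recovers the path ID, matching the
// decoding done by GetNumber() and GetPathId().
uint64_t Pack(uint64_t number, uint32_t path_id) {
  assert(number <= kMask);
  return number | (static_cast<uint64_t>(path_id) * (kMask + 1));
}

int main() {
  uint64_t packed = Pack(/*number=*/12345, /*path_id=*/3);
  assert((packed & kMask) == 12345);   // what GetNumber() computes
  assert(packed / (kMask + 1) == 3);   // what GetPathId() computes
  return 0;
}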
+
+struct FileSampledStats {
+ FileSampledStats() : num_reads_sampled(0) {}
+ FileSampledStats(const FileSampledStats& other) { *this = other; }
+ FileSampledStats& operator=(const FileSampledStats& other) {
+ num_reads_sampled = other.num_reads_sampled.load();
+ return *this;
+ }
+
+ // number of user reads to this file.
+ mutable std::atomic<uint64_t> num_reads_sampled;
+};
+
+struct FileMetaData {
+ FileDescriptor fd;
+ InternalKey smallest; // Smallest internal key served by table
+ InternalKey largest; // Largest internal key served by table
+
+ // Needs to be disposed when refs becomes 0.
+ Cache::Handle* table_reader_handle = nullptr;
+
+ FileSampledStats stats;
+
+ // Stats for compensating deletion entries during compaction
+
+ // File size compensated by deletion entry.
+  // This is updated in Version::UpdateAccumulatedStats() the first time the
+ // file is created or loaded. After it is updated (!= 0), it is immutable.
+ uint64_t compensated_file_size = 0;
+ // These values can mutate, but they can only be read or written from
+ // single-threaded LogAndApply thread
+ uint64_t num_entries = 0; // the number of entries.
+ uint64_t num_deletions = 0; // the number of deletion entries.
+ uint64_t raw_key_size = 0; // total uncompressed key size.
+ uint64_t raw_value_size = 0; // total uncompressed value size.
+
+ int refs = 0; // Reference count
+
+ bool being_compacted = false; // Is this file undergoing compaction?
+  bool init_stats_from_file = false;    // true if the data-entry stats of this
+                                         // file have been initialized from it.
+
+ bool marked_for_compaction = false; // True if client asked us nicely to
+ // compact this file.
+
+ // Used only in BlobDB. The file number of the oldest blob file this SST file
+ // refers to. 0 is an invalid value; BlobDB numbers the files starting from 1.
+ uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
+
+  // The file could be the compaction output of other SST files, which could
+  // in turn be outputs of compactions of still older SST files. We track the
+  // memtable flush timestamp of the oldest SST file that eventually
+  // contributed data to this file. 0 means the information is not available.
+ uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
+
+ // Unix time when the SST file is created.
+ uint64_t file_creation_time = kUnknownFileCreationTime;
+
+ // File checksum
+ std::string file_checksum = kUnknownFileChecksum;
+
+ // File checksum function name
+ std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
+
+ FileMetaData() = default;
+
+ FileMetaData(uint64_t file, uint32_t file_path_id, uint64_t file_size,
+ const InternalKey& smallest_key, const InternalKey& largest_key,
+ const SequenceNumber& smallest_seq,
+ const SequenceNumber& largest_seq, bool marked_for_compact,
+ uint64_t oldest_blob_file, uint64_t _oldest_ancester_time,
+ uint64_t _file_creation_time, const std::string& _file_checksum,
+ const std::string& _file_checksum_func_name)
+ : fd(file, file_path_id, file_size, smallest_seq, largest_seq),
+ smallest(smallest_key),
+ largest(largest_key),
+ marked_for_compaction(marked_for_compact),
+ oldest_blob_file_number(oldest_blob_file),
+ oldest_ancester_time(_oldest_ancester_time),
+ file_creation_time(_file_creation_time),
+ file_checksum(_file_checksum),
+ file_checksum_func_name(_file_checksum_func_name) {
+ TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this);
+ }
+
+ // REQUIRED: Keys must be given to the function in sorted order (it expects
+ // the last key to be the largest).
+ void UpdateBoundaries(const Slice& key, const Slice& value,
+ SequenceNumber seqno, ValueType value_type);
+
+ // Unlike UpdateBoundaries, ranges do not need to be presented in any
+ // particular order.
+ void UpdateBoundariesForRange(const InternalKey& start,
+ const InternalKey& end, SequenceNumber seqno,
+ const InternalKeyComparator& icmp) {
+ if (smallest.size() == 0 || icmp.Compare(start, smallest) < 0) {
+ smallest = start;
+ }
+ if (largest.size() == 0 || icmp.Compare(largest, end) < 0) {
+ largest = end;
+ }
+ fd.smallest_seqno = std::min(fd.smallest_seqno, seqno);
+ fd.largest_seqno = std::max(fd.largest_seqno, seqno);
+ }
+
+ // Try to get oldest ancester time from the class itself or table properties
+ // if table reader is already pinned.
+ // 0 means the information is not available.
+ uint64_t TryGetOldestAncesterTime() {
+ if (oldest_ancester_time != kUnknownOldestAncesterTime) {
+ return oldest_ancester_time;
+ } else if (fd.table_reader != nullptr &&
+ fd.table_reader->GetTableProperties() != nullptr) {
+ return fd.table_reader->GetTableProperties()->creation_time;
+ }
+ return kUnknownOldestAncesterTime;
+ }
+
+ uint64_t TryGetFileCreationTime() {
+ if (file_creation_time != kUnknownFileCreationTime) {
+ return file_creation_time;
+ } else if (fd.table_reader != nullptr &&
+ fd.table_reader->GetTableProperties() != nullptr) {
+ return fd.table_reader->GetTableProperties()->file_creation_time;
+ }
+ return kUnknownFileCreationTime;
+ }
+};
+
+// A compressed copy of file metadata that contains just the minimum data
+// needed to serve read operations, while still keeping a pointer to the full
+// metadata of the file in case it is needed.
+struct FdWithKeyRange {
+ FileDescriptor fd;
+ FileMetaData* file_metadata; // Point to all metadata
+ Slice smallest_key; // slice that contain smallest key
+ Slice largest_key; // slice that contain largest key
+
+ FdWithKeyRange()
+ : fd(),
+ file_metadata(nullptr),
+ smallest_key(),
+ largest_key() {
+ }
+
+ FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key,
+ FileMetaData* _file_metadata)
+ : fd(_fd),
+ file_metadata(_file_metadata),
+ smallest_key(_smallest_key),
+ largest_key(_largest_key) {}
+};
+
+// Data structure to store an array of FdWithKeyRange in one level.
+// The actual data is guaranteed to be stored contiguously.
+struct LevelFilesBrief {
+ size_t num_files;
+ FdWithKeyRange* files;
+ LevelFilesBrief() {
+ num_files = 0;
+ files = nullptr;
+ }
+};
+
+// The state of a DB at any given time is referred to as a Version.
+// Any modification to the Version is considered a Version Edit. A Version is
+// constructed by joining a sequence of Version Edits. Version Edits are written
+// to the MANIFEST file.
+class VersionEdit {
+ public:
+ void Clear();
+
+ void SetDBId(const std::string& db_id) {
+ has_db_id_ = true;
+ db_id_ = db_id;
+ }
+ bool HasDbId() const { return has_db_id_; }
+ const std::string& GetDbId() const { return db_id_; }
+
+ void SetComparatorName(const Slice& name) {
+ has_comparator_ = true;
+ comparator_ = name.ToString();
+ }
+ bool HasComparatorName() const { return has_comparator_; }
+ const std::string& GetComparatorName() const { return comparator_; }
+
+ void SetLogNumber(uint64_t num) {
+ has_log_number_ = true;
+ log_number_ = num;
+ }
+ bool HasLogNumber() const { return has_log_number_; }
+ uint64_t GetLogNumber() const { return log_number_; }
+
+ void SetPrevLogNumber(uint64_t num) {
+ has_prev_log_number_ = true;
+ prev_log_number_ = num;
+ }
+ bool HasPrevLogNumber() const { return has_prev_log_number_; }
+ uint64_t GetPrevLogNumber() const { return prev_log_number_; }
+
+ void SetNextFile(uint64_t num) {
+ has_next_file_number_ = true;
+ next_file_number_ = num;
+ }
+ bool HasNextFile() const { return has_next_file_number_; }
+ uint64_t GetNextFile() const { return next_file_number_; }
+
+ void SetMaxColumnFamily(uint32_t max_column_family) {
+ has_max_column_family_ = true;
+ max_column_family_ = max_column_family;
+ }
+ bool HasMaxColumnFamily() const { return has_max_column_family_; }
+ uint32_t GetMaxColumnFamily() const { return max_column_family_; }
+
+ void SetMinLogNumberToKeep(uint64_t num) {
+ has_min_log_number_to_keep_ = true;
+ min_log_number_to_keep_ = num;
+ }
+ bool HasMinLogNumberToKeep() const { return has_min_log_number_to_keep_; }
+ uint64_t GetMinLogNumberToKeep() const { return min_log_number_to_keep_; }
+
+ void SetLastSequence(SequenceNumber seq) {
+ has_last_sequence_ = true;
+ last_sequence_ = seq;
+ }
+ bool HasLastSequence() const { return has_last_sequence_; }
+ SequenceNumber GetLastSequence() const { return last_sequence_; }
+
+ // Delete the specified "file" from the specified "level".
+ void DeleteFile(int level, uint64_t file) {
+ deleted_files_.emplace(level, file);
+ }
+
+ // Retrieve the files deleted as well as their associated levels.
+ using DeletedFiles = std::set<std::pair<int, uint64_t>>;
+ const DeletedFiles& GetDeletedFiles() const { return deleted_files_; }
+
+ // Add the specified file at the specified level.
+ // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
+ // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
+ // REQUIRES: "oldest_blob_file_number" is the number of the oldest blob file
+ // referred to by this file if any, kInvalidBlobFileNumber otherwise.
+ void AddFile(int level, uint64_t file, uint32_t file_path_id,
+ uint64_t file_size, const InternalKey& smallest,
+ const InternalKey& largest, const SequenceNumber& smallest_seqno,
+ const SequenceNumber& largest_seqno, bool marked_for_compaction,
+ uint64_t oldest_blob_file_number, uint64_t oldest_ancester_time,
+ uint64_t file_creation_time, const std::string& file_checksum,
+ const std::string& file_checksum_func_name) {
+ assert(smallest_seqno <= largest_seqno);
+ new_files_.emplace_back(
+ level, FileMetaData(file, file_path_id, file_size, smallest, largest,
+ smallest_seqno, largest_seqno,
+ marked_for_compaction, oldest_blob_file_number,
+ oldest_ancester_time, file_creation_time,
+ file_checksum, file_checksum_func_name));
+ }
+
+ void AddFile(int level, const FileMetaData& f) {
+ assert(f.fd.smallest_seqno <= f.fd.largest_seqno);
+ new_files_.emplace_back(level, f);
+ }
+
+ // Retrieve the files added as well as their associated levels.
+ using NewFiles = std::vector<std::pair<int, FileMetaData>>;
+ const NewFiles& GetNewFiles() const { return new_files_; }
+
+  // Number of file additions and deletions recorded in this edit
+ size_t NumEntries() const { return new_files_.size() + deleted_files_.size(); }
+
+ void SetColumnFamily(uint32_t column_family_id) {
+ column_family_ = column_family_id;
+ }
+ uint32_t GetColumnFamily() const { return column_family_; }
+
+ // set column family ID by calling SetColumnFamily()
+ void AddColumnFamily(const std::string& name) {
+ assert(!is_column_family_drop_);
+ assert(!is_column_family_add_);
+ assert(NumEntries() == 0);
+ is_column_family_add_ = true;
+ column_family_name_ = name;
+ }
+
+ // set column family ID by calling SetColumnFamily()
+ void DropColumnFamily() {
+ assert(!is_column_family_drop_);
+ assert(!is_column_family_add_);
+ assert(NumEntries() == 0);
+ is_column_family_drop_ = true;
+ }
+
+ bool IsColumnFamilyManipulation() const {
+ return is_column_family_add_ || is_column_family_drop_;
+ }
+
+ void MarkAtomicGroup(uint32_t remaining_entries) {
+ is_in_atomic_group_ = true;
+ remaining_entries_ = remaining_entries;
+ }
+ bool IsInAtomicGroup() const { return is_in_atomic_group_; }
+ uint32_t GetRemainingEntries() const { return remaining_entries_; }
+
+ // return true on success.
+ bool EncodeTo(std::string* dst) const;
+ Status DecodeFrom(const Slice& src);
+
+ std::string DebugString(bool hex_key = false) const;
+ std::string DebugJSON(int edit_num, bool hex_key = false) const;
+
+ private:
+ friend class ReactiveVersionSet;
+ friend class VersionSet;
+ friend class Version;
+ friend class AtomicGroupReadBuffer;
+
+ bool GetLevel(Slice* input, int* level, const char** msg);
+
+ const char* DecodeNewFile4From(Slice* input);
+
+ int max_level_ = 0;
+ std::string db_id_;
+ std::string comparator_;
+ uint64_t log_number_ = 0;
+ uint64_t prev_log_number_ = 0;
+ uint64_t next_file_number_ = 0;
+ uint32_t max_column_family_ = 0;
+  // WAL log numbers smaller than this can safely be deleted
+ uint64_t min_log_number_to_keep_ = 0;
+ SequenceNumber last_sequence_ = 0;
+ bool has_db_id_ = false;
+ bool has_comparator_ = false;
+ bool has_log_number_ = false;
+ bool has_prev_log_number_ = false;
+ bool has_next_file_number_ = false;
+ bool has_max_column_family_ = false;
+ bool has_min_log_number_to_keep_ = false;
+ bool has_last_sequence_ = false;
+
+ DeletedFiles deleted_files_;
+ NewFiles new_files_;
+
+  // Each version edit record should have column_family_ set.
+  // If it's not set, it defaults to 0 (the default column family).
+ uint32_t column_family_ = 0;
+  // A version edit can be either a column family add or a column family
+  // drop. If it's a column family add, it also includes the column family
+  // name.
+ bool is_column_family_drop_ = false;
+ bool is_column_family_add_ = false;
+ std::string column_family_name_;
+
+ bool is_in_atomic_group_ = false;
+ uint32_t remaining_entries_ = 0;
+};
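
As a usage sketch only (it assumes compilation inside the RocksDB tree so that db/version_edit.h and its dependencies are available; the specific file numbers and keys are arbitrary), a round trip through the public API declared above looks roughly like the unit tests further below:

#include <cassert>
#include <string>

#include "db/version_edit.h"

using namespace ROCKSDB_NAMESPACE;

void RoundTripSketch() {
  VersionEdit edit;
  edit.SetComparatorName("leveldb.BytewiseComparator");
  edit.SetLogNumber(100);
  edit.SetNextFile(200);
  edit.SetLastSequence(1000);
  edit.AddFile(/*level=*/1, /*file=*/7, /*file_path_id=*/0, /*file_size=*/4096,
               InternalKey("a", 1, kTypeValue), InternalKey("z", 2, kTypeValue),
               /*smallest_seqno=*/1, /*largest_seqno=*/2,
               /*marked_for_compaction=*/false, kInvalidBlobFileNumber,
               kUnknownOldestAncesterTime, kUnknownFileCreationTime,
               kUnknownFileChecksum, kUnknownFileChecksumFuncName);
  edit.DeleteFile(/*level=*/2, /*file=*/5);

  std::string record;
  edit.EncodeTo(&record);  // this is the payload that gets appended to the MANIFEST

  VersionEdit parsed;
  Status s = parsed.DecodeFrom(record);
  assert(s.ok());
  assert(parsed.GetNewFiles().size() == 1);
  assert(parsed.GetDeletedFiles().size() == 1);
}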
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_edit_test.cc b/src/rocksdb/db/version_edit_test.cc
new file mode 100644
index 000000000..8bc884df9
--- /dev/null
+++ b/src/rocksdb/db/version_edit_test.cc
@@ -0,0 +1,286 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static void TestEncodeDecode(const VersionEdit& edit) {
+ std::string encoded, encoded2;
+ edit.EncodeTo(&encoded);
+ VersionEdit parsed;
+ Status s = parsed.DecodeFrom(encoded);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ parsed.EncodeTo(&encoded2);
+ ASSERT_EQ(encoded, encoded2);
+}
+
+class VersionEditTest : public testing::Test {};
+
+TEST_F(VersionEditTest, EncodeDecode) {
+ static const uint64_t kBig = 1ull << 50;
+ static const uint32_t kBig32Bit = 1ull << 30;
+
+ VersionEdit edit;
+ for (int i = 0; i < 4; i++) {
+ TestEncodeDecode(edit);
+ edit.AddFile(3, kBig + 300 + i, kBig32Bit + 400 + i, 0,
+ InternalKey("foo", kBig + 500 + i, kTypeValue),
+ InternalKey("zoo", kBig + 600 + i, kTypeDeletion),
+ kBig + 500 + i, kBig + 600 + i, false, kInvalidBlobFileNumber,
+ 888, 678, "234", "crc32c");
+ edit.DeleteFile(4, kBig + 700 + i);
+ }
+
+ edit.SetComparatorName("foo");
+ edit.SetLogNumber(kBig + 100);
+ edit.SetNextFile(kBig + 200);
+ edit.SetLastSequence(kBig + 1000);
+ TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, EncodeDecodeNewFile4) {
+ static const uint64_t kBig = 1ull << 50;
+
+ VersionEdit edit;
+ edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue),
+ InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
+ kBig + 600, true, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+ edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue),
+ InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501,
+ kBig + 601, false, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+ edit.AddFile(5, 302, 0, 100, InternalKey("foo", kBig + 502, kTypeValue),
+ InternalKey("zoo", kBig + 602, kTypeDeletion), kBig + 502,
+ kBig + 602, true, kInvalidBlobFileNumber, 666, 888,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+ edit.AddFile(5, 303, 0, 100, InternalKey("foo", kBig + 503, kTypeBlobIndex),
+ InternalKey("zoo", kBig + 603, kTypeBlobIndex), kBig + 503,
+ kBig + 603, true, 1001, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+
+ edit.DeleteFile(4, 700);
+
+ edit.SetComparatorName("foo");
+ edit.SetLogNumber(kBig + 100);
+ edit.SetNextFile(kBig + 200);
+ edit.SetLastSequence(kBig + 1000);
+ TestEncodeDecode(edit);
+
+ std::string encoded, encoded2;
+ edit.EncodeTo(&encoded);
+ VersionEdit parsed;
+ Status s = parsed.DecodeFrom(encoded);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ auto& new_files = parsed.GetNewFiles();
+ ASSERT_TRUE(new_files[0].second.marked_for_compaction);
+ ASSERT_TRUE(!new_files[1].second.marked_for_compaction);
+ ASSERT_TRUE(new_files[2].second.marked_for_compaction);
+ ASSERT_TRUE(new_files[3].second.marked_for_compaction);
+ ASSERT_EQ(3u, new_files[0].second.fd.GetPathId());
+ ASSERT_EQ(3u, new_files[1].second.fd.GetPathId());
+ ASSERT_EQ(0u, new_files[2].second.fd.GetPathId());
+ ASSERT_EQ(0u, new_files[3].second.fd.GetPathId());
+ ASSERT_EQ(kInvalidBlobFileNumber,
+ new_files[0].second.oldest_blob_file_number);
+ ASSERT_EQ(kInvalidBlobFileNumber,
+ new_files[1].second.oldest_blob_file_number);
+ ASSERT_EQ(kInvalidBlobFileNumber,
+ new_files[2].second.oldest_blob_file_number);
+ ASSERT_EQ(1001, new_files[3].second.oldest_blob_file_number);
+}
+
+TEST_F(VersionEditTest, ForwardCompatibleNewFile4) {
+ static const uint64_t kBig = 1ull << 50;
+ VersionEdit edit;
+ edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue),
+ InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
+ kBig + 600, true, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+ edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue),
+ InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501,
+ kBig + 601, false, kInvalidBlobFileNumber, 686, 868, "234",
+ "crc32c");
+ edit.DeleteFile(4, 700);
+
+ edit.SetComparatorName("foo");
+ edit.SetLogNumber(kBig + 100);
+ edit.SetNextFile(kBig + 200);
+ edit.SetLastSequence(kBig + 1000);
+
+ std::string encoded;
+
+  // Callback function to add extra customized fields.
+ bool first = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionEdit::EncodeTo:NewFile4:CustomizeFields", [&](void* arg) {
+ std::string* str = reinterpret_cast<std::string*>(arg);
+ PutVarint32(str, 33);
+ const std::string str1 = "random_string";
+ PutLengthPrefixedSlice(str, str1);
+ if (first) {
+ first = false;
+ PutVarint32(str, 22);
+ const std::string str2 = "s";
+ PutLengthPrefixedSlice(str, str2);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ edit.EncodeTo(&encoded);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ VersionEdit parsed;
+ Status s = parsed.DecodeFrom(encoded);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_TRUE(!first);
+ auto& new_files = parsed.GetNewFiles();
+ ASSERT_TRUE(new_files[0].second.marked_for_compaction);
+ ASSERT_TRUE(!new_files[1].second.marked_for_compaction);
+ ASSERT_EQ(3u, new_files[0].second.fd.GetPathId());
+ ASSERT_EQ(3u, new_files[1].second.fd.GetPathId());
+ ASSERT_EQ(1u, parsed.GetDeletedFiles().size());
+}
+
+TEST_F(VersionEditTest, NewFile4NotSupportedField) {
+ static const uint64_t kBig = 1ull << 50;
+ VersionEdit edit;
+ edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue),
+ InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
+ kBig + 600, true, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+
+ edit.SetComparatorName("foo");
+ edit.SetLogNumber(kBig + 100);
+ edit.SetNextFile(kBig + 200);
+ edit.SetLastSequence(kBig + 1000);
+
+ std::string encoded;
+
+  // Callback function to add extra customized fields.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionEdit::EncodeTo:NewFile4:CustomizeFields", [&](void* arg) {
+ std::string* str = reinterpret_cast<std::string*>(arg);
+ const std::string str1 = "s";
+ PutLengthPrefixedSlice(str, str1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ edit.EncodeTo(&encoded);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ VersionEdit parsed;
+ Status s = parsed.DecodeFrom(encoded);
+ ASSERT_NOK(s);
+}
+
+TEST_F(VersionEditTest, EncodeEmptyFile) {
+ VersionEdit edit;
+ edit.AddFile(0, 0, 0, 0, InternalKey(), InternalKey(), 0, 0, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ std::string buffer;
+ ASSERT_TRUE(!edit.EncodeTo(&buffer));
+}
+
+TEST_F(VersionEditTest, ColumnFamilyTest) {
+ VersionEdit edit;
+ edit.SetColumnFamily(2);
+ edit.AddColumnFamily("column_family");
+ edit.SetMaxColumnFamily(5);
+ TestEncodeDecode(edit);
+
+ edit.Clear();
+ edit.SetColumnFamily(3);
+ edit.DropColumnFamily();
+ TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, MinLogNumberToKeep) {
+ VersionEdit edit;
+ edit.SetMinLogNumberToKeep(13);
+ TestEncodeDecode(edit);
+
+ edit.Clear();
+ edit.SetMinLogNumberToKeep(23);
+ TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, AtomicGroupTest) {
+ VersionEdit edit;
+ edit.MarkAtomicGroup(1);
+ TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, IgnorableField) {
+ VersionEdit ve;
+ std::string encoded;
+
+  // Declared size of the ignorable field is larger than the remaining data
+ PutVarint32Varint64(&encoded, 2 /* kLogNumber */, 66);
+ // This is a customized ignorable tag
+ PutVarint32Varint64(&encoded,
+ 0x2710 /* A field with kTagSafeIgnoreMask set */,
+ 5 /* fieldlength 5 */);
+ encoded += "abc"; // Only fills 3 bytes,
+ ASSERT_NOK(ve.DecodeFrom(encoded));
+
+ encoded.clear();
+ // Error when seeing unidentified tag that is not ignorable
+ PutVarint32Varint64(&encoded, 2 /* kLogNumber */, 66);
+  // This is a customized tag without the safe-ignore mask, so it cannot be skipped
+ PutVarint32Varint64(&encoded, 666 /* A field with kTagSafeIgnoreMask unset */,
+ 3 /* fieldlength 3 */);
+ encoded += "abc"; // Fill 3 bytes
+ PutVarint32Varint64(&encoded, 3 /* next file number */, 88);
+ ASSERT_NOK(ve.DecodeFrom(encoded));
+
+  // Safely ignore an unidentified but safely-ignorable entry
+ encoded.clear();
+ PutVarint32Varint64(&encoded, 2 /* kLogNumber */, 66);
+ // This is a customized ignorable tag
+ PutVarint32Varint64(&encoded,
+ 0x2710 /* A field with kTagSafeIgnoreMask set */,
+ 3 /* fieldlength 3 */);
+ encoded += "abc"; // Fill 3 bytes
+ PutVarint32Varint64(&encoded, 3 /* kNextFileNumber */, 88);
+
+ ASSERT_OK(ve.DecodeFrom(encoded));
+
+ ASSERT_TRUE(ve.HasLogNumber());
+ ASSERT_TRUE(ve.HasNextFile());
+ ASSERT_EQ(66, ve.GetLogNumber());
+ ASSERT_EQ(88, ve.GetNextFile());
+}
+
+TEST_F(VersionEditTest, DbId) {
+ VersionEdit edit;
+ edit.SetDBId("ab34-cd12-435f-er00");
+ TestEncodeDecode(edit);
+
+ edit.Clear();
+ edit.SetDBId("34ba-cd12-435f-er01");
+ TestEncodeDecode(edit);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/version_set.cc b/src/rocksdb/db/version_set.cc
new file mode 100644
index 000000000..e913a97dd
--- /dev/null
+++ b/src/rocksdb/db/version_set.cc
@@ -0,0 +1,6005 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_set.h"
+
+#include <stdio.h>
+#include <algorithm>
+#include <array>
+#include <cinttypes>
+#include <list>
+#include <map>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "compaction/compaction.h"
+#include "db/internal_stats.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/table_cache.h"
+#include "db/version_builder.h"
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "monitoring/file_read_sample.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/persistent_stats_history.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/format.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/merging_iterator.h"
+#include "table/meta_blocks.h"
+#include "table/multiget_context.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/table_reader.h"
+#include "table/two_level_iterator.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+#include "util/user_comparator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// Find File in LevelFilesBrief data structure
+// Within an index range defined by left and right
+int FindFileInRange(const InternalKeyComparator& icmp,
+ const LevelFilesBrief& file_level,
+ const Slice& key,
+ uint32_t left,
+ uint32_t right) {
+ auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool {
+ return icmp.InternalKeyComparator::Compare(f.largest_key, k) < 0;
+ };
+ const auto &b = file_level.files;
+ return static_cast<int>(std::lower_bound(b + left,
+ b + right, key, cmp) - b);
+}
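
Illustrative only (plain integers stand in for the files' largest internal keys): the same std::lower_bound idiom, showing that the function returns the index of the earliest file whose largest key is >= the search key, or `right` when no such file exists.

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  // Largest key of each "file" in a sorted, non-overlapping level.
  std::vector<int> largest = {10, 20, 30, 40};

  auto find_file = [&](int key, size_t left, size_t right) {
    // Comparator mirrors the lambda above: "file's largest key < search key".
    auto cmp = [](int file_largest, int k) { return file_largest < k; };
    return static_cast<int>(
        std::lower_bound(largest.begin() + left, largest.begin() + right, key,
                         cmp) -
        largest.begin());
  };

  assert(find_file(25, 0, largest.size()) == 2);  // file with largest key 30
  assert(find_file(40, 0, largest.size()) == 3);  // exact match on a largest key
  assert(find_file(99, 0, largest.size()) == 4);  // past the end -> right
  return 0;
}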
+
+Status OverlapWithIterator(const Comparator* ucmp,
+ const Slice& smallest_user_key,
+ const Slice& largest_user_key,
+ InternalIterator* iter,
+ bool* overlap) {
+ InternalKey range_start(smallest_user_key, kMaxSequenceNumber,
+ kValueTypeForSeek);
+ iter->Seek(range_start.Encode());
+ if (!iter->status().ok()) {
+ return iter->status();
+ }
+
+ *overlap = false;
+ if (iter->Valid()) {
+ ParsedInternalKey seek_result;
+ if (!ParseInternalKey(iter->key(), &seek_result)) {
+ return Status::Corruption("DB have corrupted keys");
+ }
+
+ if (ucmp->CompareWithoutTimestamp(seek_result.user_key, largest_user_key) <=
+ 0) {
+ *overlap = true;
+ }
+ }
+
+ return iter->status();
+}
+
+// Class to help choose the next file to search for the particular key.
+// Searches and returns files level by level.
+// We can search level-by-level since entries never hop across
+// levels. Therefore we are guaranteed that if we find data
+// in a smaller level, later levels are irrelevant (unless we
+// are MergeInProgress).
+class FilePicker {
+ public:
+ FilePicker(std::vector<FileMetaData*>* files, const Slice& user_key,
+ const Slice& ikey, autovector<LevelFilesBrief>* file_levels,
+ unsigned int num_levels, FileIndexer* file_indexer,
+ const Comparator* user_comparator,
+ const InternalKeyComparator* internal_comparator)
+ : num_levels_(num_levels),
+ curr_level_(static_cast<unsigned int>(-1)),
+ returned_file_level_(static_cast<unsigned int>(-1)),
+ hit_file_level_(static_cast<unsigned int>(-1)),
+ search_left_bound_(0),
+ search_right_bound_(FileIndexer::kLevelMaxIndex),
+#ifndef NDEBUG
+ files_(files),
+#endif
+ level_files_brief_(file_levels),
+ is_hit_file_last_in_level_(false),
+ curr_file_level_(nullptr),
+ user_key_(user_key),
+ ikey_(ikey),
+ file_indexer_(file_indexer),
+ user_comparator_(user_comparator),
+ internal_comparator_(internal_comparator) {
+#ifdef NDEBUG
+ (void)files;
+#endif
+ // Setup member variables to search first level.
+ search_ended_ = !PrepareNextLevel();
+ if (!search_ended_) {
+ // Prefetch Level 0 table data to avoid cache miss if possible.
+ for (unsigned int i = 0; i < (*level_files_brief_)[0].num_files; ++i) {
+ auto* r = (*level_files_brief_)[0].files[i].fd.table_reader;
+ if (r) {
+ r->Prepare(ikey);
+ }
+ }
+ }
+ }
+
+ int GetCurrentLevel() const { return curr_level_; }
+
+ FdWithKeyRange* GetNextFile() {
+ while (!search_ended_) { // Loops over different levels.
+ while (curr_index_in_curr_level_ < curr_file_level_->num_files) {
+ // Loops over all files in current level.
+ FdWithKeyRange* f = &curr_file_level_->files[curr_index_in_curr_level_];
+ hit_file_level_ = curr_level_;
+ is_hit_file_last_in_level_ =
+ curr_index_in_curr_level_ == curr_file_level_->num_files - 1;
+ int cmp_largest = -1;
+
+ // Do key range filtering of files or/and fractional cascading if:
+ // (1) not all the files are in level 0, or
+ // (2) there are more than 3 current level files
+        // If there are only 3 or fewer current-level files in the system, we skip
+ // the key range filtering. In this case, more likely, the system is
+ // highly tuned to minimize number of tables queried by each query,
+ // so it is unlikely that key range filtering is more efficient than
+ // querying the files.
+ if (num_levels_ > 1 || curr_file_level_->num_files > 3) {
+          // Check if key is within a file's range. If the search left bound
+          // and right bound point to the same file, we are sure the key falls
+          // in range.
+ assert(curr_level_ == 0 ||
+ curr_index_in_curr_level_ == start_index_in_curr_level_ ||
+ user_comparator_->CompareWithoutTimestamp(
+ user_key_, ExtractUserKey(f->smallest_key)) <= 0);
+
+ int cmp_smallest = user_comparator_->CompareWithoutTimestamp(
+ user_key_, ExtractUserKey(f->smallest_key));
+ if (cmp_smallest >= 0) {
+ cmp_largest = user_comparator_->CompareWithoutTimestamp(
+ user_key_, ExtractUserKey(f->largest_key));
+ }
+
+ // Setup file search bound for the next level based on the
+ // comparison results
+ if (curr_level_ > 0) {
+ file_indexer_->GetNextLevelIndex(curr_level_,
+ curr_index_in_curr_level_,
+ cmp_smallest, cmp_largest,
+ &search_left_bound_,
+ &search_right_bound_);
+ }
+ // Key falls out of current file's range
+ if (cmp_smallest < 0 || cmp_largest > 0) {
+ if (curr_level_ == 0) {
+ ++curr_index_in_curr_level_;
+ continue;
+ } else {
+ // Search next level.
+ break;
+ }
+ }
+ }
+#ifndef NDEBUG
+ // Sanity check to make sure that the files are correctly sorted
+ if (prev_file_) {
+ if (curr_level_ != 0) {
+ int comp_sign = internal_comparator_->Compare(
+ prev_file_->largest_key, f->smallest_key);
+ assert(comp_sign < 0);
+ } else {
+            // level == 0: the current file cannot be newer than the previous
+            // one. The compressed data structure has no seqno attribute, so
+            // check against the full metadata in files_.
+ assert(curr_index_in_curr_level_ > 0);
+ assert(!NewestFirstBySeqNo(files_[0][curr_index_in_curr_level_],
+ files_[0][curr_index_in_curr_level_-1]));
+ }
+ }
+ prev_file_ = f;
+#endif
+ returned_file_level_ = curr_level_;
+ if (curr_level_ > 0 && cmp_largest < 0) {
+ // No more files to search in this level.
+ search_ended_ = !PrepareNextLevel();
+ } else {
+ ++curr_index_in_curr_level_;
+ }
+ return f;
+ }
+ // Start searching next level.
+ search_ended_ = !PrepareNextLevel();
+ }
+ // Search ended.
+ return nullptr;
+ }
+
+ // getter for current file level
+ // for GET_HIT_L0, GET_HIT_L1 & GET_HIT_L2_AND_UP counts
+ unsigned int GetHitFileLevel() { return hit_file_level_; }
+
+ // Returns true if the most recent "hit file" (i.e., one returned by
+ // GetNextFile()) is at the last index in its level.
+ bool IsHitFileLastInLevel() { return is_hit_file_last_in_level_; }
+
+ private:
+ unsigned int num_levels_;
+ unsigned int curr_level_;
+ unsigned int returned_file_level_;
+ unsigned int hit_file_level_;
+ int32_t search_left_bound_;
+ int32_t search_right_bound_;
+#ifndef NDEBUG
+ std::vector<FileMetaData*>* files_;
+#endif
+ autovector<LevelFilesBrief>* level_files_brief_;
+ bool search_ended_;
+ bool is_hit_file_last_in_level_;
+ LevelFilesBrief* curr_file_level_;
+ unsigned int curr_index_in_curr_level_;
+ unsigned int start_index_in_curr_level_;
+ Slice user_key_;
+ Slice ikey_;
+ FileIndexer* file_indexer_;
+ const Comparator* user_comparator_;
+ const InternalKeyComparator* internal_comparator_;
+#ifndef NDEBUG
+ FdWithKeyRange* prev_file_;
+#endif
+
+ // Setup local variables to search next level.
+ // Returns false if there are no more levels to search.
+ bool PrepareNextLevel() {
+ curr_level_++;
+ while (curr_level_ < num_levels_) {
+ curr_file_level_ = &(*level_files_brief_)[curr_level_];
+ if (curr_file_level_->num_files == 0) {
+ // When current level is empty, the search bound generated from upper
+ // level must be [0, -1] or [0, FileIndexer::kLevelMaxIndex] if it is
+ // also empty.
+ assert(search_left_bound_ == 0);
+ assert(search_right_bound_ == -1 ||
+ search_right_bound_ == FileIndexer::kLevelMaxIndex);
+ // Since current level is empty, it will need to search all files in
+ // the next level
+ search_left_bound_ = 0;
+ search_right_bound_ = FileIndexer::kLevelMaxIndex;
+ curr_level_++;
+ continue;
+ }
+
+ // Some files may overlap each other. We find
+ // all files that overlap user_key and process them in order from
+ // newest to oldest. In the context of merge-operator, this can occur at
+ // any level. Otherwise, it only occurs at Level-0 (since Put/Deletes
+ // are always compacted into a single entry).
+ int32_t start_index;
+ if (curr_level_ == 0) {
+ // On Level-0, we read through all files to check for overlap.
+ start_index = 0;
+ } else {
+ // On Level-n (n>=1), files are sorted. Binary search to find the
+ // earliest file whose largest key >= ikey. Search left bound and
+ // right bound are used to narrow the range.
+ if (search_left_bound_ <= search_right_bound_) {
+ if (search_right_bound_ == FileIndexer::kLevelMaxIndex) {
+ search_right_bound_ =
+ static_cast<int32_t>(curr_file_level_->num_files) - 1;
+ }
+ // `search_right_bound_` is an inclusive upper-bound, but since it was
+ // determined based on user key, it is still possible the lookup key
+ // falls to the right of `search_right_bound_`'s corresponding file.
+ // So, pass a limit one higher, which allows us to detect this case.
+ start_index =
+ FindFileInRange(*internal_comparator_, *curr_file_level_, ikey_,
+ static_cast<uint32_t>(search_left_bound_),
+ static_cast<uint32_t>(search_right_bound_) + 1);
+ if (start_index == search_right_bound_ + 1) {
+ // `ikey_` comes after `search_right_bound_`. The lookup key does
+ // not exist on this level, so let's skip this level and do a full
+ // binary search on the next level.
+ search_left_bound_ = 0;
+ search_right_bound_ = FileIndexer::kLevelMaxIndex;
+ curr_level_++;
+ continue;
+ }
+ } else {
+ // search_left_bound > search_right_bound, key does not exist in
+ // this level. Since no comparison is done in this level, it will
+ // need to search all files in the next level.
+ search_left_bound_ = 0;
+ search_right_bound_ = FileIndexer::kLevelMaxIndex;
+ curr_level_++;
+ continue;
+ }
+ }
+ start_index_in_curr_level_ = start_index;
+ curr_index_in_curr_level_ = start_index;
+#ifndef NDEBUG
+ prev_file_ = nullptr;
+#endif
+ return true;
+ }
+ // curr_level_ = num_levels_. So, no more levels to search.
+ return false;
+ }
+};
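
A deliberately simplified, self-contained sketch of the level-by-level idea the FilePicker above implements (the names here are made up; it omits fractional cascading via FileIndexer, the L0 prefetch, and the newest-to-oldest handling of overlapping L0 files):

#include <optional>
#include <string>
#include <vector>

// Toy model: each level is a list of (smallest, largest) user-key ranges.
struct ToyFile {
  std::string smallest;
  std::string largest;
  int file_number;
};

// Scan levels from newest to oldest and return the first candidate file whose
// key range could contain user_key. Because entries never "hop" levels, once a
// candidate is found in a smaller level, older levels need not be consulted
// (the real code must still handle overlapping L0 files and merge operands).
std::optional<int> PickCandidate(
    const std::vector<std::vector<ToyFile>>& levels,
    const std::string& user_key) {
  for (const auto& level : levels) {
    for (const auto& f : level) {
      if (user_key >= f.smallest && user_key <= f.largest) {
        return f.file_number;
      }
    }
  }
  return std::nullopt;
}

int main() {
  std::vector<std::vector<ToyFile>> levels = {
      {{"m", "p", 11}},                  // L0
      {{"a", "f", 21}, {"g", "z", 22}},  // L1
  };
  return PickCandidate(levels, "n").value_or(-1) == 11 ? 0 : 1;
}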
+
+class FilePickerMultiGet {
+ private:
+ struct FilePickerContext;
+
+ public:
+ FilePickerMultiGet(MultiGetRange* range,
+ autovector<LevelFilesBrief>* file_levels,
+ unsigned int num_levels, FileIndexer* file_indexer,
+ const Comparator* user_comparator,
+ const InternalKeyComparator* internal_comparator)
+ : num_levels_(num_levels),
+ curr_level_(static_cast<unsigned int>(-1)),
+ returned_file_level_(static_cast<unsigned int>(-1)),
+ hit_file_level_(static_cast<unsigned int>(-1)),
+ range_(range),
+ batch_iter_(range->begin()),
+ batch_iter_prev_(range->begin()),
+ maybe_repeat_key_(false),
+ current_level_range_(*range, range->begin(), range->end()),
+ current_file_range_(*range, range->begin(), range->end()),
+ level_files_brief_(file_levels),
+ is_hit_file_last_in_level_(false),
+ curr_file_level_(nullptr),
+ file_indexer_(file_indexer),
+ user_comparator_(user_comparator),
+ internal_comparator_(internal_comparator) {
+ for (auto iter = range_->begin(); iter != range_->end(); ++iter) {
+ fp_ctx_array_[iter.index()] =
+ FilePickerContext(0, FileIndexer::kLevelMaxIndex);
+ }
+
+ // Setup member variables to search first level.
+ search_ended_ = !PrepareNextLevel();
+ if (!search_ended_) {
+ // REVISIT
+ // Prefetch Level 0 table data to avoid cache miss if possible.
+ // As of now, only PlainTableReader and CuckooTableReader do any
+ // prefetching. This may not be necessary anymore once we implement
+ // batching in those table readers
+ for (unsigned int i = 0; i < (*level_files_brief_)[0].num_files; ++i) {
+ auto* r = (*level_files_brief_)[0].files[i].fd.table_reader;
+ if (r) {
+ for (auto iter = range_->begin(); iter != range_->end(); ++iter) {
+ r->Prepare(iter->ikey);
+ }
+ }
+ }
+ }
+ }
+
+ int GetCurrentLevel() const { return curr_level_; }
+
+ // Iterates through files in the current level until it finds a file that
+  // contains at least one key from the MultiGet batch
+ bool GetNextFileInLevelWithKeys(MultiGetRange* next_file_range,
+ size_t* file_index, FdWithKeyRange** fd,
+ bool* is_last_key_in_file) {
+ size_t curr_file_index = *file_index;
+ FdWithKeyRange* f = nullptr;
+ bool file_hit = false;
+ int cmp_largest = -1;
+ if (curr_file_index >= curr_file_level_->num_files) {
+ // In the unlikely case the next key is a duplicate of the current key,
+ // and the current key is the last in the level and the internal key
+ // was not found, we need to skip lookup for the remaining keys and
+ // reset the search bounds
+ if (batch_iter_ != current_level_range_.end()) {
+ ++batch_iter_;
+ for (; batch_iter_ != current_level_range_.end(); ++batch_iter_) {
+ struct FilePickerContext& fp_ctx = fp_ctx_array_[batch_iter_.index()];
+ fp_ctx.search_left_bound = 0;
+ fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
+ }
+ }
+ return false;
+ }
+ // Loops over keys in the MultiGet batch until it finds a file with
+    // at least one of the keys. Then it keeps moving forward until it reaches
+    // the last key in the batch that falls in that file
+ while (batch_iter_ != current_level_range_.end() &&
+ (fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level ==
+ curr_file_index ||
+ !file_hit)) {
+ struct FilePickerContext& fp_ctx = fp_ctx_array_[batch_iter_.index()];
+ f = &curr_file_level_->files[fp_ctx.curr_index_in_curr_level];
+ Slice& user_key = batch_iter_->ukey;
+
+ // Do key range filtering of files or/and fractional cascading if:
+ // (1) not all the files are in level 0, or
+ // (2) there are more than 3 current level files
+      // If there are only 3 or fewer current-level files in the system, we
+ // skip the key range filtering. In this case, more likely, the system
+ // is highly tuned to minimize number of tables queried by each query,
+ // so it is unlikely that key range filtering is more efficient than
+ // querying the files.
+ if (num_levels_ > 1 || curr_file_level_->num_files > 3) {
+        // Check if key is within a file's range. If the search left bound
+        // and right bound point to the same file, we are sure the key falls
+        // in range.
+ assert(curr_level_ == 0 ||
+ fp_ctx.curr_index_in_curr_level ==
+ fp_ctx.start_index_in_curr_level ||
+ user_comparator_->Compare(user_key,
+ ExtractUserKey(f->smallest_key)) <= 0);
+
+ int cmp_smallest = user_comparator_->Compare(
+ user_key, ExtractUserKey(f->smallest_key));
+ if (cmp_smallest >= 0) {
+ cmp_largest = user_comparator_->Compare(
+ user_key, ExtractUserKey(f->largest_key));
+ } else {
+ cmp_largest = -1;
+ }
+
+ // Setup file search bound for the next level based on the
+ // comparison results
+ if (curr_level_ > 0) {
+ file_indexer_->GetNextLevelIndex(
+ curr_level_, fp_ctx.curr_index_in_curr_level, cmp_smallest,
+ cmp_largest, &fp_ctx.search_left_bound,
+ &fp_ctx.search_right_bound);
+ }
+ // Key falls out of current file's range
+ if (cmp_smallest < 0 || cmp_largest > 0) {
+ next_file_range->SkipKey(batch_iter_);
+ } else {
+ file_hit = true;
+ }
+ } else {
+ file_hit = true;
+ }
+ if (cmp_largest == 0) {
+ // cmp_largest is 0, which means the next key will not be in this
+        // file, so stop looking further. Also don't increment batch_iter_
+ // as we may have to look for this key in the next file if we don't
+ // find it in this one
+ break;
+ } else {
+ if (curr_level_ == 0) {
+ // We need to look through all files in level 0
+ ++fp_ctx.curr_index_in_curr_level;
+ }
+ ++batch_iter_;
+ }
+ if (!file_hit) {
+ curr_file_index =
+ (batch_iter_ != current_level_range_.end())
+ ? fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level
+ : curr_file_level_->num_files;
+ }
+ }
+
+ *fd = f;
+ *file_index = curr_file_index;
+ *is_last_key_in_file = cmp_largest == 0;
+ return file_hit;
+ }
+
+ FdWithKeyRange* GetNextFile() {
+ while (!search_ended_) {
+ // Start searching next level.
+ if (batch_iter_ == current_level_range_.end()) {
+ search_ended_ = !PrepareNextLevel();
+ continue;
+ } else {
+ if (maybe_repeat_key_) {
+ maybe_repeat_key_ = false;
+ // Check if we found the final value for the last key in the
+ // previous lookup range. If we did, then there's no need to look
+ // any further for that key, so advance batch_iter_. Else, keep
+ // batch_iter_ positioned on that key so we look it up again in
+ // the next file
+ // For L0, always advance the key because we will look in the next
+ // file regardless for all keys not found yet
+ if (current_level_range_.CheckKeyDone(batch_iter_) ||
+ curr_level_ == 0) {
+ ++batch_iter_;
+ }
+ }
+ // batch_iter_prev_ will become the start key for the next file
+ // lookup
+ batch_iter_prev_ = batch_iter_;
+ }
+
+ MultiGetRange next_file_range(current_level_range_, batch_iter_prev_,
+ current_level_range_.end());
+ size_t curr_file_index =
+ (batch_iter_ != current_level_range_.end())
+ ? fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level
+ : curr_file_level_->num_files;
+ FdWithKeyRange* f;
+ bool is_last_key_in_file;
+ if (!GetNextFileInLevelWithKeys(&next_file_range, &curr_file_index, &f,
+ &is_last_key_in_file)) {
+ search_ended_ = !PrepareNextLevel();
+ } else {
+ MultiGetRange::Iterator upper_key = batch_iter_;
+ if (is_last_key_in_file) {
+ // Since cmp_largest is 0, batch_iter_ still points to the last key
+ // that falls in this file, instead of the next one. Increment
+ // upper_key so we can set the range properly for SST MultiGet
+ ++upper_key;
+ ++(fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level);
+ maybe_repeat_key_ = true;
+ }
+ // Set the range for this file
+ current_file_range_ =
+ MultiGetRange(next_file_range, batch_iter_prev_, upper_key);
+ returned_file_level_ = curr_level_;
+ hit_file_level_ = curr_level_;
+ is_hit_file_last_in_level_ =
+ curr_file_index == curr_file_level_->num_files - 1;
+ return f;
+ }
+ }
+
+ // Search ended
+ return nullptr;
+ }
+
+ // getter for current file level
+ // for GET_HIT_L0, GET_HIT_L1 & GET_HIT_L2_AND_UP counts
+ unsigned int GetHitFileLevel() { return hit_file_level_; }
+
+ // Returns true if the most recent "hit file" (i.e., one returned by
+ // GetNextFile()) is at the last index in its level.
+ bool IsHitFileLastInLevel() { return is_hit_file_last_in_level_; }
+
+ const MultiGetRange& CurrentFileRange() { return current_file_range_; }
+
+ private:
+ unsigned int num_levels_;
+ unsigned int curr_level_;
+ unsigned int returned_file_level_;
+ unsigned int hit_file_level_;
+
+ struct FilePickerContext {
+ int32_t search_left_bound;
+ int32_t search_right_bound;
+ unsigned int curr_index_in_curr_level;
+ unsigned int start_index_in_curr_level;
+
+ FilePickerContext(int32_t left, int32_t right)
+ : search_left_bound(left), search_right_bound(right),
+ curr_index_in_curr_level(0), start_index_in_curr_level(0) {}
+
+ FilePickerContext() = default;
+ };
+ std::array<FilePickerContext, MultiGetContext::MAX_BATCH_SIZE> fp_ctx_array_;
+ MultiGetRange* range_;
+ // Iterator to iterate through the keys in a MultiGet batch, that gets reset
+ // at the beginning of each level. Each call to GetNextFile() will position
+ // batch_iter_ at or right after the last key that was found in the returned
+ // SST file
+ MultiGetRange::Iterator batch_iter_;
+  // An iterator that records the previous position of batch_iter_, i.e. the
+  // last key found in the previous SST file, in order to serve as the start
+  // of the batch key range for the next SST file
+ MultiGetRange::Iterator batch_iter_prev_;
+ bool maybe_repeat_key_;
+ MultiGetRange current_level_range_;
+ MultiGetRange current_file_range_;
+ autovector<LevelFilesBrief>* level_files_brief_;
+ bool search_ended_;
+ bool is_hit_file_last_in_level_;
+ LevelFilesBrief* curr_file_level_;
+ FileIndexer* file_indexer_;
+ const Comparator* user_comparator_;
+ const InternalKeyComparator* internal_comparator_;
+
+ // Setup local variables to search next level.
+ // Returns false if there are no more levels to search.
+ bool PrepareNextLevel() {
+ if (curr_level_ == 0) {
+ MultiGetRange::Iterator mget_iter = current_level_range_.begin();
+ if (fp_ctx_array_[mget_iter.index()].curr_index_in_curr_level <
+ curr_file_level_->num_files) {
+ batch_iter_prev_ = current_level_range_.begin();
+ batch_iter_ = current_level_range_.begin();
+ return true;
+ }
+ }
+
+ curr_level_++;
+ // Reset key range to saved value
+ while (curr_level_ < num_levels_) {
+ bool level_contains_keys = false;
+ curr_file_level_ = &(*level_files_brief_)[curr_level_];
+ if (curr_file_level_->num_files == 0) {
+ // When current level is empty, the search bound generated from upper
+ // level must be [0, -1] or [0, FileIndexer::kLevelMaxIndex] if it is
+ // also empty.
+
+ for (auto mget_iter = current_level_range_.begin();
+ mget_iter != current_level_range_.end(); ++mget_iter) {
+ struct FilePickerContext& fp_ctx = fp_ctx_array_[mget_iter.index()];
+
+ assert(fp_ctx.search_left_bound == 0);
+ assert(fp_ctx.search_right_bound == -1 ||
+ fp_ctx.search_right_bound == FileIndexer::kLevelMaxIndex);
+ // Since current level is empty, it will need to search all files in
+ // the next level
+ fp_ctx.search_left_bound = 0;
+ fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
+ }
+ // Skip all subsequent empty levels
+ do {
+ ++curr_level_;
+ } while ((curr_level_ < num_levels_) &&
+ (*level_files_brief_)[curr_level_].num_files == 0);
+ continue;
+ }
+
+ // Some files may overlap each other. We find
+ // all files that overlap user_key and process them in order from
+ // newest to oldest. In the context of merge-operator, this can occur at
+ // any level. Otherwise, it only occurs at Level-0 (since Put/Deletes
+ // are always compacted into a single entry).
+ int32_t start_index = -1;
+ current_level_range_ =
+ MultiGetRange(*range_, range_->begin(), range_->end());
+ for (auto mget_iter = current_level_range_.begin();
+ mget_iter != current_level_range_.end(); ++mget_iter) {
+ struct FilePickerContext& fp_ctx = fp_ctx_array_[mget_iter.index()];
+ if (curr_level_ == 0) {
+ // On Level-0, we read through all files to check for overlap.
+ start_index = 0;
+ level_contains_keys = true;
+ } else {
+ // On Level-n (n>=1), files are sorted. Binary search to find the
+ // earliest file whose largest key >= ikey. Search left bound and
+ // right bound are used to narrow the range.
+ if (fp_ctx.search_left_bound <= fp_ctx.search_right_bound) {
+ if (fp_ctx.search_right_bound == FileIndexer::kLevelMaxIndex) {
+ fp_ctx.search_right_bound =
+ static_cast<int32_t>(curr_file_level_->num_files) - 1;
+ }
+ // `search_right_bound_` is an inclusive upper-bound, but since it
+ // was determined based on user key, it is still possible the lookup
+ // key falls to the right of `search_right_bound_`'s corresponding
+ // file. So, pass a limit one higher, which allows us to detect this
+ // case.
+ Slice& ikey = mget_iter->ikey;
+ start_index = FindFileInRange(
+ *internal_comparator_, *curr_file_level_, ikey,
+ static_cast<uint32_t>(fp_ctx.search_left_bound),
+ static_cast<uint32_t>(fp_ctx.search_right_bound) + 1);
+ if (start_index == fp_ctx.search_right_bound + 1) {
+ // `ikey_` comes after `search_right_bound_`. The lookup key does
+ // not exist on this level, so let's skip this level and do a full
+ // binary search on the next level.
+ fp_ctx.search_left_bound = 0;
+ fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
+ current_level_range_.SkipKey(mget_iter);
+ continue;
+ } else {
+ level_contains_keys = true;
+ }
+ } else {
+ // search_left_bound > search_right_bound, key does not exist in
+ // this level. Since no comparison is done in this level, it will
+ // need to search all files in the next level.
+ fp_ctx.search_left_bound = 0;
+ fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
+ current_level_range_.SkipKey(mget_iter);
+ continue;
+ }
+ }
+ fp_ctx.start_index_in_curr_level = start_index;
+ fp_ctx.curr_index_in_curr_level = start_index;
+ }
+ if (level_contains_keys) {
+ batch_iter_prev_ = current_level_range_.begin();
+ batch_iter_ = current_level_range_.begin();
+ return true;
+ }
+ curr_level_++;
+ }
+ // curr_level_ = num_levels_. So, no more levels to search.
+ return false;
+ }
+};
+} // anonymous namespace
+
+VersionStorageInfo::~VersionStorageInfo() { delete[] files_; }
+
+Version::~Version() {
+ assert(refs_ == 0);
+
+ // Remove from linked list
+ prev_->next_ = next_;
+ next_->prev_ = prev_;
+
+ // Drop references to files
+ for (int level = 0; level < storage_info_.num_levels_; level++) {
+ for (size_t i = 0; i < storage_info_.files_[level].size(); i++) {
+ FileMetaData* f = storage_info_.files_[level][i];
+ assert(f->refs > 0);
+ f->refs--;
+ if (f->refs <= 0) {
+ assert(cfd_ != nullptr);
+ uint32_t path_id = f->fd.GetPathId();
+ assert(path_id < cfd_->ioptions()->cf_paths.size());
+ vset_->obsolete_files_.push_back(
+ ObsoleteFileInfo(f, cfd_->ioptions()->cf_paths[path_id].path));
+ }
+ }
+ }
+}
+
+int FindFile(const InternalKeyComparator& icmp,
+ const LevelFilesBrief& file_level,
+ const Slice& key) {
+ return FindFileInRange(icmp, file_level, key, 0,
+ static_cast<uint32_t>(file_level.num_files));
+}
+
+void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
+ const std::vector<FileMetaData*>& files,
+ Arena* arena) {
+ assert(file_level);
+ assert(arena);
+
+ size_t num = files.size();
+ file_level->num_files = num;
+ char* mem = arena->AllocateAligned(num * sizeof(FdWithKeyRange));
+ file_level->files = new (mem)FdWithKeyRange[num];
+
+ for (size_t i = 0; i < num; i++) {
+ Slice smallest_key = files[i]->smallest.Encode();
+ Slice largest_key = files[i]->largest.Encode();
+
+ // Copy key slice to sequential memory
+ size_t smallest_size = smallest_key.size();
+ size_t largest_size = largest_key.size();
+ mem = arena->AllocateAligned(smallest_size + largest_size);
+ memcpy(mem, smallest_key.data(), smallest_size);
+ memcpy(mem + smallest_size, largest_key.data(), largest_size);
+
+ FdWithKeyRange& f = file_level->files[i];
+ f.fd = files[i]->fd;
+ f.file_metadata = files[i];
+ f.smallest_key = Slice(mem, smallest_size);
+ f.largest_key = Slice(mem + smallest_size, largest_size);
+ }
+}
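
A standalone analogue (using std::string_view in place of Slice and a std::string in place of the Arena; purely for illustration) of the contiguous key packing done above, where one allocation per file holds both boundary keys side by side:

#include <cassert>
#include <cstring>
#include <string>
#include <string_view>

int main() {
  std::string smallest = "apple", largest = "zebra";

  // One backing allocation per file, mirroring the single AllocateAligned()
  // call above; both keys live next to each other in it.
  std::string mem(smallest.size() + largest.size(), '\0');
  std::memcpy(&mem[0], smallest.data(), smallest.size());
  std::memcpy(&mem[smallest.size()], largest.data(), largest.size());

  std::string_view packed_smallest(mem.data(), smallest.size());
  std::string_view packed_largest(mem.data() + smallest.size(), largest.size());
  assert(packed_smallest == "apple");
  assert(packed_largest == "zebra");
  return 0;
}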
+
+static bool AfterFile(const Comparator* ucmp,
+ const Slice* user_key, const FdWithKeyRange* f) {
+ // nullptr user_key occurs before all keys and is therefore never after *f
+ return (user_key != nullptr &&
+ ucmp->CompareWithoutTimestamp(*user_key,
+ ExtractUserKey(f->largest_key)) > 0);
+}
+
+static bool BeforeFile(const Comparator* ucmp,
+ const Slice* user_key, const FdWithKeyRange* f) {
+ // nullptr user_key occurs after all keys and is therefore never before *f
+ return (user_key != nullptr &&
+ ucmp->CompareWithoutTimestamp(*user_key,
+ ExtractUserKey(f->smallest_key)) < 0);
+}
+
+bool SomeFileOverlapsRange(
+ const InternalKeyComparator& icmp,
+ bool disjoint_sorted_files,
+ const LevelFilesBrief& file_level,
+ const Slice* smallest_user_key,
+ const Slice* largest_user_key) {
+ const Comparator* ucmp = icmp.user_comparator();
+ if (!disjoint_sorted_files) {
+ // Need to check against all files
+ for (size_t i = 0; i < file_level.num_files; i++) {
+ const FdWithKeyRange* f = &(file_level.files[i]);
+ if (AfterFile(ucmp, smallest_user_key, f) ||
+ BeforeFile(ucmp, largest_user_key, f)) {
+ // No overlap
+ } else {
+ return true; // Overlap
+ }
+ }
+ return false;
+ }
+
+ // Binary search over file list
+ uint32_t index = 0;
+ if (smallest_user_key != nullptr) {
+ // Find the leftmost possible internal key for smallest_user_key
+ InternalKey small;
+ small.SetMinPossibleForUserKey(*smallest_user_key);
+ index = FindFile(icmp, file_level, small.Encode());
+ }
+
+ if (index >= file_level.num_files) {
+ // beginning of range is after all files, so no overlap.
+ return false;
+ }
+
+ return !BeforeFile(ucmp, largest_user_key, &file_level.files[index]);
+}
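
For intuition only, the same check on plain integer intervals (a simplification: unlike the real function it does not handle open-ended ranges expressed as nullptr keys): binary-search for the first interval whose end is >= the range start, then test whether the query range ends before that interval begins.

#include <algorithm>
#include <cassert>
#include <utility>
#include <vector>

// Sorted, disjoint [start, end] intervals standing in for a level's files.
bool SomeIntervalOverlapsRange(const std::vector<std::pair<int, int>>& files,
                               int range_start, int range_end) {
  // First interval whose end >= range_start (analogue of FindFile()).
  auto it = std::lower_bound(
      files.begin(), files.end(), range_start,
      [](const std::pair<int, int>& f, int key) { return f.second < key; });
  if (it == files.end()) {
    return false;  // range begins after all files
  }
  // Overlap unless the whole range ends before this file starts
  // (analogue of BeforeFile()).
  return range_end >= it->first;
}

int main() {
  std::vector<std::pair<int, int>> files = {{0, 9}, {20, 29}, {40, 49}};
  assert(SomeIntervalOverlapsRange(files, 5, 7));     // inside the first file
  assert(!SomeIntervalOverlapsRange(files, 10, 19));  // falls in a gap
  assert(SomeIntervalOverlapsRange(files, 15, 25));   // straddles a gap
  assert(!SomeIntervalOverlapsRange(files, 50, 60));  // after all files
  return 0;
}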
+
+namespace {
+
+class LevelIterator final : public InternalIterator {
+ public:
+ LevelIterator(TableCache* table_cache, const ReadOptions& read_options,
+ const FileOptions& file_options,
+ const InternalKeyComparator& icomparator,
+ const LevelFilesBrief* flevel,
+ const SliceTransform* prefix_extractor, bool should_sample,
+ HistogramImpl* file_read_hist, TableReaderCaller caller,
+ bool skip_filters, int level, RangeDelAggregator* range_del_agg,
+ const std::vector<AtomicCompactionUnitBoundary>*
+ compaction_boundaries = nullptr)
+ : table_cache_(table_cache),
+ read_options_(read_options),
+ file_options_(file_options),
+ icomparator_(icomparator),
+ user_comparator_(icomparator.user_comparator()),
+ flevel_(flevel),
+ prefix_extractor_(prefix_extractor),
+ file_read_hist_(file_read_hist),
+ should_sample_(should_sample),
+ caller_(caller),
+ skip_filters_(skip_filters),
+ file_index_(flevel_->num_files),
+ level_(level),
+ range_del_agg_(range_del_agg),
+ pinned_iters_mgr_(nullptr),
+ compaction_boundaries_(compaction_boundaries) {
+ // Empty level is not supported.
+ assert(flevel_ != nullptr && flevel_->num_files > 0);
+ }
+
+ ~LevelIterator() override { delete file_iter_.Set(nullptr); }
+
+ void Seek(const Slice& target) override;
+ void SeekForPrev(const Slice& target) override;
+ void SeekToFirst() override;
+ void SeekToLast() override;
+ void Next() final override;
+ bool NextAndGetResult(IterateResult* result) override;
+ void Prev() override;
+
+ bool Valid() const override { return file_iter_.Valid(); }
+ Slice key() const override {
+ assert(Valid());
+ return file_iter_.key();
+ }
+
+ Slice value() const override {
+ assert(Valid());
+ return file_iter_.value();
+ }
+
+ Status status() const override {
+ return file_iter_.iter() ? file_iter_.status() : Status::OK();
+ }
+
+ inline bool MayBeOutOfLowerBound() override {
+ assert(Valid());
+ return may_be_out_of_lower_bound_ && file_iter_.MayBeOutOfLowerBound();
+ }
+
+ inline bool MayBeOutOfUpperBound() override {
+ assert(Valid());
+ return file_iter_.MayBeOutOfUpperBound();
+ }
+
+ void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+ pinned_iters_mgr_ = pinned_iters_mgr;
+ if (file_iter_.iter()) {
+ file_iter_.SetPinnedItersMgr(pinned_iters_mgr);
+ }
+ }
+
+ bool IsKeyPinned() const override {
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+ file_iter_.iter() && file_iter_.IsKeyPinned();
+ }
+
+ bool IsValuePinned() const override {
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+ file_iter_.iter() && file_iter_.IsValuePinned();
+ }
+
+ private:
+ // Return true if at least one invalid file is seen and skipped.
+ bool SkipEmptyFileForward();
+ void SkipEmptyFileBackward();
+ void SetFileIterator(InternalIterator* iter);
+ void InitFileIterator(size_t new_file_index);
+
+ // Called by both of Next() and NextAndGetResult(). Force inline.
+ void NextImpl() {
+ assert(Valid());
+ file_iter_.Next();
+ SkipEmptyFileForward();
+ }
+
+ const Slice& file_smallest_key(size_t file_index) {
+ assert(file_index < flevel_->num_files);
+ return flevel_->files[file_index].smallest_key;
+ }
+
+ bool KeyReachedUpperBound(const Slice& internal_key) {
+ return read_options_.iterate_upper_bound != nullptr &&
+ user_comparator_.CompareWithoutTimestamp(
+ ExtractUserKey(internal_key),
+ *read_options_.iterate_upper_bound) >= 0;
+ }
+
+ InternalIterator* NewFileIterator() {
+ assert(file_index_ < flevel_->num_files);
+ auto file_meta = flevel_->files[file_index_];
+ if (should_sample_) {
+ sample_file_read_inc(file_meta.file_metadata);
+ }
+
+ const InternalKey* smallest_compaction_key = nullptr;
+ const InternalKey* largest_compaction_key = nullptr;
+ if (compaction_boundaries_ != nullptr) {
+ smallest_compaction_key = (*compaction_boundaries_)[file_index_].smallest;
+ largest_compaction_key = (*compaction_boundaries_)[file_index_].largest;
+ }
+ CheckMayBeOutOfLowerBound();
+ return table_cache_->NewIterator(
+ read_options_, file_options_, icomparator_, *file_meta.file_metadata,
+ range_del_agg_, prefix_extractor_,
+ nullptr /* don't need reference to table */, file_read_hist_, caller_,
+ /*arena=*/nullptr, skip_filters_, level_, smallest_compaction_key,
+ largest_compaction_key);
+ }
+
+ // Check whether the current file lies fully within iterate_lower_bound.
+ //
+ // Note MyRocks may update the iterate bounds between seeks. To work around
+ // that, we need to re-check and update may_be_out_of_lower_bound_
+ // accordingly.
+ void CheckMayBeOutOfLowerBound() {
+ if (read_options_.iterate_lower_bound != nullptr &&
+ file_index_ < flevel_->num_files) {
+ may_be_out_of_lower_bound_ =
+ user_comparator_.Compare(
+ ExtractUserKey(file_smallest_key(file_index_)),
+ *read_options_.iterate_lower_bound) < 0;
+ }
+ }
+
+ TableCache* table_cache_;
+ const ReadOptions read_options_;
+ const FileOptions& file_options_;
+ const InternalKeyComparator& icomparator_;
+ const UserComparatorWrapper user_comparator_;
+ const LevelFilesBrief* flevel_;
+ mutable FileDescriptor current_value_;
+ // `prefix_extractor_` may be non-null even for total order seek. Checking
+ // this variable is not the right way to identify whether prefix iterator
+ // is used.
+ const SliceTransform* prefix_extractor_;
+
+ HistogramImpl* file_read_hist_;
+ bool should_sample_;
+ TableReaderCaller caller_;
+ bool skip_filters_;
+ bool may_be_out_of_lower_bound_ = true;
+ size_t file_index_;
+ int level_;
+ RangeDelAggregator* range_del_agg_;
+ IteratorWrapper file_iter_; // May be nullptr
+ PinnedIteratorsManager* pinned_iters_mgr_;
+
+ // To be propagated to RangeDelAggregator in order to safely truncate range
+ // tombstones.
+ const std::vector<AtomicCompactionUnitBoundary>* compaction_boundaries_;
+};
+
+void LevelIterator::Seek(const Slice& target) {
+ // Check whether the seek key falls within the file we are already
+ // positioned on.
+ bool need_to_reseek = true;
+ if (file_iter_.iter() != nullptr && file_index_ < flevel_->num_files) {
+ const FdWithKeyRange& cur_file = flevel_->files[file_index_];
+ if (icomparator_.InternalKeyComparator::Compare(
+ target, cur_file.largest_key) <= 0 &&
+ icomparator_.InternalKeyComparator::Compare(
+ target, cur_file.smallest_key) >= 0) {
+ need_to_reseek = false;
+ assert(static_cast<size_t>(FindFile(icomparator_, *flevel_, target)) ==
+ file_index_);
+ }
+ }
+ if (need_to_reseek) {
+ TEST_SYNC_POINT("LevelIterator::Seek:BeforeFindFile");
+ size_t new_file_index = FindFile(icomparator_, *flevel_, target);
+ InitFileIterator(new_file_index);
+ }
+
+ if (file_iter_.iter() != nullptr) {
+ file_iter_.Seek(target);
+ }
+ if (SkipEmptyFileForward() && prefix_extractor_ != nullptr &&
+ !read_options_.total_order_seek && !read_options_.auto_prefix_mode &&
+ file_iter_.iter() != nullptr && file_iter_.Valid()) {
+ // We've skipped the file we initially positioned to. In the prefix
+ // seek case, it is likely that the file is skipped because of
+ // prefix bloom or hash, where more keys are skipped. We then check
+ // the current key and invalidate the iterator if the prefix is
+ // already passed.
+ // When doing a prefix iterator seek, once the keys for one prefix have
+ // been exhausted, the iterator is allowed to jump to any larger key. Here
+ // we enforce a stricter contract than that, to make it easier for the
+ // higher layers (merging and DB iterator) to reason about correctness:
+ // 1. Within the prefix, the result must be accurate.
+ // 2. Once keys for the prefix are exhausted, the iterator is either
+ // positioned at the next key after the prefix or made invalid.
+ // A side benefit is that the iterator is invalidated earlier, so the
+ // upper-level merging iterator can merge fewer child iterators.
+ Slice target_user_key = ExtractUserKey(target);
+ Slice file_user_key = ExtractUserKey(file_iter_.key());
+ if (prefix_extractor_->InDomain(target_user_key) &&
+ (!prefix_extractor_->InDomain(file_user_key) ||
+ user_comparator_.Compare(
+ prefix_extractor_->Transform(target_user_key),
+ prefix_extractor_->Transform(file_user_key)) != 0)) {
+ SetFileIterator(nullptr);
+ }
+ }
+ CheckMayBeOutOfLowerBound();
+}
+
+void LevelIterator::SeekForPrev(const Slice& target) {
+ size_t new_file_index = FindFile(icomparator_, *flevel_, target);
+ if (new_file_index >= flevel_->num_files) {
+ new_file_index = flevel_->num_files - 1;
+ }
+
+ InitFileIterator(new_file_index);
+ if (file_iter_.iter() != nullptr) {
+ file_iter_.SeekForPrev(target);
+ SkipEmptyFileBackward();
+ }
+ CheckMayBeOutOfLowerBound();
+}
+
+void LevelIterator::SeekToFirst() {
+ InitFileIterator(0);
+ if (file_iter_.iter() != nullptr) {
+ file_iter_.SeekToFirst();
+ }
+ SkipEmptyFileForward();
+ CheckMayBeOutOfLowerBound();
+}
+
+void LevelIterator::SeekToLast() {
+ InitFileIterator(flevel_->num_files - 1);
+ if (file_iter_.iter() != nullptr) {
+ file_iter_.SeekToLast();
+ }
+ SkipEmptyFileBackward();
+ CheckMayBeOutOfLowerBound();
+}
+
+void LevelIterator::Next() { NextImpl(); }
+
+bool LevelIterator::NextAndGetResult(IterateResult* result) {
+ NextImpl();
+ bool is_valid = Valid();
+ if (is_valid) {
+ result->key = key();
+ result->may_be_out_of_upper_bound = MayBeOutOfUpperBound();
+ }
+ return is_valid;
+}
+
+void LevelIterator::Prev() {
+ assert(Valid());
+ file_iter_.Prev();
+ SkipEmptyFileBackward();
+}
+
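+ // Advance to the next file while the current file iterator is exhausted
+ // (and was not stopped by an upper-bound check), until a non-empty file is
+ // found, the upper bound is reached, or the level is exhausted.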
+bool LevelIterator::SkipEmptyFileForward() {
+ bool seen_empty_file = false;
+ while (file_iter_.iter() == nullptr ||
+ (!file_iter_.Valid() && file_iter_.status().ok() &&
+ !file_iter_.iter()->IsOutOfBound())) {
+ seen_empty_file = true;
+ // Move to next file
+ if (file_index_ >= flevel_->num_files - 1) {
+ // Already at the last file
+ SetFileIterator(nullptr);
+ break;
+ }
+ if (KeyReachedUpperBound(file_smallest_key(file_index_ + 1))) {
+ SetFileIterator(nullptr);
+ break;
+ }
+ InitFileIterator(file_index_ + 1);
+ if (file_iter_.iter() != nullptr) {
+ file_iter_.SeekToFirst();
+ }
+ }
+ return seen_empty_file;
+}
+
+void LevelIterator::SkipEmptyFileBackward() {
+ while (file_iter_.iter() == nullptr ||
+ (!file_iter_.Valid() && file_iter_.status().ok())) {
+ // Move to previous file
+ if (file_index_ == 0) {
+ // Already the first file
+ SetFileIterator(nullptr);
+ return;
+ }
+ InitFileIterator(file_index_ - 1);
+ if (file_iter_.iter() != nullptr) {
+ file_iter_.SeekToLast();
+ }
+ }
+}
+
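+ // Swap in a new table iterator. The previous iterator is handed to the
+ // pinned-iterators manager when pinning is enabled (so pinned keys and
+ // values remain valid); otherwise it is deleted immediately.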
+void LevelIterator::SetFileIterator(InternalIterator* iter) {
+ if (pinned_iters_mgr_ && iter) {
+ iter->SetPinnedItersMgr(pinned_iters_mgr_);
+ }
+
+ InternalIterator* old_iter = file_iter_.Set(iter);
+ if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+ pinned_iters_mgr_->PinIterator(old_iter);
+ } else {
+ delete old_iter;
+ }
+}
+
+void LevelIterator::InitFileIterator(size_t new_file_index) {
+ if (new_file_index >= flevel_->num_files) {
+ file_index_ = new_file_index;
+ SetFileIterator(nullptr);
+ return;
+ } else {
+ // If the file iterator returned an Incomplete status, retry when users
+ // seek to the same file again: this time we may hit a different data
+ // block that is already cached in the block cache.
+ if (file_iter_.iter() != nullptr && !file_iter_.status().IsIncomplete() &&
+ new_file_index == file_index_) {
+ // file_iter_ is already constructed with this iterator, so
+ // no need to change anything
+ } else {
+ file_index_ = new_file_index;
+ InternalIterator* iter = NewFileIterator();
+ SetFileIterator(iter);
+ }
+ }
+}
+} // anonymous namespace
+
+ // A wrapper around VersionBuilder that references the current version in
+ // its constructor and unreferences it in its destructor.
+ // Both the constructor and the destructor must be called while holding the
+ // DB mutex.
+class BaseReferencedVersionBuilder {
+ public:
+ explicit BaseReferencedVersionBuilder(ColumnFamilyData* cfd)
+ : version_builder_(new VersionBuilder(
+ cfd->current()->version_set()->file_options(), cfd->table_cache(),
+ cfd->current()->storage_info(), cfd->ioptions()->info_log)),
+ version_(cfd->current()) {
+ version_->Ref();
+ }
+ ~BaseReferencedVersionBuilder() {
+ version_->Unref();
+ }
+ VersionBuilder* version_builder() { return version_builder_.get(); }
+
+ private:
+ std::unique_ptr<VersionBuilder> version_builder_;
+ Version* version_;
+};
+
+Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
+ const FileMetaData* file_meta,
+ const std::string* fname) const {
+ auto table_cache = cfd_->table_cache();
+ auto ioptions = cfd_->ioptions();
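+ // 1. Try to load the table properties from the table cache without
+ // performing any I/O.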
+ Status s = table_cache->GetTableProperties(
+ file_options_, cfd_->internal_comparator(), file_meta->fd, tp,
+ mutable_cf_options_.prefix_extractor.get(), true /* no io */);
+ if (s.ok()) {
+ return s;
+ }
+
+ // We only ignore the `Incomplete` error type since it is by design that we
+ // disallow reading the table when it is not in the table cache.
+ if (!s.IsIncomplete()) {
+ return s;
+ }
+
+ // 2. The table is not present in the table cache; read the table properties
+ // directly from the properties block in the file.
+ std::unique_ptr<FSRandomAccessFile> file;
+ std::string file_name;
+ if (fname != nullptr) {
+ file_name = *fname;
+ } else {
+ file_name =
+ TableFileName(ioptions->cf_paths, file_meta->fd.GetNumber(),
+ file_meta->fd.GetPathId());
+ }
+ s = ioptions->fs->NewRandomAccessFile(file_name, file_options_, &file,
+ nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+
+ TableProperties* raw_table_properties;
+ // By setting the magic number to kInvalidTableMagicNumber, we can bypass
+ // the magic number check in the footer.
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ new RandomAccessFileReader(
+ std::move(file), file_name, nullptr /* env */, nullptr /* stats */,
+ 0 /* hist_type */, nullptr /* file_read_hist */,
+ nullptr /* rate_limiter */, ioptions->listeners));
+ s = ReadTableProperties(
+ file_reader.get(), file_meta->fd.GetFileSize(),
+ Footer::kInvalidTableMagicNumber /* table's magic number */, *ioptions,
+ &raw_table_properties, false /* compression_type_missing */);
+ if (!s.ok()) {
+ return s;
+ }
+ RecordTick(ioptions->statistics, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES);
+
+ *tp = std::shared_ptr<const TableProperties>(raw_table_properties);
+ return s;
+}
+
+Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) {
+ Status s;
+ for (int level = 0; level < storage_info_.num_levels_; level++) {
+ s = GetPropertiesOfAllTables(props, level);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ return Status::OK();
+}
+
+Status Version::TablesRangeTombstoneSummary(int max_entries_to_print,
+ std::string* out_str) {
+ if (max_entries_to_print <= 0) {
+ return Status::OK();
+ }
+ int num_entries_left = max_entries_to_print;
+
+ std::stringstream ss;
+
+ for (int level = 0; level < storage_info_.num_levels_; level++) {
+ for (const auto& file_meta : storage_info_.files_[level]) {
+ auto fname =
+ TableFileName(cfd_->ioptions()->cf_paths, file_meta->fd.GetNumber(),
+ file_meta->fd.GetPathId());
+
+ ss << "=== file : " << fname << " ===\n";
+
+ TableCache* table_cache = cfd_->table_cache();
+ std::unique_ptr<FragmentedRangeTombstoneIterator> tombstone_iter;
+
+ Status s = table_cache->GetRangeTombstoneIterator(
+ ReadOptions(), cfd_->internal_comparator(), *file_meta,
+ &tombstone_iter);
+ if (!s.ok()) {
+ return s;
+ }
+ if (tombstone_iter) {
+ tombstone_iter->SeekToFirst();
+
+ while (tombstone_iter->Valid() && num_entries_left > 0) {
+ ss << "start: " << tombstone_iter->start_key().ToString(true)
+ << " end: " << tombstone_iter->end_key().ToString(true)
+ << " seq: " << tombstone_iter->seq() << '\n';
+ tombstone_iter->Next();
+ num_entries_left--;
+ }
+ if (num_entries_left <= 0) {
+ break;
+ }
+ }
+ }
+ if (num_entries_left <= 0) {
+ break;
+ }
+ }
+ assert(num_entries_left >= 0);
+ if (num_entries_left <= 0) {
+ ss << "(results may not be complete)\n";
+ }
+
+ *out_str = ss.str();
+ return Status::OK();
+}
+
+Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props,
+ int level) {
+ for (const auto& file_meta : storage_info_.files_[level]) {
+ auto fname =
+ TableFileName(cfd_->ioptions()->cf_paths, file_meta->fd.GetNumber(),
+ file_meta->fd.GetPathId());
+ // 1. If the table is already present in table cache, load table
+ // properties from there.
+ std::shared_ptr<const TableProperties> table_properties;
+ Status s = GetTableProperties(&table_properties, file_meta, &fname);
+ if (s.ok()) {
+ props->insert({fname, table_properties});
+ } else {
+ return s;
+ }
+ }
+
+ return Status::OK();
+}
+
+Status Version::GetPropertiesOfTablesInRange(
+ const Range* range, std::size_t n, TablePropertiesCollection* props) const {
+ for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) {
+ for (decltype(n) i = 0; i < n; i++) {
+ // Convert user_key into a corresponding internal key.
+ InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
+ InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
+ std::vector<FileMetaData*> files;
+ storage_info_.GetOverlappingInputs(level, &k1, &k2, &files, -1, nullptr,
+ false);
+ for (const auto& file_meta : files) {
+ auto fname =
+ TableFileName(cfd_->ioptions()->cf_paths,
+ file_meta->fd.GetNumber(), file_meta->fd.GetPathId());
+ if (props->count(fname) == 0) {
+ // 1. If the table is already present in table cache, load table
+ // properties from there.
+ std::shared_ptr<const TableProperties> table_properties;
+ Status s = GetTableProperties(&table_properties, file_meta, &fname);
+ if (s.ok()) {
+ props->insert({fname, table_properties});
+ } else {
+ return s;
+ }
+ }
+ }
+ }
+ }
+
+ return Status::OK();
+}
+
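+ // Aggregates the table properties of all files at `level` (or of all
+ // levels when `level` is negative) into a single TableProperties object.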
+Status Version::GetAggregatedTableProperties(
+ std::shared_ptr<const TableProperties>* tp, int level) {
+ TablePropertiesCollection props;
+ Status s;
+ if (level < 0) {
+ s = GetPropertiesOfAllTables(&props);
+ } else {
+ s = GetPropertiesOfAllTables(&props, level);
+ }
+ if (!s.ok()) {
+ return s;
+ }
+
+ auto* new_tp = new TableProperties();
+ for (const auto& item : props) {
+ new_tp->Add(*item.second);
+ }
+ tp->reset(new_tp);
+ return Status::OK();
+}
+
+size_t Version::GetMemoryUsageByTableReaders() {
+ size_t total_usage = 0;
+ for (auto& file_level : storage_info_.level_files_brief_) {
+ for (size_t i = 0; i < file_level.num_files; i++) {
+ total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader(
+ file_options_, cfd_->internal_comparator(), file_level.files[i].fd,
+ mutable_cf_options_.prefix_extractor.get());
+ }
+ }
+ return total_usage;
+}
+
+void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) {
+ assert(cf_meta);
+ assert(cfd_);
+
+ cf_meta->name = cfd_->GetName();
+ cf_meta->size = 0;
+ cf_meta->file_count = 0;
+ cf_meta->levels.clear();
+
+ auto* ioptions = cfd_->ioptions();
+ auto* vstorage = storage_info();
+
+ for (int level = 0; level < cfd_->NumberLevels(); level++) {
+ uint64_t level_size = 0;
+ cf_meta->file_count += vstorage->LevelFiles(level).size();
+ std::vector<SstFileMetaData> files;
+ for (const auto& file : vstorage->LevelFiles(level)) {
+ uint32_t path_id = file->fd.GetPathId();
+ std::string file_path;
+ if (path_id < ioptions->cf_paths.size()) {
+ file_path = ioptions->cf_paths[path_id].path;
+ } else {
+ assert(!ioptions->cf_paths.empty());
+ file_path = ioptions->cf_paths.back().path;
+ }
+ const uint64_t file_number = file->fd.GetNumber();
+ files.emplace_back(SstFileMetaData{
+ MakeTableFileName("", file_number), file_number, file_path,
+ static_cast<size_t>(file->fd.GetFileSize()), file->fd.smallest_seqno,
+ file->fd.largest_seqno, file->smallest.user_key().ToString(),
+ file->largest.user_key().ToString(),
+ file->stats.num_reads_sampled.load(std::memory_order_relaxed),
+ file->being_compacted, file->oldest_blob_file_number,
+ file->TryGetOldestAncesterTime(), file->TryGetFileCreationTime(),
+ file->file_checksum, file->file_checksum_func_name});
+ files.back().num_entries = file->num_entries;
+ files.back().num_deletions = file->num_deletions;
+ level_size += file->fd.GetFileSize();
+ }
+ cf_meta->levels.emplace_back(
+ level, level_size, std::move(files));
+ cf_meta->size += level_size;
+ }
+}
+
+uint64_t Version::GetSstFilesSize() {
+ uint64_t sst_files_size = 0;
+ for (int level = 0; level < storage_info_.num_levels_; level++) {
+ for (const auto& file_meta : storage_info_.LevelFiles(level)) {
+ sst_files_size += file_meta->fd.GetFileSize();
+ }
+ }
+ return sst_files_size;
+}
+
+void Version::GetCreationTimeOfOldestFile(uint64_t* creation_time) {
+ uint64_t oldest_time = port::kMaxUint64;
+ for (int level = 0; level < storage_info_.num_non_empty_levels_; level++) {
+ for (FileMetaData* meta : storage_info_.LevelFiles(level)) {
+ assert(meta->fd.table_reader != nullptr);
+ uint64_t file_creation_time = meta->TryGetFileCreationTime();
+ if (file_creation_time == kUnknownFileCreationTime) {
+ *creation_time = 0;
+ return;
+ }
+ if (file_creation_time < oldest_time) {
+ oldest_time = file_creation_time;
+ }
+ }
+ }
+ *creation_time = oldest_time;
+}
+
+uint64_t VersionStorageInfo::GetEstimatedActiveKeys() const {
+ // Estimation will be inaccurate when:
+ // (1) there exist merge keys
+ // (2) keys are directly overwritten
+ // (3) deletions are issued for non-existing keys
+ // (4) the number of samples is low
+ if (current_num_samples_ == 0) {
+ return 0;
+ }
+
+ if (current_num_non_deletions_ <= current_num_deletions_) {
+ return 0;
+ }
+
+ uint64_t est = current_num_non_deletions_ - current_num_deletions_;
+
+ uint64_t file_count = 0;
+ for (int level = 0; level < num_levels_; ++level) {
+ file_count += files_[level].size();
+ }
+
+ if (current_num_samples_ < file_count) {
+ // casting to avoid overflowing
+ return
+ static_cast<uint64_t>(
+ (est * static_cast<double>(file_count) / current_num_samples_)
+ );
+ } else {
+ return est;
+ }
+}
+
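+ // Estimates the level's compression ratio as raw (uncompressed) key+value
+ // bytes divided by on-disk file bytes; returns -1.0 when the level holds
+ // no data.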
+double VersionStorageInfo::GetEstimatedCompressionRatioAtLevel(
+ int level) const {
+ assert(level < num_levels_);
+ uint64_t sum_file_size_bytes = 0;
+ uint64_t sum_data_size_bytes = 0;
+ for (auto* file_meta : files_[level]) {
+ sum_file_size_bytes += file_meta->fd.GetFileSize();
+ sum_data_size_bytes += file_meta->raw_key_size + file_meta->raw_value_size;
+ }
+ if (sum_file_size_bytes == 0) {
+ return -1.0;
+ }
+ return static_cast<double>(sum_data_size_bytes) / sum_file_size_bytes;
+}
+
+void Version::AddIterators(const ReadOptions& read_options,
+ const FileOptions& soptions,
+ MergeIteratorBuilder* merge_iter_builder,
+ RangeDelAggregator* range_del_agg) {
+ assert(storage_info_.finalized_);
+
+ for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) {
+ AddIteratorsForLevel(read_options, soptions, merge_iter_builder, level,
+ range_del_agg);
+ }
+}
+
+void Version::AddIteratorsForLevel(const ReadOptions& read_options,
+ const FileOptions& soptions,
+ MergeIteratorBuilder* merge_iter_builder,
+ int level,
+ RangeDelAggregator* range_del_agg) {
+ assert(storage_info_.finalized_);
+ if (level >= storage_info_.num_non_empty_levels()) {
+ // This is an empty level
+ return;
+ } else if (storage_info_.LevelFilesBrief(level).num_files == 0) {
+ // No files in this level
+ return;
+ }
+
+ bool should_sample = should_sample_file_read();
+
+ auto* arena = merge_iter_builder->GetArena();
+ if (level == 0) {
+ // Merge all level zero files together since they may overlap
+ for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) {
+ const auto& file = storage_info_.LevelFilesBrief(0).files[i];
+ merge_iter_builder->AddIterator(cfd_->table_cache()->NewIterator(
+ read_options, soptions, cfd_->internal_comparator(),
+ *file.file_metadata, range_del_agg,
+ mutable_cf_options_.prefix_extractor.get(), nullptr,
+ cfd_->internal_stats()->GetFileReadHist(0),
+ TableReaderCaller::kUserIterator, arena,
+ /*skip_filters=*/false, /*level=*/0,
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr));
+ }
+ if (should_sample) {
+ // Count one read for every L0 file. This is done per iterator creation
+ // rather than per Seek(), while files in other levels are recorded per
+ // seek.
+ // If users execute one range query per iterator, there may be some
+ // discrepancy here.
+ for (FileMetaData* meta : storage_info_.LevelFiles(0)) {
+ sample_file_read_inc(meta);
+ }
+ }
+ } else if (storage_info_.LevelFilesBrief(level).num_files > 0) {
+ // For levels > 0, we can use a concatenating iterator that sequentially
+ // walks through the non-overlapping files in the level, opening them
+ // lazily.
+ auto* mem = arena->AllocateAligned(sizeof(LevelIterator));
+ merge_iter_builder->AddIterator(new (mem) LevelIterator(
+ cfd_->table_cache(), read_options, soptions,
+ cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
+ mutable_cf_options_.prefix_extractor.get(), should_sample_file_read(),
+ cfd_->internal_stats()->GetFileReadHist(level),
+ TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
+ range_del_agg, /*largest_compaction_key=*/nullptr));
+ }
+}
+
+Status Version::OverlapWithLevelIterator(const ReadOptions& read_options,
+ const FileOptions& file_options,
+ const Slice& smallest_user_key,
+ const Slice& largest_user_key,
+ int level, bool* overlap) {
+ assert(storage_info_.finalized_);
+
+ auto icmp = cfd_->internal_comparator();
+ auto ucmp = icmp.user_comparator();
+
+ Arena arena;
+ Status status;
+ ReadRangeDelAggregator range_del_agg(&icmp,
+ kMaxSequenceNumber /* upper_bound */);
+
+ *overlap = false;
+
+ if (level == 0) {
+ for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) {
+ const auto file = &storage_info_.LevelFilesBrief(0).files[i];
+ if (AfterFile(ucmp, &smallest_user_key, file) ||
+ BeforeFile(ucmp, &largest_user_key, file)) {
+ continue;
+ }
+ ScopedArenaIterator iter(cfd_->table_cache()->NewIterator(
+ read_options, file_options, cfd_->internal_comparator(),
+ *file->file_metadata, &range_del_agg,
+ mutable_cf_options_.prefix_extractor.get(), nullptr,
+ cfd_->internal_stats()->GetFileReadHist(0),
+ TableReaderCaller::kUserIterator, &arena,
+ /*skip_filters=*/false, /*level=*/0,
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr));
+ status = OverlapWithIterator(
+ ucmp, smallest_user_key, largest_user_key, iter.get(), overlap);
+ if (!status.ok() || *overlap) {
+ break;
+ }
+ }
+ } else if (storage_info_.LevelFilesBrief(level).num_files > 0) {
+ auto mem = arena.AllocateAligned(sizeof(LevelIterator));
+ ScopedArenaIterator iter(new (mem) LevelIterator(
+ cfd_->table_cache(), read_options, file_options,
+ cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
+ mutable_cf_options_.prefix_extractor.get(), should_sample_file_read(),
+ cfd_->internal_stats()->GetFileReadHist(level),
+ TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
+ &range_del_agg));
+ status = OverlapWithIterator(
+ ucmp, smallest_user_key, largest_user_key, iter.get(), overlap);
+ }
+
+ if (status.ok() && *overlap == false &&
+ range_del_agg.IsRangeOverlapped(smallest_user_key, largest_user_key)) {
+ *overlap = true;
+ }
+ return status;
+}
+
+VersionStorageInfo::VersionStorageInfo(
+ const InternalKeyComparator* internal_comparator,
+ const Comparator* user_comparator, int levels,
+ CompactionStyle compaction_style, VersionStorageInfo* ref_vstorage,
+ bool _force_consistency_checks)
+ : internal_comparator_(internal_comparator),
+ user_comparator_(user_comparator),
+ // cfd is nullptr if Version is dummy
+ num_levels_(levels),
+ num_non_empty_levels_(0),
+ file_indexer_(user_comparator),
+ compaction_style_(compaction_style),
+ files_(new std::vector<FileMetaData*>[num_levels_]),
+ base_level_(num_levels_ == 1 ? -1 : 1),
+ level_multiplier_(0.0),
+ files_by_compaction_pri_(num_levels_),
+ level0_non_overlapping_(false),
+ next_file_to_compact_by_size_(num_levels_),
+ compaction_score_(num_levels_),
+ compaction_level_(num_levels_),
+ l0_delay_trigger_count_(0),
+ accumulated_file_size_(0),
+ accumulated_raw_key_size_(0),
+ accumulated_raw_value_size_(0),
+ accumulated_num_non_deletions_(0),
+ accumulated_num_deletions_(0),
+ current_num_non_deletions_(0),
+ current_num_deletions_(0),
+ current_num_samples_(0),
+ estimated_compaction_needed_bytes_(0),
+ finalized_(false),
+ force_consistency_checks_(_force_consistency_checks) {
+ if (ref_vstorage != nullptr) {
+ accumulated_file_size_ = ref_vstorage->accumulated_file_size_;
+ accumulated_raw_key_size_ = ref_vstorage->accumulated_raw_key_size_;
+ accumulated_raw_value_size_ = ref_vstorage->accumulated_raw_value_size_;
+ accumulated_num_non_deletions_ =
+ ref_vstorage->accumulated_num_non_deletions_;
+ accumulated_num_deletions_ = ref_vstorage->accumulated_num_deletions_;
+ current_num_non_deletions_ = ref_vstorage->current_num_non_deletions_;
+ current_num_deletions_ = ref_vstorage->current_num_deletions_;
+ current_num_samples_ = ref_vstorage->current_num_samples_;
+ oldest_snapshot_seqnum_ = ref_vstorage->oldest_snapshot_seqnum_;
+ }
+}
+
+Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset,
+ const FileOptions& file_opt,
+ const MutableCFOptions mutable_cf_options,
+ uint64_t version_number)
+ : env_(vset->env_),
+ cfd_(column_family_data),
+ info_log_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->info_log),
+ db_statistics_((cfd_ == nullptr) ? nullptr
+ : cfd_->ioptions()->statistics),
+ table_cache_((cfd_ == nullptr) ? nullptr : cfd_->table_cache()),
+ merge_operator_((cfd_ == nullptr) ? nullptr
+ : cfd_->ioptions()->merge_operator),
+ storage_info_(
+ (cfd_ == nullptr) ? nullptr : &cfd_->internal_comparator(),
+ (cfd_ == nullptr) ? nullptr : cfd_->user_comparator(),
+ cfd_ == nullptr ? 0 : cfd_->NumberLevels(),
+ cfd_ == nullptr ? kCompactionStyleLevel
+ : cfd_->ioptions()->compaction_style,
+ (cfd_ == nullptr || cfd_->current() == nullptr)
+ ? nullptr
+ : cfd_->current()->storage_info(),
+ cfd_ == nullptr ? false : cfd_->ioptions()->force_consistency_checks),
+ vset_(vset),
+ next_(this),
+ prev_(this),
+ refs_(0),
+ file_options_(file_opt),
+ mutable_cf_options_(mutable_cf_options),
+ version_number_(version_number) {}
+
+void Version::Get(const ReadOptions& read_options, const LookupKey& k,
+ PinnableSlice* value, Status* status,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq, bool* value_found,
+ bool* key_exists, SequenceNumber* seq, ReadCallback* callback,
+ bool* is_blob, bool do_merge) {
+ Slice ikey = k.internal_key();
+ Slice user_key = k.user_key();
+
+ assert(status->ok() || status->IsMergeInProgress());
+
+ if (key_exists != nullptr) {
+ // will falsify below if not found
+ *key_exists = true;
+ }
+
+ PinnedIteratorsManager pinned_iters_mgr;
+ uint64_t tracing_get_id = BlockCacheTraceHelper::kReservedGetId;
+ if (vset_ && vset_->block_cache_tracer_ &&
+ vset_->block_cache_tracer_->is_tracing_enabled()) {
+ tracing_get_id = vset_->block_cache_tracer_->NextGetId();
+ }
+ GetContext get_context(
+ user_comparator(), merge_operator_, info_log_, db_statistics_,
+ status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key,
+ do_merge ? value : nullptr, value_found, merge_context, do_merge,
+ max_covering_tombstone_seq, this->env_, seq,
+ merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob,
+ tracing_get_id);
+
+ // Pin blocks that we read to hold merge operands
+ if (merge_operator_) {
+ pinned_iters_mgr.StartPinning();
+ }
+
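+ // FilePicker hands back candidate files newest-first: every overlapping
+ // L0 file in order, then at most one file per deeper level, located via
+ // the file indexer.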
+ FilePicker fp(
+ storage_info_.files_, user_key, ikey, &storage_info_.level_files_brief_,
+ storage_info_.num_non_empty_levels_, &storage_info_.file_indexer_,
+ user_comparator(), internal_comparator());
+ FdWithKeyRange* f = fp.GetNextFile();
+
+ while (f != nullptr) {
+ if (*max_covering_tombstone_seq > 0) {
+ // The remaining files we look at will only contain covered keys, so we
+ // stop here.
+ break;
+ }
+ if (get_context.sample()) {
+ sample_file_read_inc(f->file_metadata);
+ }
+
+ bool timer_enabled =
+ GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex &&
+ get_perf_context()->per_level_perf_context_enabled;
+ StopWatchNano timer(env_, timer_enabled /* auto_start */);
+ *status = table_cache_->Get(
+ read_options, *internal_comparator(), *f->file_metadata, ikey,
+ &get_context, mutable_cf_options_.prefix_extractor.get(),
+ cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
+ IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
+ fp.IsHitFileLastInLevel()),
+ fp.GetCurrentLevel());
+ // TODO: examine the behavior for corrupted key
+ if (timer_enabled) {
+ PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(),
+ fp.GetCurrentLevel());
+ }
+ if (!status->ok()) {
+ return;
+ }
+
+ // report the counters before returning
+ if (get_context.State() != GetContext::kNotFound &&
+ get_context.State() != GetContext::kMerge &&
+ db_statistics_ != nullptr) {
+ get_context.ReportCounters();
+ }
+ switch (get_context.State()) {
+ case GetContext::kNotFound:
+ // Keep searching in other files
+ break;
+ case GetContext::kMerge:
+ // TODO: update per-level perfcontext user_key_return_count for kMerge
+ break;
+ case GetContext::kFound:
+ if (fp.GetHitFileLevel() == 0) {
+ RecordTick(db_statistics_, GET_HIT_L0);
+ } else if (fp.GetHitFileLevel() == 1) {
+ RecordTick(db_statistics_, GET_HIT_L1);
+ } else if (fp.GetHitFileLevel() >= 2) {
+ RecordTick(db_statistics_, GET_HIT_L2_AND_UP);
+ }
+ PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1,
+ fp.GetHitFileLevel());
+ return;
+ case GetContext::kDeleted:
+ // Use empty error message for speed
+ *status = Status::NotFound();
+ return;
+ case GetContext::kCorrupt:
+ *status = Status::Corruption("corrupted key for ", user_key);
+ return;
+ case GetContext::kBlobIndex:
+ ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index.");
+ *status = Status::NotSupported(
+ "Encounter unexpected blob index. Please open DB with "
+ "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
+ return;
+ }
+ f = fp.GetNextFile();
+ }
+ if (db_statistics_ != nullptr) {
+ get_context.ReportCounters();
+ }
+ if (GetContext::kMerge == get_context.State()) {
+ if (!do_merge) {
+ *status = Status::OK();
+ return;
+ }
+ if (!merge_operator_) {
+ *status = Status::InvalidArgument(
+ "merge_operator is not properly initialized.");
+ return;
+ }
+ // The merge operands are in the saver and we hit the beginning of the key
+ // history; do a final merge of nullptr and the operands.
+ std::string* str_value = value != nullptr ? value->GetSelf() : nullptr;
+ *status = MergeHelper::TimedFullMerge(
+ merge_operator_, user_key, nullptr, merge_context->GetOperands(),
+ str_value, info_log_, db_statistics_, env_,
+ nullptr /* result_operand */, true);
+ if (LIKELY(value != nullptr)) {
+ value->PinSelf();
+ }
+ } else {
+ if (key_exists != nullptr) {
+ *key_exists = false;
+ }
+ *status = Status::NotFound(); // Use an empty error message for speed
+ }
+}
+
+void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
+ ReadCallback* callback, bool* is_blob) {
+ PinnedIteratorsManager pinned_iters_mgr;
+
+ // Pin blocks that we read to hold merge operands
+ if (merge_operator_) {
+ pinned_iters_mgr.StartPinning();
+ }
+ uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId;
+
+ if (vset_ && vset_->block_cache_tracer_ &&
+ vset_->block_cache_tracer_->is_tracing_enabled()) {
+ tracing_mget_id = vset_->block_cache_tracer_->NextGetId();
+ }
+ // Even though we know the batch size won't be > MAX_BATCH_SIZE,
+ // use autovector in order to avoid unnecessary construction of GetContext
+ // objects, which is expensive
+ autovector<GetContext, 16> get_ctx;
+ for (auto iter = range->begin(); iter != range->end(); ++iter) {
+ assert(iter->s->ok() || iter->s->IsMergeInProgress());
+ get_ctx.emplace_back(
+ user_comparator(), merge_operator_, info_log_, db_statistics_,
+ iter->s->ok() ? GetContext::kNotFound : GetContext::kMerge, iter->ukey,
+ iter->value, nullptr, &(iter->merge_context), true,
+ &iter->max_covering_tombstone_seq, this->env_, nullptr,
+ merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob,
+ tracing_mget_id);
+ // MergeInProgress status, if set, has been transferred to the get_context
+ // state, so we set status to ok here. From now on, the iter status will
+ // be used for IO errors, and get_context state will be used for any
+ // key level errors
+ *(iter->s) = Status::OK();
+ }
+ int get_ctx_index = 0;
+ for (auto iter = range->begin(); iter != range->end();
+ ++iter, get_ctx_index++) {
+ iter->get_context = &(get_ctx[get_ctx_index]);
+ }
+
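+ // FilePickerMultiGet walks the LSM level by level and batches together the
+ // keys whose ranges fall within the same file, so one table lookup can
+ // serve several keys at once.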
+ MultiGetRange file_picker_range(*range, range->begin(), range->end());
+ FilePickerMultiGet fp(
+ &file_picker_range,
+ &storage_info_.level_files_brief_, storage_info_.num_non_empty_levels_,
+ &storage_info_.file_indexer_, user_comparator(), internal_comparator());
+ FdWithKeyRange* f = fp.GetNextFile();
+
+ while (f != nullptr) {
+ MultiGetRange file_range = fp.CurrentFileRange();
+ bool timer_enabled =
+ GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex &&
+ get_perf_context()->per_level_perf_context_enabled;
+ StopWatchNano timer(env_, timer_enabled /* auto_start */);
+ Status s = table_cache_->MultiGet(
+ read_options, *internal_comparator(), *f->file_metadata, &file_range,
+ mutable_cf_options_.prefix_extractor.get(),
+ cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
+ IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
+ fp.IsHitFileLastInLevel()),
+ fp.GetCurrentLevel());
+ // TODO: examine the behavior for corrupted key
+ if (timer_enabled) {
+ PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(),
+ fp.GetCurrentLevel());
+ }
+ if (!s.ok()) {
+ // TODO: Set status for individual keys appropriately
+ for (auto iter = file_range.begin(); iter != file_range.end(); ++iter) {
+ *iter->s = s;
+ file_range.MarkKeyDone(iter);
+ }
+ return;
+ }
+ uint64_t batch_size = 0;
+ for (auto iter = file_range.begin(); iter != file_range.end(); ++iter) {
+ GetContext& get_context = *iter->get_context;
+ Status* status = iter->s;
+ // The Status in the KeyContext takes precedence over the GetContext
+ // state. Status may be an error if there were any IO errors in the
+ // table reader. We never expect Status to be NotFound(), as that is
+ // determined by get_context.
+ assert(!status->IsNotFound());
+ if (!status->ok()) {
+ file_range.MarkKeyDone(iter);
+ continue;
+ }
+
+ if (get_context.sample()) {
+ sample_file_read_inc(f->file_metadata);
+ }
+ batch_size++;
+ // report the counters before returning
+ if (get_context.State() != GetContext::kNotFound &&
+ get_context.State() != GetContext::kMerge &&
+ db_statistics_ != nullptr) {
+ get_context.ReportCounters();
+ } else {
+ if (iter->max_covering_tombstone_seq > 0) {
+ // The remaining files we look at will only contain covered keys, so
+ // we stop here for this key
+ file_picker_range.SkipKey(iter);
+ }
+ }
+ switch (get_context.State()) {
+ case GetContext::kNotFound:
+ // Keep searching in other files
+ break;
+ case GetContext::kMerge:
+ // TODO: update per-level perfcontext user_key_return_count for kMerge
+ break;
+ case GetContext::kFound:
+ if (fp.GetHitFileLevel() == 0) {
+ RecordTick(db_statistics_, GET_HIT_L0);
+ } else if (fp.GetHitFileLevel() == 1) {
+ RecordTick(db_statistics_, GET_HIT_L1);
+ } else if (fp.GetHitFileLevel() >= 2) {
+ RecordTick(db_statistics_, GET_HIT_L2_AND_UP);
+ }
+ PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1,
+ fp.GetHitFileLevel());
+ file_range.MarkKeyDone(iter);
+ continue;
+ case GetContext::kDeleted:
+ // Use empty error message for speed
+ *status = Status::NotFound();
+ file_range.MarkKeyDone(iter);
+ continue;
+ case GetContext::kCorrupt:
+ *status =
+ Status::Corruption("corrupted key for ", iter->lkey->user_key());
+ file_range.MarkKeyDone(iter);
+ continue;
+ case GetContext::kBlobIndex:
+ ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index.");
+ *status = Status::NotSupported(
+ "Encounter unexpected blob index. Please open DB with "
+ "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
+ file_range.MarkKeyDone(iter);
+ continue;
+ }
+ }
+ RecordInHistogram(db_statistics_, SST_BATCH_SIZE, batch_size);
+ if (file_picker_range.empty()) {
+ break;
+ }
+ f = fp.GetNextFile();
+ }
+
+ // Process any left over keys
+ for (auto iter = range->begin(); iter != range->end(); ++iter) {
+ GetContext& get_context = *iter->get_context;
+ Status* status = iter->s;
+ Slice user_key = iter->lkey->user_key();
+
+ if (db_statistics_ != nullptr) {
+ get_context.ReportCounters();
+ }
+ if (GetContext::kMerge == get_context.State()) {
+ if (!merge_operator_) {
+ *status = Status::InvalidArgument(
+ "merge_operator is not properly initialized.");
+ range->MarkKeyDone(iter);
+ continue;
+ }
+ // The merge operands are in the saver and we hit the beginning of the key
+ // history; do a final merge of nullptr and the operands.
+ std::string* str_value =
+ iter->value != nullptr ? iter->value->GetSelf() : nullptr;
+ *status = MergeHelper::TimedFullMerge(
+ merge_operator_, user_key, nullptr, iter->merge_context.GetOperands(),
+ str_value, info_log_, db_statistics_, env_,
+ nullptr /* result_operand */, true);
+ if (LIKELY(iter->value != nullptr)) {
+ iter->value->PinSelf();
+ }
+ } else {
+ range->MarkKeyDone(iter);
+ *status = Status::NotFound(); // Use an empty error message for speed
+ }
+ }
+}
+
+bool Version::IsFilterSkipped(int level, bool is_file_last_in_level) {
+ // Reaching the bottom level implies misses at all upper levels, so we'll
+ // skip checking the filters when we predict a hit.
+ return cfd_->ioptions()->optimize_filters_for_hits &&
+ (level > 0 || is_file_last_in_level) &&
+ level == storage_info_.num_non_empty_levels() - 1;
+}
+
+void VersionStorageInfo::GenerateLevelFilesBrief() {
+ level_files_brief_.resize(num_non_empty_levels_);
+ for (int level = 0; level < num_non_empty_levels_; level++) {
+ DoGenerateLevelFilesBrief(
+ &level_files_brief_[level], files_[level], &arena_);
+ }
+}
+
+void Version::PrepareApply(
+ const MutableCFOptions& mutable_cf_options,
+ bool update_stats) {
+ UpdateAccumulatedStats(update_stats);
+ storage_info_.UpdateNumNonEmptyLevels();
+ storage_info_.CalculateBaseBytes(*cfd_->ioptions(), mutable_cf_options);
+ storage_info_.UpdateFilesByCompactionPri(cfd_->ioptions()->compaction_pri);
+ storage_info_.GenerateFileIndexer();
+ storage_info_.GenerateLevelFilesBrief();
+ storage_info_.GenerateLevel0NonOverlapping();
+ storage_info_.GenerateBottommostFiles();
+}
+
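+ // Loads a file's statistics (entry/deletion counts, raw key/value sizes)
+ // from its table properties the first time they are needed. Returns true
+ // only when the stats were actually loaded here.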
+bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) {
+ if (file_meta->init_stats_from_file ||
+ file_meta->compensated_file_size > 0) {
+ return false;
+ }
+ std::shared_ptr<const TableProperties> tp;
+ Status s = GetTableProperties(&tp, file_meta);
+ file_meta->init_stats_from_file = true;
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(vset_->db_options_->info_log,
+ "Unable to load table properties for file %" PRIu64
+ " --- %s\n",
+ file_meta->fd.GetNumber(), s.ToString().c_str());
+ return false;
+ }
+ if (tp.get() == nullptr) return false;
+ file_meta->num_entries = tp->num_entries;
+ file_meta->num_deletions = tp->num_deletions;
+ file_meta->raw_value_size = tp->raw_value_size;
+ file_meta->raw_key_size = tp->raw_key_size;
+
+ return true;
+}
+
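+ // Folds a newly loaded file's statistics into both the accumulated
+ // (lifetime) counters and the counters for the current version.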
+void VersionStorageInfo::UpdateAccumulatedStats(FileMetaData* file_meta) {
+ TEST_SYNC_POINT_CALLBACK("VersionStorageInfo::UpdateAccumulatedStats",
+ nullptr);
+
+ assert(file_meta->init_stats_from_file);
+ accumulated_file_size_ += file_meta->fd.GetFileSize();
+ accumulated_raw_key_size_ += file_meta->raw_key_size;
+ accumulated_raw_value_size_ += file_meta->raw_value_size;
+ accumulated_num_non_deletions_ +=
+ file_meta->num_entries - file_meta->num_deletions;
+ accumulated_num_deletions_ += file_meta->num_deletions;
+
+ current_num_non_deletions_ +=
+ file_meta->num_entries - file_meta->num_deletions;
+ current_num_deletions_ += file_meta->num_deletions;
+ current_num_samples_++;
+}
+
+void VersionStorageInfo::RemoveCurrentStats(FileMetaData* file_meta) {
+ if (file_meta->init_stats_from_file) {
+ current_num_non_deletions_ -=
+ file_meta->num_entries - file_meta->num_deletions;
+ current_num_deletions_ -= file_meta->num_deletions;
+ current_num_samples_--;
+ }
+}
+
+void Version::UpdateAccumulatedStats(bool update_stats) {
+ if (update_stats) {
+ // maximum number of table properties loaded from files.
+ const int kMaxInitCount = 20;
+ int init_count = 0;
+ // Only the first kMaxInitCount files that haven't yet been initialized
+ // from their table properties will have num_deletions updated here.
+ // The motivation is to cap the maximum I/O per Version creation.
+ // The reason for choosing files from lower levels instead of higher levels
+ // is that this design propagates the initialization from lower levels to
+ // higher levels: once the num_deletions of lower-level files are updated,
+ // those files have accurate compensated_file_size, so lower-level to
+ // higher-level compactions get triggered, which creates higher-level files
+ // whose num_deletions will in turn be updated here.
+ for (int level = 0;
+ level < storage_info_.num_levels_ && init_count < kMaxInitCount;
+ ++level) {
+ for (auto* file_meta : storage_info_.files_[level]) {
+ if (MaybeInitializeFileMetaData(file_meta)) {
+ // each FileMeta will be initialized only once.
+ storage_info_.UpdateAccumulatedStats(file_meta);
+ // When the "max_open_files" option is -1, all the file metadata has
+ // already been read, so MaybeInitializeFileMetaData() won't incur
+ // any I/O cost. "max_open_files=-1" means that the table cache passed
+ // to the VersionSet and then to the ColumnFamilySet has a size of
+ // TableCache::kInfiniteCapacity.
+ if (vset_->GetColumnFamilySet()->get_table_cache()->GetCapacity() ==
+ TableCache::kInfiniteCapacity) {
+ continue;
+ }
+ if (++init_count >= kMaxInitCount) {
+ break;
+ }
+ }
+ }
+ }
+ // In case all sampled files contain only deletion entries, load the
+ // table properties of a file from a higher level to initialize
+ // accumulated_raw_value_size_.
+ for (int level = storage_info_.num_levels_ - 1;
+ storage_info_.accumulated_raw_value_size_ == 0 && level >= 0;
+ --level) {
+ for (int i = static_cast<int>(storage_info_.files_[level].size()) - 1;
+ storage_info_.accumulated_raw_value_size_ == 0 && i >= 0; --i) {
+ if (MaybeInitializeFileMetaData(storage_info_.files_[level][i])) {
+ storage_info_.UpdateAccumulatedStats(storage_info_.files_[level][i]);
+ }
+ }
+ }
+ }
+
+ storage_info_.ComputeCompensatedSizes();
+}
+
+void VersionStorageInfo::ComputeCompensatedSizes() {
+ static const int kDeletionWeightOnCompaction = 2;
+ uint64_t average_value_size = GetAverageValueSize();
+
+ // compute the compensated size
+ for (int level = 0; level < num_levels_; level++) {
+ for (auto* file_meta : files_[level]) {
+ // Here we only compute compensated_file_size for those file_meta
+ // whose compensated_file_size is uninitialized (== 0). This is true only
+ // for files that have just been created and that no other thread has
+ // access to yet. That's why we can safely mutate compensated_file_size.
+ if (file_meta->compensated_file_size == 0) {
+ file_meta->compensated_file_size = file_meta->fd.GetFileSize();
+ // Here we boost the size of deletion entries of a file only
+ // when the number of deletion entries is greater than the number of
+ // non-deletion entries in the file. The motivation is that in
+ // a stable workload the number of deletion entries should be roughly
+ // equal to the number of non-deletion entries. If we compensated the
+ // size of deletion entries in a stable workload, the deletion
+ // compensation logic might introduce unwanted effects that change the
+ // shape of the LSM tree.
+ if (file_meta->num_deletions * 2 >= file_meta->num_entries) {
+ file_meta->compensated_file_size +=
+ (file_meta->num_deletions * 2 - file_meta->num_entries) *
+ average_value_size * kDeletionWeightOnCompaction;
+ }
+ }
+ }
+ }
+}
+
+int VersionStorageInfo::MaxInputLevel() const {
+ if (compaction_style_ == kCompactionStyleLevel) {
+ return num_levels() - 2;
+ }
+ return 0;
+}
+
+int VersionStorageInfo::MaxOutputLevel(bool allow_ingest_behind) const {
+ if (allow_ingest_behind) {
+ assert(num_levels() > 1);
+ return num_levels() - 2;
+ }
+ return num_levels() - 1;
+}
+
+void VersionStorageInfo::EstimateCompactionBytesNeeded(
+ const MutableCFOptions& mutable_cf_options) {
+ // Only implemented for level-based compaction
+ if (compaction_style_ != kCompactionStyleLevel) {
+ estimated_compaction_needed_bytes_ = 0;
+ return;
+ }
+
+ // Start from level 0: if level 0 qualifies for compaction to level 1,
+ // we estimate the size of that compaction.
+ // Then we move on to the next level and check whether it qualifies for
+ // compaction to the level below. The size of a level is estimated as its
+ // actual size plus the input bytes carried over from the previous level,
+ // if any. If that exceeds the level's target, the excess bytes are taken
+ // as compaction input and the compaction size is added to the total.
+ // We keep doing this for levels 2, 3, etc., until the last level, and
+ // accumulate the resulting bytes.
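+ // For example (hypothetical sizes): if a level holds 150 MB against a
+ // 100 MB target and the next level holds 300 MB, the 50 MB overflow is
+ // counted scaled by the size-ratio fan-out plus one, i.e.
+ // 50 MB * (300/150 + 1) = 150 MB of estimated compaction work.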
+
+ uint64_t bytes_compact_to_next_level = 0;
+ uint64_t level_size = 0;
+ for (auto* f : files_[0]) {
+ level_size += f->fd.GetFileSize();
+ }
+ // Level 0
+ bool level0_compact_triggered = false;
+ if (static_cast<int>(files_[0].size()) >=
+ mutable_cf_options.level0_file_num_compaction_trigger ||
+ level_size >= mutable_cf_options.max_bytes_for_level_base) {
+ level0_compact_triggered = true;
+ estimated_compaction_needed_bytes_ = level_size;
+ bytes_compact_to_next_level = level_size;
+ } else {
+ estimated_compaction_needed_bytes_ = 0;
+ }
+
+ // Level 1 and up.
+ uint64_t bytes_next_level = 0;
+ for (int level = base_level(); level <= MaxInputLevel(); level++) {
+ level_size = 0;
+ if (bytes_next_level > 0) {
+#ifndef NDEBUG
+ uint64_t level_size2 = 0;
+ for (auto* f : files_[level]) {
+ level_size2 += f->fd.GetFileSize();
+ }
+ assert(level_size2 == bytes_next_level);
+#endif
+ level_size = bytes_next_level;
+ bytes_next_level = 0;
+ } else {
+ for (auto* f : files_[level]) {
+ level_size += f->fd.GetFileSize();
+ }
+ }
+ if (level == base_level() && level0_compact_triggered) {
+ // Add base level size to compaction if level0 compaction triggered.
+ estimated_compaction_needed_bytes_ += level_size;
+ }
+ // Add size added by previous compaction
+ level_size += bytes_compact_to_next_level;
+ bytes_compact_to_next_level = 0;
+ uint64_t level_target = MaxBytesForLevel(level);
+ if (level_size > level_target) {
+ bytes_compact_to_next_level = level_size - level_target;
+ // Estimate the actual compaction fan-out ratio as size ratio between
+ // the two levels.
+
+ assert(bytes_next_level == 0);
+ if (level + 1 < num_levels_) {
+ for (auto* f : files_[level + 1]) {
+ bytes_next_level += f->fd.GetFileSize();
+ }
+ }
+ if (bytes_next_level > 0) {
+ assert(level_size > 0);
+ estimated_compaction_needed_bytes_ += static_cast<uint64_t>(
+ static_cast<double>(bytes_compact_to_next_level) *
+ (static_cast<double>(bytes_next_level) /
+ static_cast<double>(level_size) +
+ 1));
+ }
+ }
+ }
+}
+
+namespace {
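+// Returns the number of files in `files` that are not being compacted and
+// whose oldest ancestor time falls outside the configured TTL window.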
+uint32_t GetExpiredTtlFilesCount(const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ const std::vector<FileMetaData*>& files) {
+ uint32_t ttl_expired_files_count = 0;
+
+ int64_t _current_time;
+ auto status = ioptions.env->GetCurrentTime(&_current_time);
+ if (status.ok()) {
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+ for (FileMetaData* f : files) {
+ if (!f->being_compacted) {
+ uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime();
+ if (oldest_ancester_time != 0 &&
+ oldest_ancester_time < (current_time - mutable_cf_options.ttl)) {
+ ttl_expired_files_count++;
+ }
+ }
+ }
+ }
+ return ttl_expired_files_count;
+}
+} // anonymous namespace
+
+void VersionStorageInfo::ComputeCompactionScore(
+ const ImmutableCFOptions& immutable_cf_options,
+ const MutableCFOptions& mutable_cf_options) {
+ for (int level = 0; level <= MaxInputLevel(); level++) {
+ double score;
+ if (level == 0) {
+ // We treat level-0 specially by bounding the number of files
+ // instead of number of bytes for two reasons:
+ //
+ // (1) With larger write-buffer sizes, it is nice not to do too
+ // many level-0 compactions.
+ //
+ // (2) The files in level-0 are merged on every read and
+ // therefore we wish to avoid too many files when the individual
+ // file size is small (perhaps because of a small write-buffer
+ // setting, or very high compression ratios, or lots of
+ // overwrites/deletions).
+ int num_sorted_runs = 0;
+ uint64_t total_size = 0;
+ for (auto* f : files_[level]) {
+ if (!f->being_compacted) {
+ total_size += f->compensated_file_size;
+ num_sorted_runs++;
+ }
+ }
+ if (compaction_style_ == kCompactionStyleUniversal) {
+ // For universal compaction, we use level0 score to indicate
+ // compaction score for the whole DB. Adding other levels as if
+ // they are L0 files.
+ for (int i = 1; i < num_levels(); i++) {
+ if (!files_[i].empty() && !files_[i][0]->being_compacted) {
+ num_sorted_runs++;
+ }
+ }
+ }
+
+ if (compaction_style_ == kCompactionStyleFIFO) {
+ score = static_cast<double>(total_size) /
+ mutable_cf_options.compaction_options_fifo.max_table_files_size;
+ if (mutable_cf_options.compaction_options_fifo.allow_compaction) {
+ score = std::max(
+ static_cast<double>(num_sorted_runs) /
+ mutable_cf_options.level0_file_num_compaction_trigger,
+ score);
+ }
+ if (mutable_cf_options.ttl > 0) {
+ score = std::max(
+ static_cast<double>(GetExpiredTtlFilesCount(
+ immutable_cf_options, mutable_cf_options, files_[level])),
+ score);
+ }
+
+ } else {
+ score = static_cast<double>(num_sorted_runs) /
+ mutable_cf_options.level0_file_num_compaction_trigger;
+ if (compaction_style_ == kCompactionStyleLevel && num_levels() > 1) {
+ // Level-based involves L0->L0 compactions that can lead to oversized
+ // L0 files. Take into account size as well to avoid later giant
+ // compactions to the base level.
+ score = std::max(
+ score, static_cast<double>(total_size) /
+ mutable_cf_options.max_bytes_for_level_base);
+ }
+ }
+ } else {
+ // Compute the ratio of current size to size limit.
+ uint64_t level_bytes_no_compacting = 0;
+ for (auto f : files_[level]) {
+ if (!f->being_compacted) {
+ level_bytes_no_compacting += f->compensated_file_size;
+ }
+ }
+ score = static_cast<double>(level_bytes_no_compacting) /
+ MaxBytesForLevel(level);
+ }
+ compaction_level_[level] = level;
+ compaction_score_[level] = score;
+ }
+
+ // Sort all the levels based on their score. Higher scores get listed
+ // first. Use bubble sort because the number of entries is small.
+ for (int i = 0; i < num_levels() - 2; i++) {
+ for (int j = i + 1; j < num_levels() - 1; j++) {
+ if (compaction_score_[i] < compaction_score_[j]) {
+ double score = compaction_score_[i];
+ int level = compaction_level_[i];
+ compaction_score_[i] = compaction_score_[j];
+ compaction_level_[i] = compaction_level_[j];
+ compaction_score_[j] = score;
+ compaction_level_[j] = level;
+ }
+ }
+ }
+ ComputeFilesMarkedForCompaction();
+ ComputeBottommostFilesMarkedForCompaction();
+ if (mutable_cf_options.ttl > 0) {
+ ComputeExpiredTtlFiles(immutable_cf_options, mutable_cf_options.ttl);
+ }
+ if (mutable_cf_options.periodic_compaction_seconds > 0) {
+ ComputeFilesMarkedForPeriodicCompaction(
+ immutable_cf_options, mutable_cf_options.periodic_compaction_seconds);
+ }
+ EstimateCompactionBytesNeeded(mutable_cf_options);
+}
+
+void VersionStorageInfo::ComputeFilesMarkedForCompaction() {
+ files_marked_for_compaction_.clear();
+ int last_qualify_level = 0;
+
+ // Do not include files from the last level that contains data.
+ // If the table properties collector suggests compacting a file on that
+ // last level, we should not move it to a new level.
+ for (int level = num_levels() - 1; level >= 1; level--) {
+ if (!files_[level].empty()) {
+ last_qualify_level = level - 1;
+ break;
+ }
+ }
+
+ for (int level = 0; level <= last_qualify_level; level++) {
+ for (auto* f : files_[level]) {
+ if (!f->being_compacted && f->marked_for_compaction) {
+ files_marked_for_compaction_.emplace_back(level, f);
+ }
+ }
+ }
+}
+
+void VersionStorageInfo::ComputeExpiredTtlFiles(
+ const ImmutableCFOptions& ioptions, const uint64_t ttl) {
+ assert(ttl > 0);
+
+ expired_ttl_files_.clear();
+
+ int64_t _current_time;
+ auto status = ioptions.env->GetCurrentTime(&_current_time);
+ if (!status.ok()) {
+ return;
+ }
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+
+ for (int level = 0; level < num_levels() - 1; level++) {
+ for (FileMetaData* f : files_[level]) {
+ if (!f->being_compacted) {
+ uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime();
+ if (oldest_ancester_time > 0 &&
+ oldest_ancester_time < (current_time - ttl)) {
+ expired_ttl_files_.emplace_back(level, f);
+ }
+ }
+ }
+ }
+}
+
+void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction(
+ const ImmutableCFOptions& ioptions,
+ const uint64_t periodic_compaction_seconds) {
+ assert(periodic_compaction_seconds > 0);
+
+ files_marked_for_periodic_compaction_.clear();
+
+ int64_t temp_current_time;
+ auto status = ioptions.env->GetCurrentTime(&temp_current_time);
+ if (!status.ok()) {
+ return;
+ }
+ const uint64_t current_time = static_cast<uint64_t>(temp_current_time);
+
+ // If periodic_compaction_seconds is larger than current time, periodic
+ // compaction can't possibly be triggered.
+ if (periodic_compaction_seconds > current_time) {
+ return;
+ }
+
+ const uint64_t allowed_time_limit =
+ current_time - periodic_compaction_seconds;
+
+ for (int level = 0; level < num_levels(); level++) {
+ for (auto f : files_[level]) {
+ if (!f->being_compacted) {
+ // Compute a file's modification time in the following order:
+ // 1. Use file_creation_time table property if it is > 0.
+ // 2. Use creation_time table property if it is > 0.
+ // 3. Use file's mtime metadata if the above two table properties are 0.
+ // Don't consider the file at all if the modification time cannot be
+ // correctly determined based on the above conditions.
+ uint64_t file_modification_time = f->TryGetFileCreationTime();
+ if (file_modification_time == kUnknownFileCreationTime) {
+ file_modification_time = f->TryGetOldestAncesterTime();
+ }
+ if (file_modification_time == kUnknownOldestAncesterTime) {
+ auto file_path = TableFileName(ioptions.cf_paths, f->fd.GetNumber(),
+ f->fd.GetPathId());
+ status = ioptions.env->GetFileModificationTime(
+ file_path, &file_modification_time);
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(ioptions.info_log,
+ "Can't get file modification time: %s: %s",
+ file_path.c_str(), status.ToString().c_str());
+ continue;
+ }
+ }
+ if (file_modification_time > 0 &&
+ file_modification_time < allowed_time_limit) {
+ files_marked_for_periodic_compaction_.emplace_back(level, f);
+ }
+ }
+ }
+ }
+}
+
+namespace {
+
+// used to sort files by size
+struct Fsize {
+ size_t index;
+ FileMetaData* file;
+};
+
+// Comparator that is used to sort files based on their size.
+// In normal mode: descending size.
+bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) {
+ return (first.file->compensated_file_size >
+ second.file->compensated_file_size);
+}
+} // anonymous namespace
+
+void VersionStorageInfo::AddFile(int level, FileMetaData* f, Logger* info_log) {
+ auto* level_files = &files_[level];
+ // Must not overlap
+#ifndef NDEBUG
+ if (level > 0 && !level_files->empty() &&
+ internal_comparator_->Compare(
+ (*level_files)[level_files->size() - 1]->largest, f->smallest) >= 0) {
+ auto* f2 = (*level_files)[level_files->size() - 1];
+ if (info_log != nullptr) {
+ Error(info_log, "Adding new file %" PRIu64
+ " range (%s, %s) to level %d but overlapping "
+ "with existing file %" PRIu64 " %s %s",
+ f->fd.GetNumber(), f->smallest.DebugString(true).c_str(),
+ f->largest.DebugString(true).c_str(), level, f2->fd.GetNumber(),
+ f2->smallest.DebugString(true).c_str(),
+ f2->largest.DebugString(true).c_str());
+ LogFlush(info_log);
+ }
+ assert(false);
+ }
+#else
+ (void)info_log;
+#endif
+ f->refs++;
+ level_files->push_back(f);
+}
+
+// Version::PrepareApply() needs to be called before calling this function, or
+// the following functions need to have been called:
+// 1. UpdateNumNonEmptyLevels();
+// 2. CalculateBaseBytes();
+// 3. UpdateFilesByCompactionPri();
+// 4. GenerateFileIndexer();
+// 5. GenerateLevelFilesBrief();
+// 6. GenerateLevel0NonOverlapping();
+// 7. GenerateBottommostFiles();
+void VersionStorageInfo::SetFinalized() {
+ finalized_ = true;
+#ifndef NDEBUG
+ if (compaction_style_ != kCompactionStyleLevel) {
+ // Not level based compaction.
+ return;
+ }
+ assert(base_level_ < 0 || num_levels() == 1 ||
+ (base_level_ >= 1 && base_level_ < num_levels()));
+ // Verify all levels newer than base_level are empty except L0
+ for (int level = 1; level < base_level(); level++) {
+ assert(NumLevelBytes(level) == 0);
+ }
+ uint64_t max_bytes_prev_level = 0;
+ for (int level = base_level(); level < num_levels() - 1; level++) {
+ if (LevelFiles(level).size() == 0) {
+ continue;
+ }
+ assert(MaxBytesForLevel(level) >= max_bytes_prev_level);
+ max_bytes_prev_level = MaxBytesForLevel(level);
+ }
+ int num_empty_non_l0_level = 0;
+ for (int level = 0; level < num_levels(); level++) {
+ assert(LevelFiles(level).size() == 0 ||
+ LevelFiles(level).size() == LevelFilesBrief(level).num_files);
+ if (level > 0 && NumLevelBytes(level) > 0) {
+ num_empty_non_l0_level++;
+ }
+ if (LevelFiles(level).size() > 0) {
+ assert(level < num_non_empty_levels());
+ }
+ }
+ assert(compaction_level_.size() > 0);
+ assert(compaction_level_.size() == compaction_score_.size());
+#endif
+}
+
+void VersionStorageInfo::UpdateNumNonEmptyLevels() {
+ num_non_empty_levels_ = num_levels_;
+ for (int i = num_levels_ - 1; i >= 0; i--) {
+ if (files_[i].size() != 0) {
+ return;
+ } else {
+ num_non_empty_levels_ = i;
+ }
+ }
+}
+
+namespace {
+// Sort `temp` based on the ratio of overlapping size to file size
+void SortFileByOverlappingRatio(
+ const InternalKeyComparator& icmp, const std::vector<FileMetaData*>& files,
+ const std::vector<FileMetaData*>& next_level_files,
+ std::vector<Fsize>* temp) {
+ std::unordered_map<uint64_t, uint64_t> file_to_order;
+ auto next_level_it = next_level_files.begin();
+
+ for (auto& file : files) {
+ uint64_t overlapping_bytes = 0;
+    // Skip files in the next level that are entirely before the current file
+ while (next_level_it != next_level_files.end() &&
+ icmp.Compare((*next_level_it)->largest, file->smallest) < 0) {
+ next_level_it++;
+ }
+
+ while (next_level_it != next_level_files.end() &&
+ icmp.Compare((*next_level_it)->smallest, file->largest) < 0) {
+ overlapping_bytes += (*next_level_it)->fd.file_size;
+
+ if (icmp.Compare((*next_level_it)->largest, file->largest) > 0) {
+        // The next-level file crosses the upper boundary of the current file.
+ break;
+ }
+ next_level_it++;
+ }
+
+ assert(file->compensated_file_size != 0);
+ file_to_order[file->fd.GetNumber()] =
+ overlapping_bytes * 1024u / file->compensated_file_size;
+ }
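+  // Files with a smaller overlap-to-size ratio are ordered first: compacting
+  // them rewrites the least next-level data relative to the data moved, which
+  // is what kMinOverlappingRatio optimizes for to keep write amplification
+  // low.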
+
+ std::sort(temp->begin(), temp->end(),
+ [&](const Fsize& f1, const Fsize& f2) -> bool {
+ return file_to_order[f1.file->fd.GetNumber()] <
+ file_to_order[f2.file->fd.GetNumber()];
+ });
+}
+} // namespace
+
+void VersionStorageInfo::UpdateFilesByCompactionPri(
+ CompactionPri compaction_pri) {
+ if (compaction_style_ == kCompactionStyleNone ||
+ compaction_style_ == kCompactionStyleFIFO ||
+ compaction_style_ == kCompactionStyleUniversal) {
+ // don't need this
+ return;
+ }
+ // No need to sort the highest level because it is never compacted.
+ for (int level = 0; level < num_levels() - 1; level++) {
+ const std::vector<FileMetaData*>& files = files_[level];
+ auto& files_by_compaction_pri = files_by_compaction_pri_[level];
+ assert(files_by_compaction_pri.size() == 0);
+
+ // populate a temp vector for sorting based on size
+ std::vector<Fsize> temp(files.size());
+ for (size_t i = 0; i < files.size(); i++) {
+ temp[i].index = i;
+ temp[i].file = files[i];
+ }
+
+    // Sort the top kNumberFilesToSort files based on file size
+ size_t num = VersionStorageInfo::kNumberFilesToSort;
+ if (num > temp.size()) {
+ num = temp.size();
+ }
+ switch (compaction_pri) {
+ case kByCompensatedSize:
+ std::partial_sort(temp.begin(), temp.begin() + num, temp.end(),
+ CompareCompensatedSizeDescending);
+ break;
+ case kOldestLargestSeqFirst:
+ std::sort(temp.begin(), temp.end(),
+ [](const Fsize& f1, const Fsize& f2) -> bool {
+ return f1.file->fd.largest_seqno <
+ f2.file->fd.largest_seqno;
+ });
+ break;
+ case kOldestSmallestSeqFirst:
+ std::sort(temp.begin(), temp.end(),
+ [](const Fsize& f1, const Fsize& f2) -> bool {
+ return f1.file->fd.smallest_seqno <
+ f2.file->fd.smallest_seqno;
+ });
+ break;
+ case kMinOverlappingRatio:
+ SortFileByOverlappingRatio(*internal_comparator_, files_[level],
+ files_[level + 1], &temp);
+ break;
+ default:
+ assert(false);
+ }
+ assert(temp.size() == files.size());
+
+ // initialize files_by_compaction_pri_
+ for (size_t i = 0; i < temp.size(); i++) {
+ files_by_compaction_pri.push_back(static_cast<int>(temp[i].index));
+ }
+ next_file_to_compact_by_size_[level] = 0;
+ assert(files_[level].size() == files_by_compaction_pri_[level].size());
+ }
+}
+
+void VersionStorageInfo::GenerateLevel0NonOverlapping() {
+ assert(!finalized_);
+ level0_non_overlapping_ = true;
+ if (level_files_brief_.size() == 0) {
+ return;
+ }
+
+ // A copy of L0 files sorted by smallest key
+ std::vector<FdWithKeyRange> level0_sorted_file(
+ level_files_brief_[0].files,
+ level_files_brief_[0].files + level_files_brief_[0].num_files);
+ std::sort(level0_sorted_file.begin(), level0_sorted_file.end(),
+ [this](const FdWithKeyRange& f1, const FdWithKeyRange& f2) -> bool {
+ return (internal_comparator_->Compare(f1.smallest_key,
+ f2.smallest_key) < 0);
+ });
+
+ for (size_t i = 1; i < level0_sorted_file.size(); ++i) {
+ FdWithKeyRange& f = level0_sorted_file[i];
+ FdWithKeyRange& prev = level0_sorted_file[i - 1];
+ if (internal_comparator_->Compare(prev.largest_key, f.smallest_key) >= 0) {
+ level0_non_overlapping_ = false;
+ break;
+ }
+ }
+}
+
+void VersionStorageInfo::GenerateBottommostFiles() {
+ assert(!finalized_);
+ assert(bottommost_files_.empty());
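+  // A file is considered bottommost when no older sorted run (a deeper level,
+  // or an older L0 file) may contain keys in its range; see
+  // RangeMightExistAfterSortedRun() below.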
+ for (size_t level = 0; level < level_files_brief_.size(); ++level) {
+ for (size_t file_idx = 0; file_idx < level_files_brief_[level].num_files;
+ ++file_idx) {
+ const FdWithKeyRange& f = level_files_brief_[level].files[file_idx];
+ int l0_file_idx;
+ if (level == 0) {
+ l0_file_idx = static_cast<int>(file_idx);
+ } else {
+ l0_file_idx = -1;
+ }
+ Slice smallest_user_key = ExtractUserKey(f.smallest_key);
+ Slice largest_user_key = ExtractUserKey(f.largest_key);
+ if (!RangeMightExistAfterSortedRun(smallest_user_key, largest_user_key,
+ static_cast<int>(level),
+ l0_file_idx)) {
+ bottommost_files_.emplace_back(static_cast<int>(level),
+ f.file_metadata);
+ }
+ }
+ }
+}
+
+void VersionStorageInfo::UpdateOldestSnapshot(SequenceNumber seqnum) {
+ assert(seqnum >= oldest_snapshot_seqnum_);
+ oldest_snapshot_seqnum_ = seqnum;
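+  // bottommost_files_mark_threshold_ is the smallest largest_seqno among
+  // bottommost files that could not yet be marked because of a snapshot.
+  // Once the oldest snapshot advances past it, recomputing may mark more
+  // files for compaction.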
+ if (oldest_snapshot_seqnum_ > bottommost_files_mark_threshold_) {
+ ComputeBottommostFilesMarkedForCompaction();
+ }
+}
+
+void VersionStorageInfo::ComputeBottommostFilesMarkedForCompaction() {
+ bottommost_files_marked_for_compaction_.clear();
+ bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+ for (auto& level_and_file : bottommost_files_) {
+ if (!level_and_file.second->being_compacted &&
+ level_and_file.second->fd.largest_seqno != 0 &&
+ level_and_file.second->num_deletions > 1) {
+ // largest_seqno might be nonzero due to containing the final key in an
+      // earlier compaction, whose seqnum we didn't zero out. Multiple
+      // deletions ensure the file really contains deleted or overwritten keys.
+ if (level_and_file.second->fd.largest_seqno < oldest_snapshot_seqnum_) {
+ bottommost_files_marked_for_compaction_.push_back(level_and_file);
+ } else {
+ bottommost_files_mark_threshold_ =
+ std::min(bottommost_files_mark_threshold_,
+ level_and_file.second->fd.largest_seqno);
+ }
+ }
+ }
+}
+
+void Version::Ref() {
+ ++refs_;
+}
+
+bool Version::Unref() {
+ assert(refs_ >= 1);
+ --refs_;
+ if (refs_ == 0) {
+ delete this;
+ return true;
+ }
+ return false;
+}
+
+bool VersionStorageInfo::OverlapInLevel(int level,
+ const Slice* smallest_user_key,
+ const Slice* largest_user_key) {
+ if (level >= num_non_empty_levels_) {
+ // empty level, no overlap
+ return false;
+ }
+ return SomeFileOverlapsRange(*internal_comparator_, (level > 0),
+ level_files_brief_[level], smallest_user_key,
+ largest_user_key);
+}
+
+// Store in "*inputs" all files in "level" that overlap [begin,end]
+// If hint_index is specified, then it points to a file in the
+// overlapping range.
+// If file_index is non-null, *file_index is set to the index of one file in
+// the overlapping range.
+void VersionStorageInfo::GetOverlappingInputs(
+ int level, const InternalKey* begin, const InternalKey* end,
+ std::vector<FileMetaData*>* inputs, int hint_index, int* file_index,
+ bool expand_range, InternalKey** next_smallest) const {
+ if (level >= num_non_empty_levels_) {
+ // this level is empty, no overlapping inputs
+ return;
+ }
+
+ inputs->clear();
+ if (file_index) {
+ *file_index = -1;
+ }
+ const Comparator* user_cmp = user_comparator_;
+ if (level > 0) {
+ GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs, hint_index,
+ file_index, false, next_smallest);
+ return;
+ }
+
+ if (next_smallest) {
+ // next_smallest key only makes sense for non-level 0, where files are
+ // non-overlapping
+ *next_smallest = nullptr;
+ }
+
+ Slice user_begin, user_end;
+ if (begin != nullptr) {
+ user_begin = begin->user_key();
+ }
+ if (end != nullptr) {
+ user_end = end->user_key();
+ }
+
+  // `index` stores the indices of the files that still need to be checked.
+ std::list<size_t> index;
+ for (size_t i = 0; i < level_files_brief_[level].num_files; i++) {
+ index.emplace_back(i);
+ }
+
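+  // The scan repeats until a full pass finds no additional overlapping file;
+  // with expand_range, each file found may widen [user_begin, user_end] and
+  // pull in files that were skipped earlier.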
+ while (!index.empty()) {
+ bool found_overlapping_file = false;
+ auto iter = index.begin();
+ while (iter != index.end()) {
+ FdWithKeyRange* f = &(level_files_brief_[level].files[*iter]);
+ const Slice file_start = ExtractUserKey(f->smallest_key);
+ const Slice file_limit = ExtractUserKey(f->largest_key);
+ if (begin != nullptr &&
+ user_cmp->CompareWithoutTimestamp(file_limit, user_begin) < 0) {
+ // "f" is completely before specified range; skip it
+ iter++;
+ } else if (end != nullptr &&
+ user_cmp->CompareWithoutTimestamp(file_start, user_end) > 0) {
+ // "f" is completely after specified range; skip it
+ iter++;
+ } else {
+        // The file overlaps the range.
+ inputs->emplace_back(files_[level][*iter]);
+ found_overlapping_file = true;
+ // record the first file index.
+ if (file_index && *file_index == -1) {
+ *file_index = static_cast<int>(*iter);
+ }
+        // This file overlaps; erase it to avoid checking it again.
+ iter = index.erase(iter);
+ if (expand_range) {
+ if (begin != nullptr &&
+ user_cmp->CompareWithoutTimestamp(file_start, user_begin) < 0) {
+ user_begin = file_start;
+ }
+ if (end != nullptr &&
+ user_cmp->CompareWithoutTimestamp(file_limit, user_end) > 0) {
+ user_end = file_limit;
+ }
+ }
+ }
+ }
+    // If none of the remaining files overlap, stop.
+ if (!found_overlapping_file) {
+ break;
+ }
+ }
+}
+
+// Store in "*inputs" the files in "level" that are within range [begin,end].
+// Guarantee a "clean cut" boundary between the files in inputs and the
+// surrounding files, while including the maximum number of files.
+// This will ensure that no part of a key is lost during compaction.
+// If hint_index is specified, then it points to a file in the range.
+// If file_index is non-null, *file_index is set to the index of one file in
+// the range.
+void VersionStorageInfo::GetCleanInputsWithinInterval(
+ int level, const InternalKey* begin, const InternalKey* end,
+ std::vector<FileMetaData*>* inputs, int hint_index, int* file_index) const {
+ inputs->clear();
+ if (file_index) {
+ *file_index = -1;
+ }
+ if (level >= num_non_empty_levels_ || level == 0 ||
+ level_files_brief_[level].num_files == 0) {
+ // this level is empty, no inputs within range
+ // also don't support clean input interval within L0
+ return;
+ }
+
+ GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs,
+ hint_index, file_index,
+ true /* within_interval */);
+}
+
+// Store in "*inputs" all files in "level" that overlap [begin,end].
+// Employ binary search to find at least one file that overlaps the
+// specified range. From that file, iterate backwards and
+// forwards to find all overlapping files.
+// If within_interval is set, then only store the maximum set of clean inputs
+// within range [begin, end]. "Clean" means there is a boundary
+// between the files in "*inputs" and the surrounding files.
+void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch(
+ int level, const InternalKey* begin, const InternalKey* end,
+ std::vector<FileMetaData*>* inputs, int hint_index, int* file_index,
+ bool within_interval, InternalKey** next_smallest) const {
+ assert(level > 0);
+
+ auto user_cmp = user_comparator_;
+ const FdWithKeyRange* files = level_files_brief_[level].files;
+ const int num_files = static_cast<int>(level_files_brief_[level].num_files);
+
+ // begin to use binary search to find lower bound
+ // and upper bound.
+ int start_index = 0;
+ int end_index = num_files;
+
+ if (begin != nullptr) {
+    // When within_interval is true, compare `begin` against the file's
+    // smallest key, so that std::lower_bound starts at the first file that
+    // begins at or after `begin` (keeping the cut clean at the lower end).
+ auto cmp = [&user_cmp, &within_interval](const FdWithKeyRange& f,
+ const InternalKey* k) {
+ auto& file_key = within_interval ? f.file_metadata->smallest
+ : f.file_metadata->largest;
+ return sstableKeyCompare(user_cmp, file_key, *k) < 0;
+ };
+
+ start_index = static_cast<int>(
+ std::lower_bound(files,
+ files + (hint_index == -1 ? num_files : hint_index),
+ begin, cmp) -
+ files);
+
+ if (start_index > 0 && within_interval) {
+ bool is_overlapping = true;
+ while (is_overlapping && start_index < num_files) {
+ auto& pre_limit = files[start_index - 1].file_metadata->largest;
+ auto& cur_start = files[start_index].file_metadata->smallest;
+ is_overlapping = sstableKeyCompare(user_cmp, pre_limit, cur_start) == 0;
+ start_index += is_overlapping;
+ }
+ }
+ }
+
+ if (end != nullptr) {
+    // When within_interval is true, compare `end` against the file's
+    // largest key, so that std::upper_bound stops before the first file that
+    // ends after `end` (keeping the cut clean at the upper end).
+ auto cmp = [&user_cmp, &within_interval](const InternalKey* k,
+ const FdWithKeyRange& f) {
+ auto& file_key = within_interval ? f.file_metadata->largest
+ : f.file_metadata->smallest;
+ return sstableKeyCompare(user_cmp, *k, file_key) < 0;
+ };
+
+ end_index = static_cast<int>(
+ std::upper_bound(files + start_index, files + num_files, end, cmp) -
+ files);
+
+ if (end_index < num_files && within_interval) {
+ bool is_overlapping = true;
+ while (is_overlapping && end_index > start_index) {
+ auto& next_start = files[end_index].file_metadata->smallest;
+ auto& cur_limit = files[end_index - 1].file_metadata->largest;
+ is_overlapping =
+ sstableKeyCompare(user_cmp, cur_limit, next_start) == 0;
+ end_index -= is_overlapping;
+ }
+ }
+ }
+
+ assert(start_index <= end_index);
+
+ // If there were no overlapping files, return immediately.
+ if (start_index == end_index) {
+ if (next_smallest) {
+ *next_smallest = nullptr;
+ }
+ return;
+ }
+
+ assert(start_index < end_index);
+
+ // returns the index where an overlap is found
+ if (file_index) {
+ *file_index = start_index;
+ }
+
+ // insert overlapping files into vector
+ for (int i = start_index; i < end_index; i++) {
+ inputs->push_back(files_[level][i]);
+ }
+
+ if (next_smallest != nullptr) {
+ // Provide the next key outside the range covered by inputs
+ if (end_index < static_cast<int>(files_[level].size())) {
+ **next_smallest = files_[level][end_index]->smallest;
+ } else {
+ *next_smallest = nullptr;
+ }
+ }
+}
+
+uint64_t VersionStorageInfo::NumLevelBytes(int level) const {
+ assert(level >= 0);
+ assert(level < num_levels());
+ return TotalFileSize(files_[level]);
+}
+
+const char* VersionStorageInfo::LevelSummary(
+ LevelSummaryStorage* scratch) const {
+ int len = 0;
+ if (compaction_style_ == kCompactionStyleLevel && num_levels() > 1) {
+ assert(base_level_ < static_cast<int>(level_max_bytes_.size()));
+ if (level_multiplier_ != 0.0) {
+ len = snprintf(
+ scratch->buffer, sizeof(scratch->buffer),
+ "base level %d level multiplier %.2f max bytes base %" PRIu64 " ",
+ base_level_, level_multiplier_, level_max_bytes_[base_level_]);
+ }
+ }
+ len +=
+ snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "files[");
+ for (int i = 0; i < num_levels(); i++) {
+ int sz = sizeof(scratch->buffer) - len;
+ int ret = snprintf(scratch->buffer + len, sz, "%d ", int(files_[i].size()));
+ if (ret < 0 || ret >= sz) break;
+ len += ret;
+ }
+ if (len > 0) {
+ // overwrite the last space
+ --len;
+ }
+ len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+ "] max score %.2f", compaction_score_[0]);
+
+ if (!files_marked_for_compaction_.empty()) {
+ snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+ " (%" ROCKSDB_PRIszt " files need compaction)",
+ files_marked_for_compaction_.size());
+ }
+
+ return scratch->buffer;
+}
+
+const char* VersionStorageInfo::LevelFileSummary(FileSummaryStorage* scratch,
+ int level) const {
+ int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size[");
+ for (const auto& f : files_[level]) {
+ int sz = sizeof(scratch->buffer) - len;
+ char sztxt[16];
+ AppendHumanBytes(f->fd.GetFileSize(), sztxt, sizeof(sztxt));
+ int ret = snprintf(scratch->buffer + len, sz,
+ "#%" PRIu64 "(seq=%" PRIu64 ",sz=%s,%d) ",
+ f->fd.GetNumber(), f->fd.smallest_seqno, sztxt,
+ static_cast<int>(f->being_compacted));
+ if (ret < 0 || ret >= sz)
+ break;
+ len += ret;
+ }
+ // overwrite the last space (only if files_[level].size() is non-zero)
+ if (files_[level].size() && len > 0) {
+ --len;
+ }
+ snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]");
+ return scratch->buffer;
+}
+
+int64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() {
+ uint64_t result = 0;
+ std::vector<FileMetaData*> overlaps;
+ for (int level = 1; level < num_levels() - 1; level++) {
+ for (const auto& f : files_[level]) {
+ GetOverlappingInputs(level + 1, &f->smallest, &f->largest, &overlaps);
+ const uint64_t sum = TotalFileSize(overlaps);
+ if (sum > result) {
+ result = sum;
+ }
+ }
+ }
+ return result;
+}
+
+uint64_t VersionStorageInfo::MaxBytesForLevel(int level) const {
+ // Note: the result for level zero is not really used since we set
+ // the level-0 compaction threshold based on number of files.
+ assert(level >= 0);
+ assert(level < static_cast<int>(level_max_bytes_.size()));
+ return level_max_bytes_[level];
+}
+
+void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& options) {
+  // Special logic to set the number of sorted runs.
+  // This matches the previous behavior when all files were in L0.
+ int num_l0_count = static_cast<int>(files_[0].size());
+ if (compaction_style_ == kCompactionStyleUniversal) {
+ // For universal compaction, we use level0 score to indicate
+ // compaction score for the whole DB. Adding other levels as if
+ // they are L0 files.
+ for (int i = 1; i < num_levels(); i++) {
+ if (!files_[i].empty()) {
+ num_l0_count++;
+ }
+ }
+ }
+ set_l0_delay_trigger_count(num_l0_count);
+
+ level_max_bytes_.resize(ioptions.num_levels);
+ if (!ioptions.level_compaction_dynamic_level_bytes) {
+ base_level_ = (ioptions.compaction_style == kCompactionStyleLevel) ? 1 : -1;
+
+ // Calculate for static bytes base case
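+    // Illustrative example (numbers only for illustration): with
+    // max_bytes_for_level_base = 256MB, max_bytes_for_level_multiplier = 10
+    // and no per-level additional multipliers, level compaction yields
+    // L1 = 256MB, L2 = 2.5GB, L3 = 25GB, and so on.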
+ for (int i = 0; i < ioptions.num_levels; ++i) {
+ if (i == 0 && ioptions.compaction_style == kCompactionStyleUniversal) {
+ level_max_bytes_[i] = options.max_bytes_for_level_base;
+ } else if (i > 1) {
+ level_max_bytes_[i] = MultiplyCheckOverflow(
+ MultiplyCheckOverflow(level_max_bytes_[i - 1],
+ options.max_bytes_for_level_multiplier),
+ options.MaxBytesMultiplerAdditional(i - 1));
+ } else {
+ level_max_bytes_[i] = options.max_bytes_for_level_base;
+ }
+ }
+ } else {
+ uint64_t max_level_size = 0;
+
+ int first_non_empty_level = -1;
+    // Find the size of the non-L0 level with the most data.
+    // We cannot use the size of the last level because it can be empty or
+    // smaller than previous levels after compaction.
+ for (int i = 1; i < num_levels_; i++) {
+ uint64_t total_size = 0;
+ for (const auto& f : files_[i]) {
+ total_size += f->fd.GetFileSize();
+ }
+ if (total_size > 0 && first_non_empty_level == -1) {
+ first_non_empty_level = i;
+ }
+ if (total_size > max_level_size) {
+ max_level_size = total_size;
+ }
+ }
+
+ // Prefill every level's max bytes to disallow compaction from there.
+ for (int i = 0; i < num_levels_; i++) {
+ level_max_bytes_[i] = std::numeric_limits<uint64_t>::max();
+ }
+
+ if (max_level_size == 0) {
+ // No data for L1 and up. L0 compacts to last level directly.
+ // No compaction from L1+ needs to be scheduled.
+ base_level_ = num_levels_ - 1;
+ } else {
+ uint64_t l0_size = 0;
+ for (const auto& f : files_[0]) {
+ l0_size += f->fd.GetFileSize();
+ }
+
+ uint64_t base_bytes_max =
+ std::max(options.max_bytes_for_level_base, l0_size);
+ uint64_t base_bytes_min = static_cast<uint64_t>(
+ base_bytes_max / options.max_bytes_for_level_multiplier);
+
+      // See whether we can make the last level's target size equal to
+      // max_level_size.
+ uint64_t cur_level_size = max_level_size;
+ for (int i = num_levels_ - 2; i >= first_non_empty_level; i--) {
+ // Round up after dividing
+ cur_level_size = static_cast<uint64_t>(
+ cur_level_size / options.max_bytes_for_level_multiplier);
+ }
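+      // Illustrative example: starting from max_level_size = 100GB with a
+      // multiplier of 10, two iterations of this loop leave cur_level_size
+      // at 1GB, which is then used below to pick the base level.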
+
+ // Calculate base level and its size.
+ uint64_t base_level_size;
+ if (cur_level_size <= base_bytes_min) {
+        // Case 1. If we make the target size of the last level
+        // max_level_size, the target size of the first non-empty level
+        // would be smaller than base_bytes_min. We set it to base_bytes_min.
+ base_level_size = base_bytes_min + 1U;
+ base_level_ = first_non_empty_level;
+ ROCKS_LOG_INFO(ioptions.info_log,
+ "More existing levels in DB than needed. "
+ "max_bytes_for_level_multiplier may not be guaranteed.");
+ } else {
+ // Find base level (where L0 data is compacted to).
+ base_level_ = first_non_empty_level;
+ while (base_level_ > 1 && cur_level_size > base_bytes_max) {
+ --base_level_;
+ cur_level_size = static_cast<uint64_t>(
+ cur_level_size / options.max_bytes_for_level_multiplier);
+ }
+ if (cur_level_size > base_bytes_max) {
+ // Even L1 will be too large
+ assert(base_level_ == 1);
+ base_level_size = base_bytes_max;
+ } else {
+ base_level_size = cur_level_size;
+ }
+ }
+
+ level_multiplier_ = options.max_bytes_for_level_multiplier;
+ assert(base_level_size > 0);
+ if (l0_size > base_level_size &&
+ (l0_size > options.max_bytes_for_level_base ||
+ static_cast<int>(files_[0].size() / 2) >=
+ options.level0_file_num_compaction_trigger)) {
+ // We adjust the base level according to actual L0 size, and adjust
+ // the level multiplier accordingly, when:
+ // 1. the L0 size is larger than level size base, or
+ // 2. number of L0 files reaches twice the L0->L1 compaction trigger
+        // Otherwise we don't adjust, to keep the LSM-tree structure stable,
+        // unless L0 compaction is backlogged.
+ base_level_size = l0_size;
+ if (base_level_ == num_levels_ - 1) {
+ level_multiplier_ = 1.0;
+ } else {
+ level_multiplier_ = std::pow(
+ static_cast<double>(max_level_size) /
+ static_cast<double>(base_level_size),
+ 1.0 / static_cast<double>(num_levels_ - base_level_ - 1));
+ }
+ }
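+      // Illustrative example: if L0 holds 1GB, the largest level holds 100GB,
+      // and the last level is two levels below the base level, the multiplier
+      // above works out to pow(100.0, 1.0 / 2) = 10.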
+
+ uint64_t level_size = base_level_size;
+ for (int i = base_level_; i < num_levels_; i++) {
+ if (i > base_level_) {
+ level_size = MultiplyCheckOverflow(level_size, level_multiplier_);
+ }
+ // Don't set any level below base_bytes_max. Otherwise, the LSM can
+ // assume an hourglass shape where L1+ sizes are smaller than L0. This
+ // causes compaction scoring, which depends on level sizes, to favor L1+
+ // at the expense of L0, which may fill up and stall.
+ level_max_bytes_[i] = std::max(level_size, base_bytes_max);
+ }
+ }
+ }
+}
+
+uint64_t VersionStorageInfo::EstimateLiveDataSize() const {
+  // Estimate the live data size by adding up, for each key range, the size
+  // of the file in the deepest level that covers it. Note: the estimate
+  // depends on the ordering of files in level 0 because files in level 0 can
+  // be overlapping.
+ uint64_t size = 0;
+
+ auto ikey_lt = [this](InternalKey* x, InternalKey* y) {
+ return internal_comparator_->Compare(*x, *y) < 0;
+ };
+ // (Ordered) map of largest keys in non-overlapping files
+ std::map<InternalKey*, FileMetaData*, decltype(ikey_lt)> ranges(ikey_lt);
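+  // Levels are scanned from the bottom up, so a key range is attributed to
+  // its deepest file first; overlapping copies in shallower levels are then
+  // skipped and not double counted.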
+
+ for (int l = num_levels_ - 1; l >= 0; l--) {
+ bool found_end = false;
+ for (auto file : files_[l]) {
+ // Find the first file where the largest key is larger than the smallest
+ // key of the current file. If this file does not overlap with the
+ // current file, none of the files in the map does. If there is
+ // no potential overlap, we can safely insert the rest of this level
+ // (if the level is not 0) into the map without checking again because
+ // the elements in the level are sorted and non-overlapping.
+ auto lb = (found_end && l != 0) ?
+ ranges.end() : ranges.lower_bound(&file->smallest);
+ found_end = (lb == ranges.end());
+ if (found_end || internal_comparator_->Compare(
+ file->largest, (*lb).second->smallest) < 0) {
+ ranges.emplace_hint(lb, &file->largest, file);
+ size += file->fd.file_size;
+ }
+ }
+ }
+ return size;
+}
+
+bool VersionStorageInfo::RangeMightExistAfterSortedRun(
+ const Slice& smallest_user_key, const Slice& largest_user_key,
+ int last_level, int last_l0_idx) {
+ assert((last_l0_idx != -1) == (last_level == 0));
+ // TODO(ajkr): this preserves earlier behavior where we considered an L0 file
+ // bottommost only if it's the oldest L0 file and there are no files on older
+ // levels. It'd be better to consider it bottommost if there's no overlap in
+ // older levels/files.
+ if (last_level == 0 &&
+ last_l0_idx != static_cast<int>(LevelFiles(0).size() - 1)) {
+ return true;
+ }
+
+ // Checks whether there are files living beyond the `last_level`. If lower
+ // levels have files, it checks for overlap between [`smallest_key`,
+  // `largest_key`] and those files. Bottom-level optimizations can be made if
+ // there are no files in lower levels or if there is no overlap with the files
+ // in the lower levels.
+ for (int level = last_level + 1; level < num_levels(); level++) {
+ // The range is not in the bottommost level if there are files in lower
+ // levels when the `last_level` is 0 or if there are files in lower levels
+ // which overlap with [`smallest_key`, `largest_key`].
+ if (files_[level].size() > 0 &&
+ (last_level == 0 ||
+ OverlapInLevel(level, &smallest_user_key, &largest_user_key))) {
+ return true;
+ }
+ }
+ return false;
+}
+
+void Version::AddLiveFiles(std::vector<FileDescriptor>* live) {
+ for (int level = 0; level < storage_info_.num_levels(); level++) {
+ const std::vector<FileMetaData*>& files = storage_info_.files_[level];
+ for (const auto& file : files) {
+ live->push_back(file->fd);
+ }
+ }
+}
+
+std::string Version::DebugString(bool hex, bool print_stats) const {
+ std::string r;
+ for (int level = 0; level < storage_info_.num_levels_; level++) {
+ // E.g.,
+ // --- level 1 ---
+ // 17:123[1 .. 124]['a' .. 'd']
+ // 20:43[124 .. 128]['e' .. 'g']
+ //
+ // if print_stats=true:
+ // 17:123[1 .. 124]['a' .. 'd'](4096)
+ r.append("--- level ");
+ AppendNumberTo(&r, level);
+ r.append(" --- version# ");
+ AppendNumberTo(&r, version_number_);
+ r.append(" ---\n");
+ const std::vector<FileMetaData*>& files = storage_info_.files_[level];
+ for (size_t i = 0; i < files.size(); i++) {
+ r.push_back(' ');
+ AppendNumberTo(&r, files[i]->fd.GetNumber());
+ r.push_back(':');
+ AppendNumberTo(&r, files[i]->fd.GetFileSize());
+ r.append("[");
+ AppendNumberTo(&r, files[i]->fd.smallest_seqno);
+ r.append(" .. ");
+ AppendNumberTo(&r, files[i]->fd.largest_seqno);
+ r.append("]");
+ r.append("[");
+ r.append(files[i]->smallest.DebugString(hex));
+ r.append(" .. ");
+ r.append(files[i]->largest.DebugString(hex));
+ r.append("]");
+ if (files[i]->oldest_blob_file_number != kInvalidBlobFileNumber) {
+ r.append(" blob_file:");
+ AppendNumberTo(&r, files[i]->oldest_blob_file_number);
+ }
+ if (print_stats) {
+ r.append("(");
+ r.append(ToString(
+ files[i]->stats.num_reads_sampled.load(std::memory_order_relaxed)));
+ r.append(")");
+ }
+ r.append("\n");
+ }
+ }
+ return r;
+}
+
+// this is used to batch writes to the manifest file
+struct VersionSet::ManifestWriter {
+ Status status;
+ bool done;
+ InstrumentedCondVar cv;
+ ColumnFamilyData* cfd;
+ const MutableCFOptions mutable_cf_options;
+ const autovector<VersionEdit*>& edit_list;
+
+ explicit ManifestWriter(InstrumentedMutex* mu, ColumnFamilyData* _cfd,
+ const MutableCFOptions& cf_options,
+ const autovector<VersionEdit*>& e)
+ : done(false),
+ cv(mu),
+ cfd(_cfd),
+ mutable_cf_options(cf_options),
+ edit_list(e) {}
+};
+
+Status AtomicGroupReadBuffer::AddEdit(VersionEdit* edit) {
+ assert(edit);
+ if (edit->is_in_atomic_group_) {
+ TEST_SYNC_POINT("AtomicGroupReadBuffer::AddEdit:AtomicGroup");
+ if (replay_buffer_.empty()) {
+ replay_buffer_.resize(edit->remaining_entries_ + 1);
+ TEST_SYNC_POINT_CALLBACK(
+ "AtomicGroupReadBuffer::AddEdit:FirstInAtomicGroup", edit);
+ }
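+    // The first edit of an atomic group carries the number of edits still to
+    // come, so the buffer is sized to remaining_entries_ + 1 (this edit plus
+    // the remaining ones); every edit must agree with that size.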
+ read_edits_in_atomic_group_++;
+ if (read_edits_in_atomic_group_ + edit->remaining_entries_ !=
+ static_cast<uint32_t>(replay_buffer_.size())) {
+ TEST_SYNC_POINT_CALLBACK(
+ "AtomicGroupReadBuffer::AddEdit:IncorrectAtomicGroupSize", edit);
+ return Status::Corruption("corrupted atomic group");
+ }
+ replay_buffer_[read_edits_in_atomic_group_ - 1] = *edit;
+ if (read_edits_in_atomic_group_ == replay_buffer_.size()) {
+ TEST_SYNC_POINT_CALLBACK(
+ "AtomicGroupReadBuffer::AddEdit:LastInAtomicGroup", edit);
+ return Status::OK();
+ }
+ return Status::OK();
+ }
+
+ // A normal edit.
+ if (!replay_buffer().empty()) {
+ TEST_SYNC_POINT_CALLBACK(
+ "AtomicGroupReadBuffer::AddEdit:AtomicGroupMixedWithNormalEdits", edit);
+ return Status::Corruption("corrupted atomic group");
+ }
+ return Status::OK();
+}
+
+bool AtomicGroupReadBuffer::IsFull() const {
+ return read_edits_in_atomic_group_ == replay_buffer_.size();
+}
+
+bool AtomicGroupReadBuffer::IsEmpty() const { return replay_buffer_.empty(); }
+
+void AtomicGroupReadBuffer::Clear() {
+ read_edits_in_atomic_group_ = 0;
+ replay_buffer_.clear();
+}
+
+VersionSet::VersionSet(const std::string& dbname,
+ const ImmutableDBOptions* _db_options,
+ const FileOptions& storage_options, Cache* table_cache,
+ WriteBufferManager* write_buffer_manager,
+ WriteController* write_controller,
+ BlockCacheTracer* const block_cache_tracer)
+ : column_family_set_(new ColumnFamilySet(
+ dbname, _db_options, storage_options, table_cache,
+ write_buffer_manager, write_controller, block_cache_tracer)),
+ env_(_db_options->env),
+ fs_(_db_options->fs.get()),
+ dbname_(dbname),
+ db_options_(_db_options),
+ next_file_number_(2),
+ manifest_file_number_(0), // Filled by Recover()
+ options_file_number_(0),
+ pending_manifest_file_number_(0),
+ last_sequence_(0),
+ last_allocated_sequence_(0),
+ last_published_sequence_(0),
+ prev_log_number_(0),
+ current_version_number_(0),
+ manifest_file_size_(0),
+ file_options_(storage_options),
+ block_cache_tracer_(block_cache_tracer) {}
+
+VersionSet::~VersionSet() {
+ // we need to delete column_family_set_ because its destructor depends on
+ // VersionSet
+ Cache* table_cache = column_family_set_->get_table_cache();
+ column_family_set_.reset();
+ for (auto& file : obsolete_files_) {
+ if (file.metadata->table_reader_handle) {
+ table_cache->Release(file.metadata->table_reader_handle);
+ TableCache::Evict(table_cache, file.metadata->fd.GetNumber());
+ }
+ file.DeleteMetadata();
+ }
+ obsolete_files_.clear();
+}
+
+void VersionSet::AppendVersion(ColumnFamilyData* column_family_data,
+ Version* v) {
+ // compute new compaction score
+ v->storage_info()->ComputeCompactionScore(
+ *column_family_data->ioptions(),
+ *column_family_data->GetLatestMutableCFOptions());
+
+ // Mark v finalized
+ v->storage_info_.SetFinalized();
+
+ // Make "v" current
+ assert(v->refs_ == 0);
+ Version* current = column_family_data->current();
+ assert(v != current);
+ if (current != nullptr) {
+ assert(current->refs_ > 0);
+ current->Unref();
+ }
+ column_family_data->SetCurrent(v);
+ v->Ref();
+
+ // Append to linked list
+ v->prev_ = column_family_data->dummy_versions()->prev_;
+ v->next_ = column_family_data->dummy_versions();
+ v->prev_->next_ = v;
+ v->next_->prev_ = v;
+}
+
+Status VersionSet::ProcessManifestWrites(
+ std::deque<ManifestWriter>& writers, InstrumentedMutex* mu,
+ Directory* db_directory, bool new_descriptor_log,
+ const ColumnFamilyOptions* new_cf_options) {
+ assert(!writers.empty());
+ ManifestWriter& first_writer = writers.front();
+ ManifestWriter* last_writer = &first_writer;
+
+ assert(!manifest_writers_.empty());
+ assert(manifest_writers_.front() == &first_writer);
+
+ autovector<VersionEdit*> batch_edits;
+ autovector<Version*> versions;
+ autovector<const MutableCFOptions*> mutable_cf_options_ptrs;
+ std::vector<std::unique_ptr<BaseReferencedVersionBuilder>> builder_guards;
+
+ if (first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
+ // No group commits for column family add or drop
+ LogAndApplyCFHelper(first_writer.edit_list.front());
+ batch_edits.push_back(first_writer.edit_list.front());
+ } else {
+ auto it = manifest_writers_.cbegin();
+ size_t group_start = std::numeric_limits<size_t>::max();
+ while (it != manifest_writers_.cend()) {
+ if ((*it)->edit_list.front()->IsColumnFamilyManipulation()) {
+ // no group commits for column family add or drop
+ break;
+ }
+ last_writer = *(it++);
+ assert(last_writer != nullptr);
+ assert(last_writer->cfd != nullptr);
+ if (last_writer->cfd->IsDropped()) {
+ // If we detect a dropped CF at this point, and the corresponding
+ // version edits belong to an atomic group, then we need to find out
+ // the preceding version edits in the same atomic group, and update
+ // their `remaining_entries_` member variable because we are NOT going
+        // to write the version edits of the dropped CF to the MANIFEST. If we
+        // don't update them, then Recover can report a corrupted atomic group
+        // because the `remaining_entries_` values do not match.
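+        // Illustrative example: in an atomic group of 5 edits where the
+        // dropped CF contributes the 2 edits skipped here, each edit already
+        // placed in batch_edits has its remaining_entries_ reduced by 2.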
+ if (!batch_edits.empty()) {
+ if (batch_edits.back()->is_in_atomic_group_ &&
+ batch_edits.back()->remaining_entries_ > 0) {
+ assert(group_start < batch_edits.size());
+ const auto& edit_list = last_writer->edit_list;
+ size_t k = 0;
+ while (k < edit_list.size()) {
+ if (!edit_list[k]->is_in_atomic_group_) {
+ break;
+ } else if (edit_list[k]->remaining_entries_ == 0) {
+ ++k;
+ break;
+ }
+ ++k;
+ }
+ for (auto i = group_start; i < batch_edits.size(); ++i) {
+ assert(static_cast<uint32_t>(k) <=
+ batch_edits.back()->remaining_entries_);
+ batch_edits[i]->remaining_entries_ -= static_cast<uint32_t>(k);
+ }
+ }
+ }
+ continue;
+ }
+ // We do a linear search on versions because versions is small.
+ // TODO(yanqin) maybe consider unordered_map
+ Version* version = nullptr;
+ VersionBuilder* builder = nullptr;
+ for (int i = 0; i != static_cast<int>(versions.size()); ++i) {
+ uint32_t cf_id = last_writer->cfd->GetID();
+ if (versions[i]->cfd()->GetID() == cf_id) {
+ version = versions[i];
+ assert(!builder_guards.empty() &&
+ builder_guards.size() == versions.size());
+ builder = builder_guards[i]->version_builder();
+ TEST_SYNC_POINT_CALLBACK(
+ "VersionSet::ProcessManifestWrites:SameColumnFamily", &cf_id);
+ break;
+ }
+ }
+ if (version == nullptr) {
+ version = new Version(last_writer->cfd, this, file_options_,
+ last_writer->mutable_cf_options,
+ current_version_number_++);
+ versions.push_back(version);
+ mutable_cf_options_ptrs.push_back(&last_writer->mutable_cf_options);
+ builder_guards.emplace_back(
+ new BaseReferencedVersionBuilder(last_writer->cfd));
+ builder = builder_guards.back()->version_builder();
+ }
+ assert(builder != nullptr); // make checker happy
+ for (const auto& e : last_writer->edit_list) {
+ if (e->is_in_atomic_group_) {
+ if (batch_edits.empty() || !batch_edits.back()->is_in_atomic_group_ ||
+ (batch_edits.back()->is_in_atomic_group_ &&
+ batch_edits.back()->remaining_entries_ == 0)) {
+ group_start = batch_edits.size();
+ }
+ } else if (group_start != std::numeric_limits<size_t>::max()) {
+ group_start = std::numeric_limits<size_t>::max();
+ }
+ Status s = LogAndApplyHelper(last_writer->cfd, builder, e, mu);
+ if (!s.ok()) {
+ // free up the allocated memory
+ for (auto v : versions) {
+ delete v;
+ }
+ return s;
+ }
+ batch_edits.push_back(e);
+ }
+ }
+ for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
+ assert(!builder_guards.empty() &&
+ builder_guards.size() == versions.size());
+ auto* builder = builder_guards[i]->version_builder();
+ Status s = builder->SaveTo(versions[i]->storage_info());
+ if (!s.ok()) {
+ // free up the allocated memory
+ for (auto v : versions) {
+ delete v;
+ }
+ return s;
+ }
+ }
+ }
+
+#ifndef NDEBUG
+ // Verify that version edits of atomic groups have correct
+ // remaining_entries_.
+ size_t k = 0;
+ while (k < batch_edits.size()) {
+ while (k < batch_edits.size() && !batch_edits[k]->is_in_atomic_group_) {
+ ++k;
+ }
+ if (k == batch_edits.size()) {
+ break;
+ }
+ size_t i = k;
+ while (i < batch_edits.size()) {
+ if (!batch_edits[i]->is_in_atomic_group_) {
+ break;
+ }
+ assert(i - k + batch_edits[i]->remaining_entries_ ==
+ batch_edits[k]->remaining_entries_);
+ if (batch_edits[i]->remaining_entries_ == 0) {
+ ++i;
+ break;
+ }
+ ++i;
+ }
+ assert(batch_edits[i - 1]->is_in_atomic_group_);
+ assert(0 == batch_edits[i - 1]->remaining_entries_);
+ std::vector<VersionEdit*> tmp;
+ for (size_t j = k; j != i; ++j) {
+ tmp.emplace_back(batch_edits[j]);
+ }
+ TEST_SYNC_POINT_CALLBACK(
+ "VersionSet::ProcessManifestWrites:CheckOneAtomicGroup", &tmp);
+ k = i;
+ }
+#endif // NDEBUG
+
+ uint64_t new_manifest_file_size = 0;
+ Status s;
+
+ assert(pending_manifest_file_number_ == 0);
+ if (!descriptor_log_ ||
+ manifest_file_size_ > db_options_->max_manifest_file_size) {
+ TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:BeforeNewManifest");
+ new_descriptor_log = true;
+ } else {
+ pending_manifest_file_number_ = manifest_file_number_;
+ }
+
+ // Local cached copy of state variable(s). WriteCurrentStateToManifest()
+ // reads its content after releasing db mutex to avoid race with
+ // SwitchMemtable().
+ std::unordered_map<uint32_t, MutableCFState> curr_state;
+ if (new_descriptor_log) {
+ pending_manifest_file_number_ = NewFileNumber();
+ batch_edits.back()->SetNextFile(next_file_number_.load());
+
+ // if we are writing out new snapshot make sure to persist max column
+ // family.
+ if (column_family_set_->GetMaxColumnFamily() > 0) {
+ first_writer.edit_list.front()->SetMaxColumnFamily(
+ column_family_set_->GetMaxColumnFamily());
+ }
+ for (const auto* cfd : *column_family_set_) {
+ assert(curr_state.find(cfd->GetID()) == curr_state.end());
+ curr_state[cfd->GetID()] = {cfd->GetLogNumber()};
+ }
+ }
+
+ {
+ FileOptions opt_file_opts = fs_->OptimizeForManifestWrite(file_options_);
+ mu->Unlock();
+
+ TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifest");
+ if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
+ for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
+ assert(!builder_guards.empty() &&
+ builder_guards.size() == versions.size());
+ assert(!mutable_cf_options_ptrs.empty() &&
+ builder_guards.size() == versions.size());
+ ColumnFamilyData* cfd = versions[i]->cfd_;
+ s = builder_guards[i]->version_builder()->LoadTableHandlers(
+ cfd->internal_stats(), cfd->ioptions()->optimize_filters_for_hits,
+ true /* prefetch_index_and_filter_in_cache */,
+ false /* is_initial_load */,
+ mutable_cf_options_ptrs[i]->prefix_extractor.get());
+ if (!s.ok()) {
+ if (db_options_->paranoid_checks) {
+ break;
+ }
+ s = Status::OK();
+ }
+ }
+ }
+
+ if (s.ok() && new_descriptor_log) {
+ // This is fine because everything inside of this block is serialized --
+ // only one thread can be here at the same time
+ // create new manifest file
+ ROCKS_LOG_INFO(db_options_->info_log, "Creating manifest %" PRIu64 "\n",
+ pending_manifest_file_number_);
+ std::string descriptor_fname =
+ DescriptorFileName(dbname_, pending_manifest_file_number_);
+ std::unique_ptr<FSWritableFile> descriptor_file;
+ s = NewWritableFile(fs_, descriptor_fname, &descriptor_file,
+ opt_file_opts);
+ if (s.ok()) {
+ descriptor_file->SetPreallocationBlockSize(
+ db_options_->manifest_preallocation_size);
+
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ std::move(descriptor_file), descriptor_fname, opt_file_opts, env_,
+ nullptr, db_options_->listeners));
+ descriptor_log_.reset(
+ new log::Writer(std::move(file_writer), 0, false));
+ s = WriteCurrentStateToManifest(curr_state, descriptor_log_.get());
+ }
+ }
+
+ if (s.ok()) {
+ if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
+ for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
+ versions[i]->PrepareApply(*mutable_cf_options_ptrs[i], true);
+ }
+ }
+
+ // Write new records to MANIFEST log
+#ifndef NDEBUG
+ size_t idx = 0;
+#endif
+ for (auto& e : batch_edits) {
+ std::string record;
+ if (!e->EncodeTo(&record)) {
+ s = Status::Corruption("Unable to encode VersionEdit:" +
+ e->DebugString(true));
+ break;
+ }
+ TEST_KILL_RANDOM("VersionSet::LogAndApply:BeforeAddRecord",
+ rocksdb_kill_odds * REDUCE_ODDS2);
+#ifndef NDEBUG
+ if (batch_edits.size() > 1 && batch_edits.size() - 1 == idx) {
+ TEST_SYNC_POINT_CALLBACK(
+ "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0",
+ nullptr);
+ TEST_SYNC_POINT(
+ "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:1");
+ }
+ ++idx;
+#endif /* !NDEBUG */
+ s = descriptor_log_->AddRecord(record);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ if (s.ok()) {
+ s = SyncManifest(env_, db_options_, descriptor_log_->file());
+ }
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_options_->info_log, "MANIFEST write %s\n",
+ s.ToString().c_str());
+ }
+ }
+
+ // If we just created a new descriptor file, install it by writing a
+ // new CURRENT file that points to it.
+ if (s.ok() && new_descriptor_log) {
+ s = SetCurrentFile(env_, dbname_, pending_manifest_file_number_,
+ db_directory);
+ TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:AfterNewManifest");
+ }
+
+ if (s.ok()) {
+ // find offset in manifest file where this version is stored.
+ new_manifest_file_size = descriptor_log_->file()->GetFileSize();
+ }
+
+ if (first_writer.edit_list.front()->is_column_family_drop_) {
+ TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:0");
+ TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:1");
+ TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:2");
+ }
+
+ LogFlush(db_options_->info_log);
+ TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestDone");
+ mu->Lock();
+ }
+
+  // Append the old manifest file to the obsolete_manifests_ list to be
+  // deleted by PurgeObsoleteFiles later.
+ if (s.ok() && new_descriptor_log) {
+ obsolete_manifests_.emplace_back(
+ DescriptorFileName("", manifest_file_number_));
+ }
+
+ // Install the new versions
+ if (s.ok()) {
+ if (first_writer.edit_list.front()->is_column_family_add_) {
+ assert(batch_edits.size() == 1);
+ assert(new_cf_options != nullptr);
+ CreateColumnFamily(*new_cf_options, first_writer.edit_list.front());
+ } else if (first_writer.edit_list.front()->is_column_family_drop_) {
+ assert(batch_edits.size() == 1);
+ first_writer.cfd->SetDropped();
+ first_writer.cfd->UnrefAndTryDelete();
+ } else {
+ // Each version in versions corresponds to a column family.
+ // For each column family, update its log number indicating that logs
+ // with number smaller than this should be ignored.
+ for (const auto version : versions) {
+ uint64_t max_log_number_in_batch = 0;
+ uint32_t cf_id = version->cfd_->GetID();
+ for (const auto& e : batch_edits) {
+ if (e->has_log_number_ && e->column_family_ == cf_id) {
+ max_log_number_in_batch =
+ std::max(max_log_number_in_batch, e->log_number_);
+ }
+ }
+ if (max_log_number_in_batch != 0) {
+ assert(version->cfd_->GetLogNumber() <= max_log_number_in_batch);
+ version->cfd_->SetLogNumber(max_log_number_in_batch);
+ }
+ }
+
+ uint64_t last_min_log_number_to_keep = 0;
+ for (auto& e : batch_edits) {
+ if (e->has_min_log_number_to_keep_) {
+ last_min_log_number_to_keep =
+ std::max(last_min_log_number_to_keep, e->min_log_number_to_keep_);
+ }
+ }
+
+ if (last_min_log_number_to_keep != 0) {
+ // Should only be set in 2PC mode.
+ MarkMinLogNumberToKeep2PC(last_min_log_number_to_keep);
+ }
+
+ for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
+ ColumnFamilyData* cfd = versions[i]->cfd_;
+ AppendVersion(cfd, versions[i]);
+ }
+ }
+ manifest_file_number_ = pending_manifest_file_number_;
+ manifest_file_size_ = new_manifest_file_size;
+ prev_log_number_ = first_writer.edit_list.front()->prev_log_number_;
+ } else {
+ std::string version_edits;
+ for (auto& e : batch_edits) {
+ version_edits += ("\n" + e->DebugString(true));
+ }
+ ROCKS_LOG_ERROR(db_options_->info_log,
+ "Error in committing version edit to MANIFEST: %s",
+ version_edits.c_str());
+ for (auto v : versions) {
+ delete v;
+ }
+ // If manifest append failed for whatever reason, the file could be
+ // corrupted. So we need to force the next version update to start a
+ // new manifest file.
+ descriptor_log_.reset();
+ if (new_descriptor_log) {
+ ROCKS_LOG_INFO(db_options_->info_log,
+ "Deleting manifest %" PRIu64 " current manifest %" PRIu64
+ "\n",
+ manifest_file_number_, pending_manifest_file_number_);
+ env_->DeleteFile(
+ DescriptorFileName(dbname_, pending_manifest_file_number_));
+ }
+ }
+
+ pending_manifest_file_number_ = 0;
+
+ // wake up all the waiting writers
+ while (true) {
+ ManifestWriter* ready = manifest_writers_.front();
+ manifest_writers_.pop_front();
+ bool need_signal = true;
+ for (const auto& w : writers) {
+ if (&w == ready) {
+ need_signal = false;
+ break;
+ }
+ }
+ ready->status = s;
+ ready->done = true;
+ if (need_signal) {
+ ready->cv.Signal();
+ }
+ if (ready == last_writer) {
+ break;
+ }
+ }
+ if (!manifest_writers_.empty()) {
+ manifest_writers_.front()->cv.Signal();
+ }
+ return s;
+}
+
+// 'datas' is grammatically incorrect. We still use this notation to indicate
+// that this variable represents a collection of column_family_data.
+Status VersionSet::LogAndApply(
+ const autovector<ColumnFamilyData*>& column_family_datas,
+ const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+ const autovector<autovector<VersionEdit*>>& edit_lists,
+ InstrumentedMutex* mu, Directory* db_directory, bool new_descriptor_log,
+ const ColumnFamilyOptions* new_cf_options) {
+ mu->AssertHeld();
+ int num_edits = 0;
+ for (const auto& elist : edit_lists) {
+ num_edits += static_cast<int>(elist.size());
+ }
+ if (num_edits == 0) {
+ return Status::OK();
+ } else if (num_edits > 1) {
+#ifndef NDEBUG
+ for (const auto& edit_list : edit_lists) {
+ for (const auto& edit : edit_list) {
+ assert(!edit->IsColumnFamilyManipulation());
+ }
+ }
+#endif /* ! NDEBUG */
+ }
+
+ int num_cfds = static_cast<int>(column_family_datas.size());
+ if (num_cfds == 1 && column_family_datas[0] == nullptr) {
+ assert(edit_lists.size() == 1 && edit_lists[0].size() == 1);
+ assert(edit_lists[0][0]->is_column_family_add_);
+ assert(new_cf_options != nullptr);
+ }
+ std::deque<ManifestWriter> writers;
+ if (num_cfds > 0) {
+ assert(static_cast<size_t>(num_cfds) == mutable_cf_options_list.size());
+ assert(static_cast<size_t>(num_cfds) == edit_lists.size());
+ }
+ for (int i = 0; i < num_cfds; ++i) {
+ writers.emplace_back(mu, column_family_datas[i],
+ *mutable_cf_options_list[i], edit_lists[i]);
+ manifest_writers_.push_back(&writers[i]);
+ }
+ assert(!writers.empty());
+ ManifestWriter& first_writer = writers.front();
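+  // Writers are queued on manifest_writers_; the writer at the head performs
+  // the MANIFEST write for the whole group, and ProcessManifestWrites() marks
+  // the grouped writers done and signals them when it finishes.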
+ while (!first_writer.done && &first_writer != manifest_writers_.front()) {
+ first_writer.cv.Wait();
+ }
+ if (first_writer.done) {
+ // All non-CF-manipulation operations can be grouped together and committed
+ // to MANIFEST. They should all have finished. The status code is stored in
+ // the first manifest writer.
+#ifndef NDEBUG
+ for (const auto& writer : writers) {
+ assert(writer.done);
+ }
+#endif /* !NDEBUG */
+ return first_writer.status;
+ }
+
+ int num_undropped_cfds = 0;
+ for (auto cfd : column_family_datas) {
+ // if cfd == nullptr, it is a column family add.
+ if (cfd == nullptr || !cfd->IsDropped()) {
+ ++num_undropped_cfds;
+ }
+ }
+ if (0 == num_undropped_cfds) {
+ for (int i = 0; i != num_cfds; ++i) {
+ manifest_writers_.pop_front();
+ }
+ // Notify new head of manifest write queue.
+ if (!manifest_writers_.empty()) {
+ manifest_writers_.front()->cv.Signal();
+ }
+ return Status::ColumnFamilyDropped();
+ }
+
+ return ProcessManifestWrites(writers, mu, db_directory, new_descriptor_log,
+ new_cf_options);
+}
+
+void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) {
+ assert(edit->IsColumnFamilyManipulation());
+ edit->SetNextFile(next_file_number_.load());
+  // The log might have data that is not visible to the memtable and hence has
+  // not updated the last_sequence_ yet. It is also possible that the log is
+  // expecting some new data that is not written yet. Since LastSequence is an
+  // upper bound on the sequence, it is ok to record
+  // last_allocated_sequence_ as the last sequence.
+ edit->SetLastSequence(db_options_->two_write_queues ? last_allocated_sequence_
+ : last_sequence_);
+ if (edit->is_column_family_drop_) {
+    // If we drop a column family, we have to make sure to save the max
+    // column family, so that we don't reuse an existing ID.
+ edit->SetMaxColumnFamily(column_family_set_->GetMaxColumnFamily());
+ }
+}
+
+Status VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd,
+ VersionBuilder* builder, VersionEdit* edit,
+ InstrumentedMutex* mu) {
+#ifdef NDEBUG
+ (void)cfd;
+#endif
+ mu->AssertHeld();
+ assert(!edit->IsColumnFamilyManipulation());
+
+ if (edit->has_log_number_) {
+ assert(edit->log_number_ >= cfd->GetLogNumber());
+ assert(edit->log_number_ < next_file_number_.load());
+ }
+
+ if (!edit->has_prev_log_number_) {
+ edit->SetPrevLogNumber(prev_log_number_);
+ }
+ edit->SetNextFile(next_file_number_.load());
+  // The log might have data that is not visible to the memtable and hence has
+  // not updated the last_sequence_ yet. It is also possible that the log is
+  // expecting some new data that is not written yet. Since LastSequence is an
+  // upper bound on the sequence, it is ok to record
+  // last_allocated_sequence_ as the last sequence.
+ edit->SetLastSequence(db_options_->two_write_queues ? last_allocated_sequence_
+ : last_sequence_);
+
+ Status s = builder->Apply(edit);
+
+ return s;
+}
+
+Status VersionSet::ApplyOneVersionEditToBuilder(
+ VersionEdit& edit,
+ const std::unordered_map<std::string, ColumnFamilyOptions>& name_to_options,
+ std::unordered_map<int, std::string>& column_families_not_found,
+ std::unordered_map<uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>&
+ builders,
+ VersionEditParams* version_edit_params) {
+  // Not found means that the user didn't supply that column
+  // family option AND we encountered a column family add
+  // record. Once we encounter a column family drop record,
+ // we will delete the column family from
+ // column_families_not_found.
+ bool cf_in_not_found = (column_families_not_found.find(edit.column_family_) !=
+ column_families_not_found.end());
+  // Being in `builders` means that the user supplied that column family
+  // option AND that we encountered a column family add record.
+ bool cf_in_builders = builders.find(edit.column_family_) != builders.end();
+
+ // they can't both be true
+ assert(!(cf_in_not_found && cf_in_builders));
+
+ ColumnFamilyData* cfd = nullptr;
+
+ if (edit.is_column_family_add_) {
+ if (cf_in_builders || cf_in_not_found) {
+ return Status::Corruption(
+ "Manifest adding the same column family twice: " +
+ edit.column_family_name_);
+ }
+ auto cf_options = name_to_options.find(edit.column_family_name_);
+ // implicitly add persistent_stats column family without requiring user
+ // to specify
+ bool is_persistent_stats_column_family =
+ edit.column_family_name_.compare(kPersistentStatsColumnFamilyName) == 0;
+ if (cf_options == name_to_options.end() &&
+ !is_persistent_stats_column_family) {
+ column_families_not_found.insert(
+ {edit.column_family_, edit.column_family_name_});
+ } else {
+ // recover persistent_stats CF from a DB that already contains it
+ if (is_persistent_stats_column_family) {
+ ColumnFamilyOptions cfo;
+ OptimizeForPersistentStats(&cfo);
+ cfd = CreateColumnFamily(cfo, &edit);
+ } else {
+ cfd = CreateColumnFamily(cf_options->second, &edit);
+ }
+ cfd->set_initialized();
+ builders.insert(std::make_pair(
+ edit.column_family_, std::unique_ptr<BaseReferencedVersionBuilder>(
+ new BaseReferencedVersionBuilder(cfd))));
+ }
+ } else if (edit.is_column_family_drop_) {
+ if (cf_in_builders) {
+ auto builder = builders.find(edit.column_family_);
+ assert(builder != builders.end());
+ builders.erase(builder);
+ cfd = column_family_set_->GetColumnFamily(edit.column_family_);
+ assert(cfd != nullptr);
+ if (cfd->UnrefAndTryDelete()) {
+ cfd = nullptr;
+ } else {
+ // who else can have reference to cfd!?
+ assert(false);
+ }
+ } else if (cf_in_not_found) {
+ column_families_not_found.erase(edit.column_family_);
+ } else {
+ return Status::Corruption(
+ "Manifest - dropping non-existing column family");
+ }
+ } else if (!cf_in_not_found) {
+ if (!cf_in_builders) {
+ return Status::Corruption(
+ "Manifest record referencing unknown column family");
+ }
+
+ cfd = column_family_set_->GetColumnFamily(edit.column_family_);
+ // this should never happen since cf_in_builders is true
+ assert(cfd != nullptr);
+
+ // if it is not column family add or column family drop,
+ // then it's a file add/delete, which should be forwarded
+ // to builder
+ auto builder = builders.find(edit.column_family_);
+ assert(builder != builders.end());
+ Status s = builder->second->version_builder()->Apply(&edit);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ return ExtractInfoFromVersionEdit(cfd, edit, version_edit_params);
+}
+
+Status VersionSet::ExtractInfoFromVersionEdit(
+ ColumnFamilyData* cfd, const VersionEdit& from_edit,
+ VersionEditParams* version_edit_params) {
+ if (cfd != nullptr) {
+ if (from_edit.has_db_id_) {
+ version_edit_params->SetDBId(from_edit.db_id_);
+ }
+ if (from_edit.has_log_number_) {
+ if (cfd->GetLogNumber() > from_edit.log_number_) {
+ ROCKS_LOG_WARN(
+ db_options_->info_log,
+ "MANIFEST corruption detected, but ignored - Log numbers in "
+ "records NOT monotonically increasing");
+ } else {
+ cfd->SetLogNumber(from_edit.log_number_);
+ version_edit_params->SetLogNumber(from_edit.log_number_);
+ }
+ }
+ if (from_edit.has_comparator_ &&
+ from_edit.comparator_ != cfd->user_comparator()->Name()) {
+ return Status::InvalidArgument(
+ cfd->user_comparator()->Name(),
+ "does not match existing comparator " + from_edit.comparator_);
+ }
+ }
+
+ if (from_edit.has_prev_log_number_) {
+ version_edit_params->SetPrevLogNumber(from_edit.prev_log_number_);
+ }
+
+ if (from_edit.has_next_file_number_) {
+ version_edit_params->SetNextFile(from_edit.next_file_number_);
+ }
+
+ if (from_edit.has_max_column_family_) {
+ version_edit_params->SetMaxColumnFamily(from_edit.max_column_family_);
+ }
+
+ if (from_edit.has_min_log_number_to_keep_) {
+ version_edit_params->min_log_number_to_keep_ =
+ std::max(version_edit_params->min_log_number_to_keep_,
+ from_edit.min_log_number_to_keep_);
+ }
+
+ if (from_edit.has_last_sequence_) {
+ version_edit_params->SetLastSequence(from_edit.last_sequence_);
+ }
+ return Status::OK();
+}
+
+Status VersionSet::GetCurrentManifestPath(const std::string& dbname,
+ FileSystem* fs,
+ std::string* manifest_path,
+ uint64_t* manifest_file_number) {
+ assert(fs != nullptr);
+ assert(manifest_path != nullptr);
+ assert(manifest_file_number != nullptr);
+
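+  // The CURRENT file is expected to hold the bare name of the active manifest
+  // followed by a trailing newline, e.g. "MANIFEST-000001\n" (the file number
+  // shown here is illustrative only).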
+ std::string fname;
+ Status s = ReadFileToString(fs, CurrentFileName(dbname), &fname);
+ if (!s.ok()) {
+ return s;
+ }
+ if (fname.empty() || fname.back() != '\n') {
+ return Status::Corruption("CURRENT file does not end with newline");
+ }
+ // remove the trailing '\n'
+ fname.resize(fname.size() - 1);
+ FileType type;
+ bool parse_ok = ParseFileName(fname, manifest_file_number, &type);
+ if (!parse_ok || type != kDescriptorFile) {
+ return Status::Corruption("CURRENT file corrupted");
+ }
+ *manifest_path = dbname;
+ if (dbname.back() != '/') {
+ manifest_path->push_back('/');
+ }
+ *manifest_path += fname;
+ return Status::OK();
+}
+
+Status VersionSet::ReadAndRecover(
+ log::Reader* reader, AtomicGroupReadBuffer* read_buffer,
+ const std::unordered_map<std::string, ColumnFamilyOptions>& name_to_options,
+ std::unordered_map<int, std::string>& column_families_not_found,
+ std::unordered_map<uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>&
+ builders,
+ VersionEditParams* version_edit_params, std::string* db_id) {
+ assert(reader != nullptr);
+ assert(read_buffer != nullptr);
+ Status s;
+ Slice record;
+ std::string scratch;
+ size_t recovered_edits = 0;
+ while (reader->ReadRecord(&record, &scratch) && s.ok()) {
+ VersionEdit edit;
+ s = edit.DecodeFrom(record);
+ if (!s.ok()) {
+ break;
+ }
+ if (edit.has_db_id_) {
+ db_id_ = edit.GetDbId();
+ if (db_id != nullptr) {
+ db_id->assign(edit.GetDbId());
+ }
+ }
+ s = read_buffer->AddEdit(&edit);
+ if (!s.ok()) {
+ break;
+ }
+ if (edit.is_in_atomic_group_) {
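+      // If the group is not yet complete, the edit just read stays buffered in
+      // *read_buffer; a later decode/apply failure clears the buffer before
+      // this function returns.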
+ if (read_buffer->IsFull()) {
+ // Apply edits in an atomic group when we have read all edits in the
+ // group.
+ for (auto& e : read_buffer->replay_buffer()) {
+ s = ApplyOneVersionEditToBuilder(e, name_to_options,
+ column_families_not_found, builders,
+ version_edit_params);
+ if (!s.ok()) {
+ break;
+ }
+ recovered_edits++;
+ }
+ if (!s.ok()) {
+ break;
+ }
+ read_buffer->Clear();
+ }
+ } else {
+ // Apply a normal edit immediately.
+ s = ApplyOneVersionEditToBuilder(edit, name_to_options,
+ column_families_not_found, builders,
+ version_edit_params);
+ if (s.ok()) {
+ recovered_edits++;
+ }
+ }
+ }
+ if (!s.ok()) {
+ // Clear the buffer if we fail to decode/apply an edit.
+ read_buffer->Clear();
+ }
+ TEST_SYNC_POINT_CALLBACK("VersionSet::ReadAndRecover:RecoveredEdits",
+ &recovered_edits);
+ return s;
+}
+
+Status VersionSet::Recover(
+ const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
+ std::string* db_id) {
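+  // Recovery, in outline (descriptive summary of the steps below): resolve the
+  // active MANIFEST via the CURRENT file, replay every VersionEdit into
+  // per-column-family version builders, validate that the required fields
+  // (next file number, log number, last sequence) were seen, and finally
+  // materialize and install a Version for each column family.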
+ std::unordered_map<std::string, ColumnFamilyOptions> cf_name_to_options;
+ for (const auto& cf : column_families) {
+ cf_name_to_options.emplace(cf.name, cf.options);
+ }
+  // Keeps track of column families in the manifest that were not found in the
+  // column_families parameter. If those column families are not dropped by
+  // subsequent manifest records, Recover() will return a failure status.
+ std::unordered_map<int, std::string> column_families_not_found;
+
+ // Read "CURRENT" file, which contains a pointer to the current manifest file
+ std::string manifest_path;
+ Status s = GetCurrentManifestPath(dbname_, fs_, &manifest_path,
+ &manifest_file_number_);
+ if (!s.ok()) {
+ return s;
+ }
+
+ ROCKS_LOG_INFO(db_options_->info_log, "Recovering from manifest file: %s\n",
+ manifest_path.c_str());
+
+ std::unique_ptr<SequentialFileReader> manifest_file_reader;
+ {
+ std::unique_ptr<FSSequentialFile> manifest_file;
+ s = fs_->NewSequentialFile(manifest_path,
+ fs_->OptimizeForManifestRead(file_options_),
+ &manifest_file, nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ manifest_file_reader.reset(
+ new SequentialFileReader(std::move(manifest_file), manifest_path,
+ db_options_->log_readahead_size));
+ }
+
+ std::unordered_map<uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>
+ builders;
+
+ // add default column family
+ auto default_cf_iter = cf_name_to_options.find(kDefaultColumnFamilyName);
+ if (default_cf_iter == cf_name_to_options.end()) {
+ return Status::InvalidArgument("Default column family not specified");
+ }
+ VersionEdit default_cf_edit;
+ default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName);
+ default_cf_edit.SetColumnFamily(0);
+ ColumnFamilyData* default_cfd =
+ CreateColumnFamily(default_cf_iter->second, &default_cf_edit);
+ // In recovery, nobody else can access it, so it's fine to set it to be
+ // initialized earlier.
+ default_cfd->set_initialized();
+ builders.insert(
+ std::make_pair(0, std::unique_ptr<BaseReferencedVersionBuilder>(
+ new BaseReferencedVersionBuilder(default_cfd))));
+ uint64_t current_manifest_file_size = 0;
+ VersionEditParams version_edit_params;
+ {
+ VersionSet::LogReporter reporter;
+ reporter.status = &s;
+ log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter,
+ true /* checksum */, 0 /* log_number */);
+ Slice record;
+ std::string scratch;
+ AtomicGroupReadBuffer read_buffer;
+ s = ReadAndRecover(&reader, &read_buffer, cf_name_to_options,
+ column_families_not_found, builders,
+ &version_edit_params, db_id);
+ current_manifest_file_size = reader.GetReadOffset();
+ assert(current_manifest_file_size != 0);
+ }
+
+ if (s.ok()) {
+ if (!version_edit_params.has_next_file_number_) {
+ s = Status::Corruption("no meta-nextfile entry in descriptor");
+ } else if (!version_edit_params.has_log_number_) {
+ s = Status::Corruption("no meta-lognumber entry in descriptor");
+ } else if (!version_edit_params.has_last_sequence_) {
+ s = Status::Corruption("no last-sequence-number entry in descriptor");
+ }
+
+ if (!version_edit_params.has_prev_log_number_) {
+ version_edit_params.SetPrevLogNumber(0);
+ }
+
+ column_family_set_->UpdateMaxColumnFamily(
+ version_edit_params.max_column_family_);
+
+    // When reading a DB generated by an old release, min_log_number_to_keep=0.
+    // All log files will be scanned for potential prepare entries.
+ MarkMinLogNumberToKeep2PC(version_edit_params.min_log_number_to_keep_);
+ MarkFileNumberUsed(version_edit_params.prev_log_number_);
+ MarkFileNumberUsed(version_edit_params.log_number_);
+ }
+
+  // There were some column families in the MANIFEST that weren't specified
+  // in the argument. This is OK only in read_only mode.
+ if (read_only == false && !column_families_not_found.empty()) {
+ std::string list_of_not_found;
+ for (const auto& cf : column_families_not_found) {
+ list_of_not_found += ", " + cf.second;
+ }
+ list_of_not_found = list_of_not_found.substr(2);
+ s = Status::InvalidArgument(
+ "You have to open all column families. Column families not opened: " +
+ list_of_not_found);
+ }
+
+ if (s.ok()) {
+ for (auto cfd : *column_family_set_) {
+ assert(builders.count(cfd->GetID()) > 0);
+ auto* builder = builders[cfd->GetID()]->version_builder();
+ if (!builder->CheckConsistencyForNumLevels()) {
+ s = Status::InvalidArgument(
+ "db has more levels than options.num_levels");
+ break;
+ }
+ }
+ }
+
+ if (s.ok()) {
+ for (auto cfd : *column_family_set_) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ if (read_only) {
+ cfd->table_cache()->SetTablesAreImmortal();
+ }
+ assert(cfd->initialized());
+ auto builders_iter = builders.find(cfd->GetID());
+ assert(builders_iter != builders.end());
+ auto builder = builders_iter->second->version_builder();
+
+      // We have an unlimited table cache, so pre-load the table handles now.
+      // This needs to be done outside of the mutex.
+ s = builder->LoadTableHandlers(
+ cfd->internal_stats(), db_options_->max_file_opening_threads,
+ false /* prefetch_index_and_filter_in_cache */,
+ true /* is_initial_load */,
+ cfd->GetLatestMutableCFOptions()->prefix_extractor.get());
+ if (!s.ok()) {
+ if (db_options_->paranoid_checks) {
+ return s;
+ }
+ s = Status::OK();
+ }
+
+ Version* v = new Version(cfd, this, file_options_,
+ *cfd->GetLatestMutableCFOptions(),
+ current_version_number_++);
+ builder->SaveTo(v->storage_info());
+
+ // Install recovered version
+ v->PrepareApply(*cfd->GetLatestMutableCFOptions(),
+ !(db_options_->skip_stats_update_on_db_open));
+ AppendVersion(cfd, v);
+ }
+
+ manifest_file_size_ = current_manifest_file_size;
+ next_file_number_.store(version_edit_params.next_file_number_ + 1);
+ last_allocated_sequence_ = version_edit_params.last_sequence_;
+ last_published_sequence_ = version_edit_params.last_sequence_;
+ last_sequence_ = version_edit_params.last_sequence_;
+ prev_log_number_ = version_edit_params.prev_log_number_;
+
+ ROCKS_LOG_INFO(
+ db_options_->info_log,
+ "Recovered from manifest file:%s succeeded,"
+ "manifest_file_number is %" PRIu64 ", next_file_number is %" PRIu64
+ ", last_sequence is %" PRIu64 ", log_number is %" PRIu64
+ ",prev_log_number is %" PRIu64 ",max_column_family is %" PRIu32
+ ",min_log_number_to_keep is %" PRIu64 "\n",
+ manifest_path.c_str(), manifest_file_number_, next_file_number_.load(),
+ last_sequence_.load(), version_edit_params.log_number_,
+ prev_log_number_, column_family_set_->GetMaxColumnFamily(),
+ min_log_number_to_keep_2pc());
+
+ for (auto cfd : *column_family_set_) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ ROCKS_LOG_INFO(db_options_->info_log,
+ "Column family [%s] (ID %" PRIu32
+ "), log number is %" PRIu64 "\n",
+ cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber());
+ }
+ }
+
+ return s;
+}
+
+Status VersionSet::ListColumnFamilies(std::vector<std::string>* column_families,
+ const std::string& dbname,
+ FileSystem* fs) {
+  // these are just for performance reasons, not correctness,
+  // so we're fine using the defaults
+ FileOptions soptions;
+ // Read "CURRENT" file, which contains a pointer to the current manifest file
+ std::string manifest_path;
+ uint64_t manifest_file_number;
+ Status s =
+ GetCurrentManifestPath(dbname, fs, &manifest_path, &manifest_file_number);
+ if (!s.ok()) {
+ return s;
+ }
+
+ std::unique_ptr<SequentialFileReader> file_reader;
+ {
+ std::unique_ptr<FSSequentialFile> file;
+ s = fs->NewSequentialFile(manifest_path, soptions, &file, nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ file_reader.reset(new SequentialFileReader(std::move(file), manifest_path));
+ }
+
+ std::map<uint32_t, std::string> column_family_names;
+ // default column family is always implicitly there
+ column_family_names.insert({0, kDefaultColumnFamilyName});
+ VersionSet::LogReporter reporter;
+ reporter.status = &s;
+ log::Reader reader(nullptr, std::move(file_reader), &reporter,
+ true /* checksum */, 0 /* log_number */);
+ Slice record;
+ std::string scratch;
+ while (reader.ReadRecord(&record, &scratch) && s.ok()) {
+ VersionEdit edit;
+ s = edit.DecodeFrom(record);
+ if (!s.ok()) {
+ break;
+ }
+ if (edit.is_column_family_add_) {
+ if (column_family_names.find(edit.column_family_) !=
+ column_family_names.end()) {
+ s = Status::Corruption("Manifest adding the same column family twice");
+ break;
+ }
+ column_family_names.insert(
+ {edit.column_family_, edit.column_family_name_});
+ } else if (edit.is_column_family_drop_) {
+ if (column_family_names.find(edit.column_family_) ==
+ column_family_names.end()) {
+ s = Status::Corruption(
+ "Manifest - dropping non-existing column family");
+ break;
+ }
+ column_family_names.erase(edit.column_family_);
+ }
+ }
+
+ column_families->clear();
+ if (s.ok()) {
+ for (const auto& iter : column_family_names) {
+ column_families->push_back(iter.second);
+ }
+ }
+
+ return s;
+}
+
+#ifndef ROCKSDB_LITE
+Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
+ const Options* options,
+ const FileOptions& file_options,
+ int new_levels) {
+ if (new_levels <= 1) {
+ return Status::InvalidArgument(
+ "Number of levels needs to be bigger than 1");
+ }
+
+ ImmutableDBOptions db_options(*options);
+ ColumnFamilyOptions cf_options(*options);
+ std::shared_ptr<Cache> tc(NewLRUCache(options->max_open_files - 10,
+ options->table_cache_numshardbits));
+ WriteController wc(options->delayed_write_rate);
+ WriteBufferManager wb(options->db_write_buffer_size);
+ VersionSet versions(dbname, &db_options, file_options, tc.get(), &wb, &wc,
+ /*block_cache_tracer=*/nullptr);
+ Status status;
+
+ std::vector<ColumnFamilyDescriptor> dummy;
+ ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName,
+ ColumnFamilyOptions(*options));
+ dummy.push_back(dummy_descriptor);
+ status = versions.Recover(dummy);
+ if (!status.ok()) {
+ return status;
+ }
+
+ Version* current_version =
+ versions.GetColumnFamilySet()->GetDefault()->current();
+ auto* vstorage = current_version->storage_info();
+ int current_levels = vstorage->num_levels();
+
+ if (current_levels <= new_levels) {
+ return Status::OK();
+ }
+
+  // Make sure there are files on only one level from
+  // (new_levels-1) to (current_levels-1)
+ int first_nonempty_level = -1;
+ int first_nonempty_level_filenum = 0;
+ for (int i = new_levels - 1; i < current_levels; i++) {
+ int file_num = vstorage->NumLevelFiles(i);
+ if (file_num != 0) {
+ if (first_nonempty_level < 0) {
+ first_nonempty_level = i;
+ first_nonempty_level_filenum = file_num;
+ } else {
+ char msg[255];
+ snprintf(msg, sizeof(msg),
+ "Found at least two levels containing files: "
+ "[%d:%d],[%d:%d].\n",
+ first_nonempty_level, first_nonempty_level_filenum, i,
+ file_num);
+ return Status::InvalidArgument(msg);
+ }
+ }
+ }
+
+  // We need to allocate an array sized for the old number of levels to
+  // avoid a SIGSEGV in WriteCurrentStateToManifest();
+  // however, all levels greater than or equal to new_levels will be empty.
+ std::vector<FileMetaData*>* new_files_list =
+ new std::vector<FileMetaData*>[current_levels];
+ for (int i = 0; i < new_levels - 1; i++) {
+ new_files_list[i] = vstorage->LevelFiles(i);
+ }
+
+ if (first_nonempty_level > 0) {
+ new_files_list[new_levels - 1] = vstorage->LevelFiles(first_nonempty_level);
+ }
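+  // Illustrative example: with current_levels == 7 and new_levels == 3, the
+  // single non-empty level among L2..L6 (verified above) ends up as the new
+  // L2 (new_levels - 1), while L0 and L1 are carried over unchanged.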
+
+  delete[] vstorage->files_;
+ vstorage->files_ = new_files_list;
+ vstorage->num_levels_ = new_levels;
+
+ MutableCFOptions mutable_cf_options(*options);
+ VersionEdit ve;
+ InstrumentedMutex dummy_mutex;
+ InstrumentedMutexLock l(&dummy_mutex);
+ return versions.LogAndApply(
+ versions.GetColumnFamilySet()->GetDefault(),
+ mutable_cf_options, &ve, &dummy_mutex, nullptr, true);
+}
+
+// Get the checksum information including the checksum and checksum function
+// name of all SST files in VersionSet. Store the information in
+// FileChecksumList which contains a map from file number to its checksum info.
+// If the DB is not running, make sure to call VersionSet::Recover() to load
+// the file metadata from the MANIFEST into the VersionSet before calling this
+// function.
+Status VersionSet::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) {
+ // Clean the previously stored checksum information if any.
+ if (checksum_list == nullptr) {
+ return Status::InvalidArgument("checksum_list is nullptr");
+ }
+ checksum_list->reset();
+
+ for (auto cfd : *column_family_set_) {
+ if (cfd->IsDropped() || !cfd->initialized()) {
+ continue;
+ }
+ for (int level = 0; level < cfd->NumberLevels(); level++) {
+ for (const auto& file :
+ cfd->current()->storage_info()->LevelFiles(level)) {
+ checksum_list->InsertOneFileChecksum(file->fd.GetNumber(),
+ file->file_checksum,
+ file->file_checksum_func_name);
+ }
+ }
+ }
+ return Status::OK();
+}
+
+Status VersionSet::DumpManifest(Options& options, std::string& dscname,
+ bool verbose, bool hex, bool json) {
+ // Open the specified manifest file.
+ std::unique_ptr<SequentialFileReader> file_reader;
+ Status s;
+ {
+ std::unique_ptr<FSSequentialFile> file;
+ s = options.file_system->NewSequentialFile(
+ dscname,
+ options.file_system->OptimizeForManifestRead(file_options_), &file,
+ nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ file_reader.reset(new SequentialFileReader(
+ std::move(file), dscname, db_options_->log_readahead_size));
+ }
+
+ bool have_prev_log_number = false;
+ bool have_next_file = false;
+ bool have_last_sequence = false;
+ uint64_t next_file = 0;
+ uint64_t last_sequence = 0;
+ uint64_t previous_log_number = 0;
+ int count = 0;
+ std::unordered_map<uint32_t, std::string> comparators;
+ std::unordered_map<uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>
+ builders;
+
+ // add default column family
+ VersionEdit default_cf_edit;
+ default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName);
+ default_cf_edit.SetColumnFamily(0);
+ ColumnFamilyData* default_cfd =
+ CreateColumnFamily(ColumnFamilyOptions(options), &default_cf_edit);
+ builders.insert(
+ std::make_pair(0, std::unique_ptr<BaseReferencedVersionBuilder>(
+ new BaseReferencedVersionBuilder(default_cfd))));
+
+ {
+ VersionSet::LogReporter reporter;
+ reporter.status = &s;
+ log::Reader reader(nullptr, std::move(file_reader), &reporter,
+ true /* checksum */, 0 /* log_number */);
+ Slice record;
+ std::string scratch;
+ while (reader.ReadRecord(&record, &scratch) && s.ok()) {
+ VersionEdit edit;
+ s = edit.DecodeFrom(record);
+ if (!s.ok()) {
+ break;
+ }
+
+ // Write out each individual edit
+ if (verbose && !json) {
+ printf("%s\n", edit.DebugString(hex).c_str());
+ } else if (json) {
+ printf("%s\n", edit.DebugJSON(count, hex).c_str());
+ }
+ count++;
+
+ bool cf_in_builders =
+ builders.find(edit.column_family_) != builders.end();
+
+ if (edit.has_comparator_) {
+ comparators.insert({edit.column_family_, edit.comparator_});
+ }
+
+ ColumnFamilyData* cfd = nullptr;
+
+ if (edit.is_column_family_add_) {
+ if (cf_in_builders) {
+ s = Status::Corruption(
+ "Manifest adding the same column family twice");
+ break;
+ }
+ cfd = CreateColumnFamily(ColumnFamilyOptions(options), &edit);
+ cfd->set_initialized();
+ builders.insert(std::make_pair(
+ edit.column_family_, std::unique_ptr<BaseReferencedVersionBuilder>(
+ new BaseReferencedVersionBuilder(cfd))));
+ } else if (edit.is_column_family_drop_) {
+ if (!cf_in_builders) {
+ s = Status::Corruption(
+ "Manifest - dropping non-existing column family");
+ break;
+ }
+ auto builder_iter = builders.find(edit.column_family_);
+ builders.erase(builder_iter);
+ comparators.erase(edit.column_family_);
+ cfd = column_family_set_->GetColumnFamily(edit.column_family_);
+ assert(cfd != nullptr);
+ cfd->UnrefAndTryDelete();
+ cfd = nullptr;
+ } else {
+ if (!cf_in_builders) {
+ s = Status::Corruption(
+ "Manifest record referencing unknown column family");
+ break;
+ }
+
+ cfd = column_family_set_->GetColumnFamily(edit.column_family_);
+ // this should never happen since cf_in_builders is true
+ assert(cfd != nullptr);
+
+ // if it is not column family add or column family drop,
+ // then it's a file add/delete, which should be forwarded
+ // to builder
+ auto builder = builders.find(edit.column_family_);
+ assert(builder != builders.end());
+ s = builder->second->version_builder()->Apply(&edit);
+ if (!s.ok()) {
+ break;
+ }
+ }
+
+ if (cfd != nullptr && edit.has_log_number_) {
+ cfd->SetLogNumber(edit.log_number_);
+ }
+
+ if (edit.has_prev_log_number_) {
+ previous_log_number = edit.prev_log_number_;
+ have_prev_log_number = true;
+ }
+
+ if (edit.has_next_file_number_) {
+ next_file = edit.next_file_number_;
+ have_next_file = true;
+ }
+
+ if (edit.has_last_sequence_) {
+ last_sequence = edit.last_sequence_;
+ have_last_sequence = true;
+ }
+
+ if (edit.has_max_column_family_) {
+ column_family_set_->UpdateMaxColumnFamily(edit.max_column_family_);
+ }
+
+ if (edit.has_min_log_number_to_keep_) {
+ MarkMinLogNumberToKeep2PC(edit.min_log_number_to_keep_);
+ }
+ }
+ }
+ file_reader.reset();
+
+ if (s.ok()) {
+ if (!have_next_file) {
+ s = Status::Corruption("no meta-nextfile entry in descriptor");
+ printf("no meta-nextfile entry in descriptor");
+ } else if (!have_last_sequence) {
+ printf("no last-sequence-number entry in descriptor");
+ s = Status::Corruption("no last-sequence-number entry in descriptor");
+ }
+
+ if (!have_prev_log_number) {
+ previous_log_number = 0;
+ }
+ }
+
+ if (s.ok()) {
+ for (auto cfd : *column_family_set_) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ auto builders_iter = builders.find(cfd->GetID());
+ assert(builders_iter != builders.end());
+ auto builder = builders_iter->second->version_builder();
+
+ Version* v = new Version(cfd, this, file_options_,
+ *cfd->GetLatestMutableCFOptions(),
+ current_version_number_++);
+ builder->SaveTo(v->storage_info());
+ v->PrepareApply(*cfd->GetLatestMutableCFOptions(), false);
+
+ printf("--------------- Column family \"%s\" (ID %" PRIu32
+ ") --------------\n",
+ cfd->GetName().c_str(), cfd->GetID());
+ printf("log number: %" PRIu64 "\n", cfd->GetLogNumber());
+ auto comparator = comparators.find(cfd->GetID());
+ if (comparator != comparators.end()) {
+ printf("comparator: %s\n", comparator->second.c_str());
+ } else {
+ printf("comparator: <NO COMPARATOR>\n");
+ }
+ printf("%s \n", v->DebugString(hex).c_str());
+ delete v;
+ }
+
+ next_file_number_.store(next_file + 1);
+ last_allocated_sequence_ = last_sequence;
+ last_published_sequence_ = last_sequence;
+ last_sequence_ = last_sequence;
+ prev_log_number_ = previous_log_number;
+
+ printf("next_file_number %" PRIu64 " last_sequence %" PRIu64
+ " prev_log_number %" PRIu64 " max_column_family %" PRIu32
+ " min_log_number_to_keep "
+ "%" PRIu64 "\n",
+ next_file_number_.load(), last_sequence, previous_log_number,
+ column_family_set_->GetMaxColumnFamily(),
+ min_log_number_to_keep_2pc());
+ }
+
+ return s;
+}
+#endif // ROCKSDB_LITE
+
+void VersionSet::MarkFileNumberUsed(uint64_t number) {
+ // only called during recovery and repair which are single threaded, so this
+ // works because there can't be concurrent calls
+ if (next_file_number_.load(std::memory_order_relaxed) <= number) {
+ next_file_number_.store(number + 1, std::memory_order_relaxed);
+ }
+}
+// Called only from ::LogAndApply, which is protected by a mutex, or during
+// recovery, which is single-threaded.
+void VersionSet::MarkMinLogNumberToKeep2PC(uint64_t number) {
+ if (min_log_number_to_keep_2pc_.load(std::memory_order_relaxed) < number) {
+ min_log_number_to_keep_2pc_.store(number, std::memory_order_relaxed);
+ }
+}
+
+Status VersionSet::WriteCurrentStateToManifest(
+ const std::unordered_map<uint32_t, MutableCFState>& curr_state,
+ log::Writer* log) {
+ // TODO: Break up into multiple records to reduce memory usage on recovery?
+
+ // WARNING: This method doesn't hold a mutex!!
+
+ // This is done without DB mutex lock held, but only within single-threaded
+ // LogAndApply. Column family manipulations can only happen within LogAndApply
+ // (the same single thread), so we're safe to iterate.
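+  //
+  // Record layout written below (per the code that follows): an optional
+  // VersionEdit carrying the DB id, then, for every live column family, one
+  // edit naming the family (omitted for the default family) and its
+  // comparator, followed by one edit listing all of its live files and its
+  // current log number.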
+
+ if (db_options_->write_dbid_to_manifest) {
+ VersionEdit edit_for_db_id;
+ assert(!db_id_.empty());
+ edit_for_db_id.SetDBId(db_id_);
+ std::string db_id_record;
+ if (!edit_for_db_id.EncodeTo(&db_id_record)) {
+ return Status::Corruption("Unable to Encode VersionEdit:" +
+ edit_for_db_id.DebugString(true));
+ }
+ Status add_record = log->AddRecord(db_id_record);
+ if (!add_record.ok()) {
+ return add_record;
+ }
+ }
+
+ for (auto cfd : *column_family_set_) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ assert(cfd->initialized());
+ {
+ // Store column family info
+ VersionEdit edit;
+ if (cfd->GetID() != 0) {
+ // default column family is always there,
+ // no need to explicitly write it
+ edit.AddColumnFamily(cfd->GetName());
+ edit.SetColumnFamily(cfd->GetID());
+ }
+ edit.SetComparatorName(
+ cfd->internal_comparator().user_comparator()->Name());
+ std::string record;
+ if (!edit.EncodeTo(&record)) {
+ return Status::Corruption(
+ "Unable to Encode VersionEdit:" + edit.DebugString(true));
+ }
+ Status s = log->AddRecord(record);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ {
+ // Save files
+ VersionEdit edit;
+ edit.SetColumnFamily(cfd->GetID());
+
+ for (int level = 0; level < cfd->NumberLevels(); level++) {
+ for (const auto& f :
+ cfd->current()->storage_info()->LevelFiles(level)) {
+ edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(),
+ f->fd.GetFileSize(), f->smallest, f->largest,
+ f->fd.smallest_seqno, f->fd.largest_seqno,
+ f->marked_for_compaction, f->oldest_blob_file_number,
+ f->oldest_ancester_time, f->file_creation_time,
+ f->file_checksum, f->file_checksum_func_name);
+ }
+ }
+ const auto iter = curr_state.find(cfd->GetID());
+ assert(iter != curr_state.end());
+ uint64_t log_number = iter->second.log_number;
+ edit.SetLogNumber(log_number);
+ std::string record;
+ if (!edit.EncodeTo(&record)) {
+ return Status::Corruption(
+ "Unable to Encode VersionEdit:" + edit.DebugString(true));
+ }
+ Status s = log->AddRecord(record);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ }
+ return Status::OK();
+}
+
+// TODO(aekmekji): in CompactionJob::GenSubcompactionBoundaries(), this
+// function is called repeatedly with consecutive pairs of slices. For example
+// if the slice list is [a, b, c, d] this function is called with arguments
+// (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible where
+// we avoid doing binary search for the keys b and c twice and instead somehow
+// maintain state of where they first appear in the files.
+uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
+ Version* v, const Slice& start,
+ const Slice& end, int start_level,
+ int end_level, TableReaderCaller caller) {
+ const auto& icmp = v->cfd_->internal_comparator();
+
+ // pre-condition
+ assert(icmp.Compare(start, end) <= 0);
+
+ uint64_t total_full_size = 0;
+ const auto* vstorage = v->storage_info();
+ const int num_non_empty_levels = vstorage->num_non_empty_levels();
+ end_level = (end_level == -1) ? num_non_empty_levels
+ : std::min(end_level, num_non_empty_levels);
+
+ assert(start_level <= end_level);
+
+  // Outline of the optimization that uses options.files_size_error_margin.
+  // When approximating the total size of the files used to store a key range,
+  // we first sum up the sizes of the files that fully fall into the range.
+  // Then we sum up the sizes of all the files that may intersect with the range
+  // (this includes all files in L0 as well). Then, if total_intersecting_size
+  // is smaller than total_full_size * options.files_size_error_margin, we can
+  // infer that the intersecting files have a sufficiently negligible
+  // contribution to the total size, and we can approximate the storage required
+  // for the keys in the range as just half of the intersecting_files_size.
+  // E.g., if the value of files_size_error_margin is 0.1, then the error of the
+  // approximation is limited to only ~10% of the total size of the files that
+  // fully fall into the key range. In such a case, this helps to avoid a costly
+  // process of binary searching the intersecting files, which is required only
+  // for a more precise calculation of the total size.
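+  //
+  // Illustrative (hypothetical) numbers: with total_full_size = 900 MB,
+  // total_intersecting_size = 60 MB and files_size_error_margin = 0.1, we have
+  // 60 < 900 * 0.1, so the code below simply adds 60 / 2 = 30 MB instead of
+  // binary searching inside the boundary files.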
+
+ autovector<FdWithKeyRange*, 32> first_files;
+ autovector<FdWithKeyRange*, 16> last_files;
+
+ // scan all the levels
+ for (int level = start_level; level < end_level; ++level) {
+ const LevelFilesBrief& files_brief = vstorage->LevelFilesBrief(level);
+ if (files_brief.num_files == 0) {
+ // empty level, skip exploration
+ continue;
+ }
+
+ if (level == 0) {
+      // Level-0 files are not in sorted order, so we need to iterate through
+      // the list to compute the total bytes that require scanning; handle
+      // this case explicitly (similarly to the first_files case).
+ for (size_t i = 0; i < files_brief.num_files; i++) {
+ first_files.push_back(&files_brief.files[i]);
+ }
+ continue;
+ }
+
+ assert(level > 0);
+ assert(files_brief.num_files > 0);
+
+ // identify the file position for start key
+ const int idx_start =
+ FindFileInRange(icmp, files_brief, start, 0,
+ static_cast<uint32_t>(files_brief.num_files - 1));
+ assert(static_cast<size_t>(idx_start) < files_brief.num_files);
+
+ // identify the file position for end key
+ int idx_end = idx_start;
+ if (icmp.Compare(files_brief.files[idx_end].largest_key, end) < 0) {
+ idx_end =
+ FindFileInRange(icmp, files_brief, end, idx_start,
+ static_cast<uint32_t>(files_brief.num_files - 1));
+ }
+ assert(idx_end >= idx_start &&
+ static_cast<size_t>(idx_end) < files_brief.num_files);
+
+ // scan all files from the starting index to the ending index
+ // (inferred from the sorted order)
+
+ // first scan all the intermediate full files (excluding first and last)
+ for (int i = idx_start + 1; i < idx_end; ++i) {
+ uint64_t file_size = files_brief.files[i].fd.GetFileSize();
+ // The entire file falls into the range, so we can just take its size.
+ assert(file_size ==
+ ApproximateSize(v, files_brief.files[i], start, end, caller));
+ total_full_size += file_size;
+ }
+
+ // save the first and the last files (which may be the same file), so we
+ // can scan them later.
+ first_files.push_back(&files_brief.files[idx_start]);
+ if (idx_start != idx_end) {
+ // we need to estimate size for both files, only if they are different
+ last_files.push_back(&files_brief.files[idx_end]);
+ }
+ }
+
+ // The sum of all file sizes that intersect the [start, end] keys range.
+ uint64_t total_intersecting_size = 0;
+ for (const auto* file_ptr : first_files) {
+ total_intersecting_size += file_ptr->fd.GetFileSize();
+ }
+ for (const auto* file_ptr : last_files) {
+ total_intersecting_size += file_ptr->fd.GetFileSize();
+ }
+
+ // Now scan all the first & last files at each level, and estimate their size.
+  // If total_intersecting_size is less than X% of total_full_size, we want to
+  // approximate the result in order to avoid the costly binary search inside
+  // ApproximateSize. We use half of the file size as the approximation below.
+
+ const double margin = options.files_size_error_margin;
+ if (margin > 0 && total_intersecting_size <
+ static_cast<uint64_t>(total_full_size * margin)) {
+ total_full_size += total_intersecting_size / 2;
+ } else {
+ // Estimate for all the first files, at each level
+ for (const auto file_ptr : first_files) {
+ total_full_size += ApproximateSize(v, *file_ptr, start, end, caller);
+ }
+
+ // Estimate for all the last files, at each level
+ for (const auto file_ptr : last_files) {
+ // We could use ApproximateSize here, but calling ApproximateOffsetOf
+ // directly is just more efficient.
+ total_full_size += ApproximateOffsetOf(v, *file_ptr, end, caller);
+ }
+ }
+
+ return total_full_size;
+}
+
+uint64_t VersionSet::ApproximateOffsetOf(Version* v, const FdWithKeyRange& f,
+ const Slice& key,
+ TableReaderCaller caller) {
+ // pre-condition
+ assert(v);
+ const auto& icmp = v->cfd_->internal_comparator();
+
+ uint64_t result = 0;
+ if (icmp.Compare(f.largest_key, key) <= 0) {
+ // Entire file is before "key", so just add the file size
+ result = f.fd.GetFileSize();
+ } else if (icmp.Compare(f.smallest_key, key) > 0) {
+ // Entire file is after "key", so ignore
+ result = 0;
+ } else {
+ // "key" falls in the range for this table. Add the
+ // approximate offset of "key" within the table.
+ TableCache* table_cache = v->cfd_->table_cache();
+ if (table_cache != nullptr) {
+ result = table_cache->ApproximateOffsetOf(
+ key, f.file_metadata->fd, caller, icmp,
+ v->GetMutableCFOptions().prefix_extractor.get());
+ }
+ }
+ return result;
+}
+
+uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
+ const Slice& start, const Slice& end,
+ TableReaderCaller caller) {
+ // pre-condition
+ assert(v);
+ const auto& icmp = v->cfd_->internal_comparator();
+ assert(icmp.Compare(start, end) <= 0);
+
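+  // Case analysis performed below: a file entirely outside [start, end]
+  // contributes 0; a file that begins at or after "start" contributes the
+  // approximate offset of "end" within it (its whole size if it also ends
+  // before "end"); a file that ends before "end" but begins before "start"
+  // contributes its size minus the offset of "start"; otherwise [start, end]
+  // lies entirely inside the file and the table's ApproximateSize() is used.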
+ if (icmp.Compare(f.largest_key, start) <= 0 ||
+ icmp.Compare(f.smallest_key, end) > 0) {
+ // Entire file is before or after the start/end keys range
+ return 0;
+ }
+
+ if (icmp.Compare(f.smallest_key, start) >= 0) {
+ // Start of the range is before the file start - approximate by end offset
+ return ApproximateOffsetOf(v, f, end, caller);
+ }
+
+ if (icmp.Compare(f.largest_key, end) < 0) {
+ // End of the range is after the file end - approximate by subtracting
+ // start offset from the file size
+ uint64_t start_offset = ApproximateOffsetOf(v, f, start, caller);
+ assert(f.fd.GetFileSize() >= start_offset);
+ return f.fd.GetFileSize() - start_offset;
+ }
+
+ // The interval falls entirely in the range for this file.
+ TableCache* table_cache = v->cfd_->table_cache();
+ if (table_cache == nullptr) {
+ return 0;
+ }
+ return table_cache->ApproximateSize(
+ start, end, f.file_metadata->fd, caller, icmp,
+ v->GetMutableCFOptions().prefix_extractor.get());
+}
+
+void VersionSet::AddLiveFiles(std::vector<FileDescriptor>* live_list) {
+ // pre-calculate space requirement
+ int64_t total_files = 0;
+ for (auto cfd : *column_family_set_) {
+ if (!cfd->initialized()) {
+ continue;
+ }
+ Version* dummy_versions = cfd->dummy_versions();
+ for (Version* v = dummy_versions->next_; v != dummy_versions;
+ v = v->next_) {
+ const auto* vstorage = v->storage_info();
+ for (int level = 0; level < vstorage->num_levels(); level++) {
+ total_files += vstorage->LevelFiles(level).size();
+ }
+ }
+ }
+
+ // just one time extension to the right size
+ live_list->reserve(live_list->size() + static_cast<size_t>(total_files));
+
+ for (auto cfd : *column_family_set_) {
+ if (!cfd->initialized()) {
+ continue;
+ }
+ auto* current = cfd->current();
+ bool found_current = false;
+ Version* dummy_versions = cfd->dummy_versions();
+ for (Version* v = dummy_versions->next_; v != dummy_versions;
+ v = v->next_) {
+ v->AddLiveFiles(live_list);
+ if (v == current) {
+ found_current = true;
+ }
+ }
+ if (!found_current && current != nullptr) {
+ // Should never happen unless it is a bug.
+ assert(false);
+ current->AddLiveFiles(live_list);
+ }
+ }
+}
+
+InternalIterator* VersionSet::MakeInputIterator(
+ const Compaction* c, RangeDelAggregator* range_del_agg,
+ const FileOptions& file_options_compactions) {
+ auto cfd = c->column_family_data();
+ ReadOptions read_options;
+ read_options.verify_checksums = true;
+ read_options.fill_cache = false;
+ // Compaction iterators shouldn't be confined to a single prefix.
+ // Compactions use Seek() for
+ // (a) concurrent compactions,
+ // (b) CompactionFilter::Decision::kRemoveAndSkipUntil.
+ read_options.total_order_seek = true;
+
+ // Level-0 files have to be merged together. For other levels,
+ // we will make a concatenating iterator per level.
+ // TODO(opt): use concatenating iterator for level-0 if there is no overlap
+ const size_t space = (c->level() == 0 ? c->input_levels(0)->num_files +
+ c->num_input_levels() - 1
+ : c->num_input_levels());
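+  // Illustrative sizing (hypothetical numbers): an L0 -> L1 compaction with
+  // four L0 files needs 4 table iterators for L0 plus one concatenating
+  // iterator for L1, i.e. space = 4 + 2 - 1 = 5.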
+  InternalIterator** list = new InternalIterator*[space];
+ size_t num = 0;
+ for (size_t which = 0; which < c->num_input_levels(); which++) {
+ if (c->input_levels(which)->num_files != 0) {
+ if (c->level(which) == 0) {
+ const LevelFilesBrief* flevel = c->input_levels(which);
+ for (size_t i = 0; i < flevel->num_files; i++) {
+ list[num++] = cfd->table_cache()->NewIterator(
+ read_options, file_options_compactions,
+ cfd->internal_comparator(),
+ *flevel->files[i].file_metadata, range_del_agg,
+ c->mutable_cf_options()->prefix_extractor.get(),
+ /*table_reader_ptr=*/nullptr,
+ /*file_read_hist=*/nullptr, TableReaderCaller::kCompaction,
+ /*arena=*/nullptr,
+ /*skip_filters=*/false, /*level=*/static_cast<int>(which),
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr);
+ }
+ } else {
+ // Create concatenating iterator for the files from this level
+ list[num++] = new LevelIterator(
+ cfd->table_cache(), read_options, file_options_compactions,
+ cfd->internal_comparator(), c->input_levels(which),
+ c->mutable_cf_options()->prefix_extractor.get(),
+ /*should_sample=*/false,
+ /*no per level latency histogram=*/nullptr,
+ TableReaderCaller::kCompaction, /*skip_filters=*/false,
+ /*level=*/static_cast<int>(which), range_del_agg,
+ c->boundaries(which));
+ }
+ }
+ }
+ assert(num <= space);
+ InternalIterator* result =
+ NewMergingIterator(&c->column_family_data()->internal_comparator(), list,
+ static_cast<int>(num));
+ delete[] list;
+ return result;
+}
+
+// verify that the files listed in this compaction are present
+// in the current version
+bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) {
+#ifndef NDEBUG
+ Version* version = c->column_family_data()->current();
+ const VersionStorageInfo* vstorage = version->storage_info();
+ if (c->input_version() != version) {
+ ROCKS_LOG_INFO(
+ db_options_->info_log,
+ "[%s] compaction output being applied to a different base version from"
+ " input version",
+ c->column_family_data()->GetName().c_str());
+
+ if (vstorage->compaction_style_ == kCompactionStyleLevel &&
+ c->start_level() == 0 && c->num_input_levels() > 2U) {
+      // We are doing an L0->base_level compaction. The assumption is that if
+      // the base level is not L1, then levels from L1 to base_level - 1 are
+      // empty. This is ensured by allowing only one L0 compaction at a time
+      // in level-based compaction, so during that time no other
+      // compaction/flush can put files into those levels.
+ for (int l = c->start_level() + 1; l < c->output_level(); l++) {
+ if (vstorage->NumLevelFiles(l) != 0) {
+ return false;
+ }
+ }
+ }
+ }
+
+ for (size_t input = 0; input < c->num_input_levels(); ++input) {
+ int level = c->level(input);
+ for (size_t i = 0; i < c->num_input_files(input); ++i) {
+ uint64_t number = c->input(input, i)->fd.GetNumber();
+ bool found = false;
+ for (size_t j = 0; j < vstorage->files_[level].size(); j++) {
+ FileMetaData* f = vstorage->files_[level][j];
+ if (f->fd.GetNumber() == number) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+        return false;  // input files not present in the current version
+ }
+ }
+ }
+#else
+ (void)c;
+#endif
+ return true; // everything good
+}
+
+Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel,
+ FileMetaData** meta,
+ ColumnFamilyData** cfd) {
+ for (auto cfd_iter : *column_family_set_) {
+ if (!cfd_iter->initialized()) {
+ continue;
+ }
+ Version* version = cfd_iter->current();
+ const auto* vstorage = version->storage_info();
+ for (int level = 0; level < vstorage->num_levels(); level++) {
+ for (const auto& file : vstorage->LevelFiles(level)) {
+ if (file->fd.GetNumber() == number) {
+ *meta = file;
+ *filelevel = level;
+ *cfd = cfd_iter;
+ return Status::OK();
+ }
+ }
+ }
+ }
+ return Status::NotFound("File not present in any level");
+}
+
+void VersionSet::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
+ for (auto cfd : *column_family_set_) {
+ if (cfd->IsDropped() || !cfd->initialized()) {
+ continue;
+ }
+ for (int level = 0; level < cfd->NumberLevels(); level++) {
+ for (const auto& file :
+ cfd->current()->storage_info()->LevelFiles(level)) {
+ LiveFileMetaData filemetadata;
+ filemetadata.column_family_name = cfd->GetName();
+ uint32_t path_id = file->fd.GetPathId();
+ if (path_id < cfd->ioptions()->cf_paths.size()) {
+ filemetadata.db_path = cfd->ioptions()->cf_paths[path_id].path;
+ } else {
+ assert(!cfd->ioptions()->cf_paths.empty());
+ filemetadata.db_path = cfd->ioptions()->cf_paths.back().path;
+ }
+ const uint64_t file_number = file->fd.GetNumber();
+ filemetadata.name = MakeTableFileName("", file_number);
+ filemetadata.file_number = file_number;
+ filemetadata.level = level;
+ filemetadata.size = static_cast<size_t>(file->fd.GetFileSize());
+ filemetadata.smallestkey = file->smallest.user_key().ToString();
+ filemetadata.largestkey = file->largest.user_key().ToString();
+ filemetadata.smallest_seqno = file->fd.smallest_seqno;
+ filemetadata.largest_seqno = file->fd.largest_seqno;
+ filemetadata.num_reads_sampled = file->stats.num_reads_sampled.load(
+ std::memory_order_relaxed);
+ filemetadata.being_compacted = file->being_compacted;
+ filemetadata.num_entries = file->num_entries;
+ filemetadata.num_deletions = file->num_deletions;
+ filemetadata.oldest_blob_file_number = file->oldest_blob_file_number;
+ filemetadata.file_checksum = file->file_checksum;
+ filemetadata.file_checksum_func_name = file->file_checksum_func_name;
+ metadata->push_back(filemetadata);
+ }
+ }
+ }
+}
+
+void VersionSet::GetObsoleteFiles(std::vector<ObsoleteFileInfo>* files,
+ std::vector<std::string>* manifest_filenames,
+ uint64_t min_pending_output) {
+ assert(manifest_filenames->empty());
+ obsolete_manifests_.swap(*manifest_filenames);
+ std::vector<ObsoleteFileInfo> pending_files;
+ for (auto& f : obsolete_files_) {
+ if (f.metadata->fd.GetNumber() < min_pending_output) {
+ files->push_back(std::move(f));
+ } else {
+ pending_files.push_back(std::move(f));
+ }
+ }
+ obsolete_files_.swap(pending_files);
+}
+
+ColumnFamilyData* VersionSet::CreateColumnFamily(
+ const ColumnFamilyOptions& cf_options, VersionEdit* edit) {
+ assert(edit->is_column_family_add_);
+
+ MutableCFOptions dummy_cf_options;
+ Version* dummy_versions =
+ new Version(nullptr, this, file_options_, dummy_cf_options);
+ // Ref() dummy version once so that later we can call Unref() to delete it
+ // by avoiding calling "delete" explicitly (~Version is private)
+ dummy_versions->Ref();
+ auto new_cfd = column_family_set_->CreateColumnFamily(
+ edit->column_family_name_, edit->column_family_, dummy_versions,
+ cf_options);
+
+ Version* v = new Version(new_cfd, this, file_options_,
+ *new_cfd->GetLatestMutableCFOptions(),
+ current_version_number_++);
+
+ // Fill level target base information.
+ v->storage_info()->CalculateBaseBytes(*new_cfd->ioptions(),
+ *new_cfd->GetLatestMutableCFOptions());
+ AppendVersion(new_cfd, v);
+ // GetLatestMutableCFOptions() is safe here without mutex since the
+ // cfd is not available to client
+ new_cfd->CreateNewMemtable(*new_cfd->GetLatestMutableCFOptions(),
+ LastSequence());
+ new_cfd->SetLogNumber(edit->log_number_);
+ return new_cfd;
+}
+
+uint64_t VersionSet::GetNumLiveVersions(Version* dummy_versions) {
+ uint64_t count = 0;
+ for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) {
+ count++;
+ }
+ return count;
+}
+
+uint64_t VersionSet::GetTotalSstFilesSize(Version* dummy_versions) {
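+  // An SST file may be referenced by more than one live Version, so files are
+  // de-duplicated by their packed (file number, path id) before their sizes
+  // are summed.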
+ std::unordered_set<uint64_t> unique_files;
+ uint64_t total_files_size = 0;
+ for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) {
+ VersionStorageInfo* storage_info = v->storage_info();
+ for (int level = 0; level < storage_info->num_levels_; level++) {
+ for (const auto& file_meta : storage_info->LevelFiles(level)) {
+ if (unique_files.find(file_meta->fd.packed_number_and_path_id) ==
+ unique_files.end()) {
+ unique_files.insert(file_meta->fd.packed_number_and_path_id);
+ total_files_size += file_meta->fd.GetFileSize();
+ }
+ }
+ }
+ }
+ return total_files_size;
+}
+
+ReactiveVersionSet::ReactiveVersionSet(const std::string& dbname,
+ const ImmutableDBOptions* _db_options,
+ const FileOptions& _file_options,
+ Cache* table_cache,
+ WriteBufferManager* write_buffer_manager,
+ WriteController* write_controller)
+ : VersionSet(dbname, _db_options, _file_options, table_cache,
+ write_buffer_manager, write_controller,
+ /*block_cache_tracer=*/nullptr),
+ number_of_edits_to_skip_(0) {}
+
+ReactiveVersionSet::~ReactiveVersionSet() {}
+
+Status ReactiveVersionSet::Recover(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
+ std::unique_ptr<log::Reader::Reporter>* manifest_reporter,
+ std::unique_ptr<Status>* manifest_reader_status) {
+ assert(manifest_reader != nullptr);
+ assert(manifest_reporter != nullptr);
+ assert(manifest_reader_status != nullptr);
+
+ std::unordered_map<std::string, ColumnFamilyOptions> cf_name_to_options;
+ for (const auto& cf : column_families) {
+ cf_name_to_options.insert({cf.name, cf.options});
+ }
+
+ // add default column family
+ auto default_cf_iter = cf_name_to_options.find(kDefaultColumnFamilyName);
+ if (default_cf_iter == cf_name_to_options.end()) {
+ return Status::InvalidArgument("Default column family not specified");
+ }
+ VersionEdit default_cf_edit;
+ default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName);
+ default_cf_edit.SetColumnFamily(0);
+ ColumnFamilyData* default_cfd =
+ CreateColumnFamily(default_cf_iter->second, &default_cf_edit);
+ // In recovery, nobody else can access it, so it's fine to set it to be
+ // initialized earlier.
+ default_cfd->set_initialized();
+ std::unordered_map<uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>
+ builders;
+ std::unordered_map<int, std::string> column_families_not_found;
+ builders.insert(
+ std::make_pair(0, std::unique_ptr<BaseReferencedVersionBuilder>(
+ new BaseReferencedVersionBuilder(default_cfd))));
+
+ manifest_reader_status->reset(new Status());
+ manifest_reporter->reset(new LogReporter());
+ static_cast<LogReporter*>(manifest_reporter->get())->status =
+ manifest_reader_status->get();
+ Status s = MaybeSwitchManifest(manifest_reporter->get(), manifest_reader);
+ log::Reader* reader = manifest_reader->get();
+
+ int retry = 0;
+ VersionEdit version_edit;
+ while (s.ok() && retry < 1) {
+ assert(reader != nullptr);
+ Slice record;
+ std::string scratch;
+ s = ReadAndRecover(reader, &read_buffer_, cf_name_to_options,
+ column_families_not_found, builders, &version_edit);
+ if (s.ok()) {
+ bool enough = version_edit.has_next_file_number_ &&
+ version_edit.has_log_number_ &&
+ version_edit.has_last_sequence_;
+ if (enough) {
+ for (const auto& cf : column_families) {
+ auto cfd = column_family_set_->GetColumnFamily(cf.name);
+ if (cfd == nullptr) {
+ enough = false;
+ break;
+ }
+ }
+ }
+ if (enough) {
+ for (const auto& cf : column_families) {
+ auto cfd = column_family_set_->GetColumnFamily(cf.name);
+ assert(cfd != nullptr);
+ if (!cfd->IsDropped()) {
+ auto builder_iter = builders.find(cfd->GetID());
+ assert(builder_iter != builders.end());
+ auto builder = builder_iter->second->version_builder();
+ assert(builder != nullptr);
+ s = builder->LoadTableHandlers(
+ cfd->internal_stats(), db_options_->max_file_opening_threads,
+ false /* prefetch_index_and_filter_in_cache */,
+ true /* is_initial_load */,
+ cfd->GetLatestMutableCFOptions()->prefix_extractor.get());
+ if (!s.ok()) {
+ enough = false;
+ if (s.IsPathNotFound()) {
+ s = Status::OK();
+ }
+ break;
+ }
+ }
+ }
+ }
+ if (enough) {
+ break;
+ }
+ }
+ ++retry;
+ }
+
+ if (s.ok()) {
+ if (!version_edit.has_prev_log_number_) {
+ version_edit.prev_log_number_ = 0;
+ }
+ column_family_set_->UpdateMaxColumnFamily(version_edit.max_column_family_);
+
+ MarkMinLogNumberToKeep2PC(version_edit.min_log_number_to_keep_);
+ MarkFileNumberUsed(version_edit.prev_log_number_);
+ MarkFileNumberUsed(version_edit.log_number_);
+
+ for (auto cfd : *column_family_set_) {
+ assert(builders.count(cfd->GetID()) > 0);
+ auto builder = builders[cfd->GetID()]->version_builder();
+ if (!builder->CheckConsistencyForNumLevels()) {
+ s = Status::InvalidArgument(
+ "db has more levels than options.num_levels");
+ break;
+ }
+ }
+ }
+
+ if (s.ok()) {
+ for (auto cfd : *column_family_set_) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ assert(cfd->initialized());
+ auto builders_iter = builders.find(cfd->GetID());
+ assert(builders_iter != builders.end());
+ auto* builder = builders_iter->second->version_builder();
+
+ Version* v = new Version(cfd, this, file_options_,
+ *cfd->GetLatestMutableCFOptions(),
+ current_version_number_++);
+ builder->SaveTo(v->storage_info());
+
+ // Install recovered version
+ v->PrepareApply(*cfd->GetLatestMutableCFOptions(),
+ !(db_options_->skip_stats_update_on_db_open));
+ AppendVersion(cfd, v);
+ }
+ next_file_number_.store(version_edit.next_file_number_ + 1);
+ last_allocated_sequence_ = version_edit.last_sequence_;
+ last_published_sequence_ = version_edit.last_sequence_;
+ last_sequence_ = version_edit.last_sequence_;
+ prev_log_number_ = version_edit.prev_log_number_;
+ for (auto cfd : *column_family_set_) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ ROCKS_LOG_INFO(db_options_->info_log,
+ "Column family [%s] (ID %u), log number is %" PRIu64 "\n",
+ cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber());
+ }
+ }
+ return s;
+}
+
+Status ReactiveVersionSet::ReadAndApply(
+ InstrumentedMutex* mu,
+ std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
+ std::unordered_set<ColumnFamilyData*>* cfds_changed) {
+ assert(manifest_reader != nullptr);
+ assert(cfds_changed != nullptr);
+ mu->AssertHeld();
+
+ Status s;
+ uint64_t applied_edits = 0;
+ while (s.ok()) {
+ Slice record;
+ std::string scratch;
+ log::Reader* reader = manifest_reader->get();
+ std::string old_manifest_path = reader->file()->file_name();
+ while (reader->ReadRecord(&record, &scratch)) {
+ VersionEdit edit;
+ s = edit.DecodeFrom(record);
+ if (!s.ok()) {
+ break;
+ }
+
+      // Skip the first VersionEdits of each MANIFEST generated by
+      // VersionSet::WriteCurrentStateToManifest.
+ if (number_of_edits_to_skip_ > 0) {
+ ColumnFamilyData* cfd =
+ column_family_set_->GetColumnFamily(edit.column_family_);
+ if (cfd != nullptr && !cfd->IsDropped()) {
+ --number_of_edits_to_skip_;
+ }
+ continue;
+ }
+
+ s = read_buffer_.AddEdit(&edit);
+ if (!s.ok()) {
+ break;
+ }
+ VersionEdit temp_edit;
+ if (edit.is_in_atomic_group_) {
+ if (read_buffer_.IsFull()) {
+ // Apply edits in an atomic group when we have read all edits in the
+ // group.
+ for (auto& e : read_buffer_.replay_buffer()) {
+ s = ApplyOneVersionEditToBuilder(e, cfds_changed, &temp_edit);
+ if (!s.ok()) {
+ break;
+ }
+ applied_edits++;
+ }
+ if (!s.ok()) {
+ break;
+ }
+ read_buffer_.Clear();
+ }
+ } else {
+ // Apply a normal edit immediately.
+ s = ApplyOneVersionEditToBuilder(edit, cfds_changed, &temp_edit);
+ if (s.ok()) {
+ applied_edits++;
+ }
+ }
+ }
+ if (!s.ok()) {
+ // Clear the buffer if we fail to decode/apply an edit.
+ read_buffer_.Clear();
+ }
+ // It's possible that:
+ // 1) s.IsCorruption(), indicating the current MANIFEST is corrupted.
+ // 2) we have finished reading the current MANIFEST.
+ // 3) we have encountered an IOError reading the current MANIFEST.
+ // We need to look for the next MANIFEST and start from there. If we cannot
+ // find the next MANIFEST, we should exit the loop.
+ s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader);
+ reader = manifest_reader->get();
+ if (s.ok()) {
+ if (reader->file()->file_name() == old_manifest_path) {
+ // Still processing the same MANIFEST, thus no need to continue this
+ // loop since no record is available if we have reached here.
+ break;
+ } else {
+ // We have switched to a new MANIFEST whose first records have been
+        // generated by VersionSet::WriteCurrentStateToManifest. Since the
+ // secondary instance has already finished recovering upon start, there
+ // is no need for the secondary to process these records. Actually, if
+ // the secondary were to replay these records, the secondary may end up
+ // adding the same SST files AGAIN to each column family, causing
+ // consistency checks done by VersionBuilder to fail. Therefore, we
+ // record the number of records to skip at the beginning of the new
+ // MANIFEST and ignore them.
+ number_of_edits_to_skip_ = 0;
+ for (auto* cfd : *column_family_set_) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ // Increase number_of_edits_to_skip by 2 because
+          // WriteCurrentStateToManifest() writes 2 version edits for each
+ // column family at the beginning of the newly-generated MANIFEST.
+ // TODO(yanqin) remove hard-coded value.
+ if (db_options_->write_dbid_to_manifest) {
+ number_of_edits_to_skip_ += 3;
+ } else {
+ number_of_edits_to_skip_ += 2;
+ }
+ }
+ }
+ }
+ }
+
+ if (s.ok()) {
+ for (auto cfd : *column_family_set_) {
+ auto builder_iter = active_version_builders_.find(cfd->GetID());
+ if (builder_iter == active_version_builders_.end()) {
+ continue;
+ }
+ auto builder = builder_iter->second->version_builder();
+ if (!builder->CheckConsistencyForNumLevels()) {
+ s = Status::InvalidArgument(
+ "db has more levels than options.num_levels");
+ break;
+ }
+ }
+ }
+ TEST_SYNC_POINT_CALLBACK("ReactiveVersionSet::ReadAndApply:AppliedEdits",
+ &applied_edits);
+ return s;
+}
+
+Status ReactiveVersionSet::ApplyOneVersionEditToBuilder(
+ VersionEdit& edit, std::unordered_set<ColumnFamilyData*>* cfds_changed,
+ VersionEdit* version_edit) {
+ ColumnFamilyData* cfd =
+ column_family_set_->GetColumnFamily(edit.column_family_);
+
+ // If we cannot find this column family in our column family set, then it
+ // may be a new column family created by the primary after the secondary
+ // starts. It is also possible that the secondary instance opens only a subset
+ // of column families. Ignore it for now.
+ if (nullptr == cfd) {
+ return Status::OK();
+ }
+ if (active_version_builders_.find(edit.column_family_) ==
+ active_version_builders_.end() &&
+ !cfd->IsDropped()) {
+ std::unique_ptr<BaseReferencedVersionBuilder> builder_guard(
+ new BaseReferencedVersionBuilder(cfd));
+ active_version_builders_.insert(
+ std::make_pair(edit.column_family_, std::move(builder_guard)));
+ }
+
+ auto builder_iter = active_version_builders_.find(edit.column_family_);
+ assert(builder_iter != active_version_builders_.end());
+ auto builder = builder_iter->second->version_builder();
+ assert(builder != nullptr);
+
+ if (edit.is_column_family_add_) {
+ // TODO (yanqin) for now the secondary ignores column families created
+ // after Open. This also simplifies handling of switching to a new MANIFEST
+ // and processing the snapshot of the system at the beginning of the
+ // MANIFEST.
+ } else if (edit.is_column_family_drop_) {
+ // Drop the column family by setting it to be 'dropped' without destroying
+ // the column family handle.
+      // TODO (haoyu) figure out how to handle column family drop for
+ // secondary instance. (Is it possible that the ref count for cfd is 0 but
+ // the ref count for its versions is higher than 0?)
+ cfd->SetDropped();
+ if (cfd->UnrefAndTryDelete()) {
+ cfd = nullptr;
+ }
+ active_version_builders_.erase(builder_iter);
+ } else {
+ Status s = builder->Apply(&edit);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ Status s = ExtractInfoFromVersionEdit(cfd, edit, version_edit);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (cfd != nullptr && !cfd->IsDropped()) {
+ s = builder->LoadTableHandlers(
+ cfd->internal_stats(), db_options_->max_file_opening_threads,
+ false /* prefetch_index_and_filter_in_cache */,
+ false /* is_initial_load */,
+ cfd->GetLatestMutableCFOptions()->prefix_extractor.get());
+ TEST_SYNC_POINT_CALLBACK(
+ "ReactiveVersionSet::ApplyOneVersionEditToBuilder:"
+ "AfterLoadTableHandlers",
+ &s);
+
+ if (s.ok()) {
+ auto version = new Version(cfd, this, file_options_,
+ *cfd->GetLatestMutableCFOptions(),
+ current_version_number_++);
+ builder->SaveTo(version->storage_info());
+ version->PrepareApply(*cfd->GetLatestMutableCFOptions(), true);
+ AppendVersion(cfd, version);
+ active_version_builders_.erase(builder_iter);
+ if (cfds_changed->count(cfd) == 0) {
+ cfds_changed->insert(cfd);
+ }
+ } else if (s.IsPathNotFound()) {
+ s = Status::OK();
+ }
+    // Any other error from LoadTableHandlers stays in s and is returned at
+    // the end of this function.
+ }
+
+ if (version_edit->HasNextFile()) {
+ next_file_number_.store(version_edit->next_file_number_ + 1);
+ }
+ if (version_edit->has_last_sequence_) {
+ last_allocated_sequence_ = version_edit->last_sequence_;
+ last_published_sequence_ = version_edit->last_sequence_;
+ last_sequence_ = version_edit->last_sequence_;
+ }
+ if (version_edit->has_prev_log_number_) {
+ prev_log_number_ = version_edit->prev_log_number_;
+ MarkFileNumberUsed(version_edit->prev_log_number_);
+ }
+ if (version_edit->has_log_number_) {
+ MarkFileNumberUsed(version_edit->log_number_);
+ }
+ column_family_set_->UpdateMaxColumnFamily(version_edit->max_column_family_);
+ MarkMinLogNumberToKeep2PC(version_edit->min_log_number_to_keep_);
+ return s;
+}
+
+Status ReactiveVersionSet::MaybeSwitchManifest(
+ log::Reader::Reporter* reporter,
+ std::unique_ptr<log::FragmentBufferedReader>* manifest_reader) {
+ assert(manifest_reader != nullptr);
+ Status s;
+ do {
+ std::string manifest_path;
+ s = GetCurrentManifestPath(dbname_, fs_, &manifest_path,
+ &manifest_file_number_);
+ std::unique_ptr<FSSequentialFile> manifest_file;
+ if (s.ok()) {
+ if (nullptr == manifest_reader->get() ||
+ manifest_reader->get()->file()->file_name() != manifest_path) {
+ TEST_SYNC_POINT(
+ "ReactiveVersionSet::MaybeSwitchManifest:"
+ "AfterGetCurrentManifestPath:0");
+ TEST_SYNC_POINT(
+ "ReactiveVersionSet::MaybeSwitchManifest:"
+ "AfterGetCurrentManifestPath:1");
+ s = fs_->NewSequentialFile(manifest_path,
+ env_->OptimizeForManifestRead(file_options_),
+ &manifest_file, nullptr);
+ } else {
+ // No need to switch manifest.
+ break;
+ }
+ }
+ std::unique_ptr<SequentialFileReader> manifest_file_reader;
+ if (s.ok()) {
+ manifest_file_reader.reset(
+ new SequentialFileReader(std::move(manifest_file), manifest_path,
+ db_options_->log_readahead_size));
+ manifest_reader->reset(new log::FragmentBufferedReader(
+ nullptr, std::move(manifest_file_reader), reporter,
+ true /* checksum */, 0 /* log_number */));
+ ROCKS_LOG_INFO(db_options_->info_log, "Switched to new manifest: %s\n",
+ manifest_path.c_str());
+ // TODO (yanqin) every time we switch to a new MANIFEST, we clear the
+ // active_version_builders_ map because we choose to construct the
+ // versions from scratch, thanks to the first part of each MANIFEST
+      // written by VersionSet::WriteCurrentStateToManifest. This is not
+ // necessary, but we choose this at present for the sake of simplicity.
+ active_version_builders_.clear();
+ }
+ } while (s.IsPathNotFound());
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_set.h b/src/rocksdb/db/version_set.h
new file mode 100644
index 000000000..2ab09a5f8
--- /dev/null
+++ b/src/rocksdb/db/version_set.h
@@ -0,0 +1,1251 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// The representation of a DBImpl consists of a set of Versions. The
+// newest version is called "current". Older versions may be kept
+// around to provide a consistent view to live iterators.
+//
+// Each Version keeps track of a set of Table files per level. The
+// entire set of versions is maintained in a VersionSet.
+//
+// Version,VersionSet are thread-compatible, but require external
+// synchronization on all accesses.
+
+#pragma once
+#include <atomic>
+#include <deque>
+#include <limits>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_picker.h"
+#include "db/dbformat.h"
+#include "db/file_indexer.h"
+#include "db/log_reader.h"
+#include "db/range_del_aggregator.h"
+#include "db/read_callback.h"
+#include "db/table_cache.h"
+#include "db/version_builder.h"
+#include "db/version_edit.h"
+#include "db/write_controller.h"
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_checksum.h"
+#include "table/get_context.h"
+#include "table/multiget_context.h"
+#include "trace_replay/block_cache_tracer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace log {
+class Writer;
+}
+
+class Compaction;
+class LogBuffer;
+class LookupKey;
+class MemTable;
+class Version;
+class VersionSet;
+class WriteBufferManager;
+class MergeContext;
+class ColumnFamilySet;
+class MergeIteratorBuilder;
+
+// A VersionEdit is always supposed to be valid and is used to point at
+// entries in the MANIFEST. Ideally it should not be used as a container to
+// carry around a few of its fields as function params, because that can lead
+// readers to think it is a valid entry from the MANIFEST. To avoid that
+// confusion, VersionEditParams is introduced to simply carry around multiple
+// VersionEdit params. It need not point to a valid record in the MANIFEST.
+using VersionEditParams = VersionEdit;
+
+// Return the smallest index i such that file_level.files[i]->largest >= key.
+// Return file_level.num_files if there is no such file.
+// REQUIRES: "file_level.files" contains a sorted list of
+// non-overlapping files.
+extern int FindFile(const InternalKeyComparator& icmp,
+ const LevelFilesBrief& file_level, const Slice& key);
+
+// Returns true iff some file in "files" overlaps the user key range
+// [*smallest,*largest].
+// smallest==nullptr represents a key smaller than all keys in the DB.
+// largest==nullptr represents a key larger than all keys in the DB.
+// REQUIRES: If disjoint_sorted_files, file_level.files[]
+// contains disjoint ranges in sorted order.
+extern bool SomeFileOverlapsRange(const InternalKeyComparator& icmp,
+ bool disjoint_sorted_files,
+ const LevelFilesBrief& file_level,
+ const Slice* smallest_user_key,
+ const Slice* largest_user_key);
+
+// Generate LevelFilesBrief from a vector of FileMetaData*.
+// Copies smallest_key and largest_key data to sequentially allocated memory.
+// arena: Arena used to allocate the memory
+extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
+ const std::vector<FileMetaData*>& files,
+ Arena* arena);
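+
+// Minimal illustrative sketch (a hypothetical helper, for exposition only and
+// not referenced elsewhere) of how the two declarations above are typically
+// combined: first flatten a level's FileMetaData list into a LevelFilesBrief,
+// then binary search it. The return value equals brief.num_files when every
+// file's largest key is smaller than the internal key being probed.
+inline size_t ExampleLocateFileInLevel(
+    const InternalKeyComparator& icmp,
+    const std::vector<FileMetaData*>& level_files, const Slice& internal_key,
+    Arena* arena) {
+  LevelFilesBrief brief;
+  DoGenerateLevelFilesBrief(&brief, level_files, arena);
+  return static_cast<size_t>(FindFile(icmp, brief, internal_key));
+}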
+
+// Information of the storage associated with each Version, including number of
+// levels of LSM tree, files information at each level, files marked for
+// compaction, etc.
+class VersionStorageInfo {
+ public:
+ VersionStorageInfo(const InternalKeyComparator* internal_comparator,
+ const Comparator* user_comparator, int num_levels,
+ CompactionStyle compaction_style,
+ VersionStorageInfo* src_vstorage,
+ bool _force_consistency_checks);
+ // No copying allowed
+ VersionStorageInfo(const VersionStorageInfo&) = delete;
+ void operator=(const VersionStorageInfo&) = delete;
+ ~VersionStorageInfo();
+
+ void Reserve(int level, size_t size) { files_[level].reserve(size); }
+
+ void AddFile(int level, FileMetaData* f, Logger* info_log = nullptr);
+
+ void SetFinalized();
+
+ // Update num_non_empty_levels_.
+ void UpdateNumNonEmptyLevels();
+
+ void GenerateFileIndexer() {
+ file_indexer_.UpdateIndex(&arena_, num_non_empty_levels_, files_);
+ }
+
+ // Update the accumulated stats from a file-meta.
+ void UpdateAccumulatedStats(FileMetaData* file_meta);
+
+ // Decrease the current stat from a to-be-deleted file-meta
+ void RemoveCurrentStats(FileMetaData* file_meta);
+
+ void ComputeCompensatedSizes();
+
+ // Updates internal structures that keep track of compaction scores
+ // We use compaction scores to figure out which compaction to do next
+ // REQUIRES: db_mutex held!!
+ // TODO find a better way to pass compaction_options_fifo.
+ void ComputeCompactionScore(const ImmutableCFOptions& immutable_cf_options,
+ const MutableCFOptions& mutable_cf_options);
+
+  // Estimate estimated_compaction_needed_bytes_
+ void EstimateCompactionBytesNeeded(
+ const MutableCFOptions& mutable_cf_options);
+
+ // This computes files_marked_for_compaction_ and is called by
+ // ComputeCompactionScore()
+ void ComputeFilesMarkedForCompaction();
+
+  // This computes expired_ttl_files_ and is called by
+ // ComputeCompactionScore()
+ void ComputeExpiredTtlFiles(const ImmutableCFOptions& ioptions,
+ const uint64_t ttl);
+
+ // This computes files_marked_for_periodic_compaction_ and is called by
+ // ComputeCompactionScore()
+ void ComputeFilesMarkedForPeriodicCompaction(
+ const ImmutableCFOptions& ioptions,
+ const uint64_t periodic_compaction_seconds);
+
+ // This computes bottommost_files_marked_for_compaction_ and is called by
+ // ComputeCompactionScore() or UpdateOldestSnapshot().
+ //
+ // Among bottommost files (assumes they've already been computed), marks the
+ // ones that have keys that would be eliminated if recompacted, according to
+ // the seqnum of the oldest existing snapshot. Must be called every time
+ // oldest snapshot changes as that is when bottom-level files can become
+ // eligible for compaction.
+ //
+ // REQUIRES: DB mutex held
+ void ComputeBottommostFilesMarkedForCompaction();
+
+ // Generate level_files_brief_ from files_
+ void GenerateLevelFilesBrief();
+ // Sort all files for this version based on their file size and
+ // record results in files_by_compaction_pri_. The largest files are listed
+ // first.
+ void UpdateFilesByCompactionPri(CompactionPri compaction_pri);
+
+ void GenerateLevel0NonOverlapping();
+ bool level0_non_overlapping() const {
+ return level0_non_overlapping_;
+ }
+
+ // Check whether each file in this version is bottommost (i.e., nothing in its
+ // key-range could possibly exist in an older file/level).
+ // REQUIRES: This version has not been saved
+ void GenerateBottommostFiles();
+
+ // Updates the oldest snapshot and related internal state, like the bottommost
+ // files marked for compaction.
+ // REQUIRES: DB mutex held
+ void UpdateOldestSnapshot(SequenceNumber oldest_snapshot_seqnum);
+
+ int MaxInputLevel() const;
+ int MaxOutputLevel(bool allow_ingest_behind) const;
+
+ // Return level number that has idx'th highest score
+ int CompactionScoreLevel(int idx) const { return compaction_level_[idx]; }
+
+ // Return idx'th highest score
+ double CompactionScore(int idx) const { return compaction_score_[idx]; }
+
+ void GetOverlappingInputs(
+ int level, const InternalKey* begin, // nullptr means before all keys
+ const InternalKey* end, // nullptr means after all keys
+ std::vector<FileMetaData*>* inputs,
+ int hint_index = -1, // index of overlap file
+ int* file_index = nullptr, // return index of overlap file
+ bool expand_range = true, // if set, returns files which overlap the
+ // range and overlap each other. If false,
+ // then just files intersecting the range
+ InternalKey** next_smallest = nullptr) // if non-null, returns the
+ const; // smallest key of next file not included
+ void GetCleanInputsWithinInterval(
+ int level, const InternalKey* begin, // nullptr means before all keys
+ const InternalKey* end, // nullptr means after all keys
+ std::vector<FileMetaData*>* inputs,
+ int hint_index = -1, // index of overlap file
+ int* file_index = nullptr) // return index of overlap file
+ const;
+
+ void GetOverlappingInputsRangeBinarySearch(
+ int level, // level > 0
+ const InternalKey* begin, // nullptr means before all keys
+ const InternalKey* end, // nullptr means after all keys
+ std::vector<FileMetaData*>* inputs,
+ int hint_index, // index of overlap file
+ int* file_index, // return index of overlap file
+ bool within_interval = false, // if set, force the inputs within interval
+ InternalKey** next_smallest = nullptr) // if non-null, returns the
+ const; // smallest key of next file not included
+
+ // Returns true iff some file in the specified level overlaps
+ // some part of [*smallest_user_key,*largest_user_key].
+ // smallest_user_key==NULL represents a key smaller than all keys in the DB.
+  // largest_user_key==NULL represents a key larger than all keys in the DB.
+ bool OverlapInLevel(int level, const Slice* smallest_user_key,
+ const Slice* largest_user_key);
+
+ // Returns true iff the first or last file in inputs contains
+ // an overlapping user key to the file "just outside" of it (i.e.
+ // just after the last file, or just before the first file)
+ // REQUIRES: "*inputs" is a sorted list of non-overlapping files
+ bool HasOverlappingUserKey(const std::vector<FileMetaData*>* inputs,
+ int level);
+
+ int num_levels() const { return num_levels_; }
+
+ // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+ int num_non_empty_levels() const {
+ assert(finalized_);
+ return num_non_empty_levels_;
+ }
+
+ // REQUIRES: This version has been finalized.
+ // (CalculateBaseBytes() is called)
+  // This may or may not return the number of level files. It is kept this way
+  // to preserve backward-compatible behavior in universal compaction.
+ int l0_delay_trigger_count() const { return l0_delay_trigger_count_; }
+
+ void set_l0_delay_trigger_count(int v) { l0_delay_trigger_count_ = v; }
+
+ // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+ int NumLevelFiles(int level) const {
+ assert(finalized_);
+ return static_cast<int>(files_[level].size());
+ }
+
+ // Return the combined file size of all files at the specified level.
+ uint64_t NumLevelBytes(int level) const;
+
+ // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+ const std::vector<FileMetaData*>& LevelFiles(int level) const {
+ return files_[level];
+ }
+
+ const ROCKSDB_NAMESPACE::LevelFilesBrief& LevelFilesBrief(int level) const {
+ assert(level < static_cast<int>(level_files_brief_.size()));
+ return level_files_brief_[level];
+ }
+
+ // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+ const std::vector<int>& FilesByCompactionPri(int level) const {
+ assert(finalized_);
+ return files_by_compaction_pri_[level];
+ }
+
+ // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+ // REQUIRES: DB mutex held during access
+ const autovector<std::pair<int, FileMetaData*>>& FilesMarkedForCompaction()
+ const {
+ assert(finalized_);
+ return files_marked_for_compaction_;
+ }
+
+ // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+ // REQUIRES: DB mutex held during access
+ const autovector<std::pair<int, FileMetaData*>>& ExpiredTtlFiles() const {
+ assert(finalized_);
+ return expired_ttl_files_;
+ }
+
+ // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+ // REQUIRES: DB mutex held during access
+ const autovector<std::pair<int, FileMetaData*>>&
+ FilesMarkedForPeriodicCompaction() const {
+ assert(finalized_);
+ return files_marked_for_periodic_compaction_;
+ }
+
+ void TEST_AddFileMarkedForPeriodicCompaction(int level, FileMetaData* f) {
+ files_marked_for_periodic_compaction_.emplace_back(level, f);
+ }
+
+ // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+ // REQUIRES: DB mutex held during access
+ const autovector<std::pair<int, FileMetaData*>>&
+ BottommostFilesMarkedForCompaction() const {
+ assert(finalized_);
+ return bottommost_files_marked_for_compaction_;
+ }
+
+ int base_level() const { return base_level_; }
+ double level_multiplier() const { return level_multiplier_; }
+
+ // REQUIRES: lock is held
+ // Set the index that is used to offset into files_by_compaction_pri_ to find
+ // the next compaction candidate file.
+ void SetNextCompactionIndex(int level, int index) {
+ next_file_to_compact_by_size_[level] = index;
+ }
+
+ // REQUIRES: lock is held
+ int NextCompactionIndex(int level) const {
+ return next_file_to_compact_by_size_[level];
+ }
+
+ // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+ const FileIndexer& file_indexer() const {
+ assert(finalized_);
+ return file_indexer_;
+ }
+
+ // Only the first few entries of files_by_compaction_pri_ are sorted.
+ // There is no need to sort all the files because it is likely
+ // that on a running system, we need to look at only the first
+ // few largest files because a new version is created every few
+ // seconds/minutes (because of concurrent compactions).
+ static const size_t kNumberFilesToSort = 50;
+
+ // Return a human-readable short (single-line) summary of the number
+ // of files per level. Uses *scratch as backing store.
+ struct LevelSummaryStorage {
+ char buffer[1000];
+ };
+ struct FileSummaryStorage {
+ char buffer[3000];
+ };
+ const char* LevelSummary(LevelSummaryStorage* scratch) const;
+ // Return a human-readable short (single-line) summary of files
+ // in a specified level. Uses *scratch as backing store.
+ const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const;
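+  // Minimal usage sketch of the scratch-buffer pattern above (hypothetical
+  // caller, for exposition only):
+  //
+  //   VersionStorageInfo::LevelSummaryStorage tmp;
+  //   ROCKS_LOG_INFO(info_log, "files per level: %s",
+  //                  vstorage->LevelSummary(&tmp));
+  //
+  // The returned pointer refers into *scratch, so the storage object must
+  // outlive every use of the formatted string.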
+
+ // Return the maximum overlapping data (in bytes) at next level for any
+ // file at a level >= 1.
+ int64_t MaxNextLevelOverlappingBytes();
+
+ // Return a human readable string that describes this version's contents.
+ std::string DebugString(bool hex = false) const;
+
+ uint64_t GetAverageValueSize() const {
+ if (accumulated_num_non_deletions_ == 0) {
+ return 0;
+ }
+ assert(accumulated_raw_key_size_ + accumulated_raw_value_size_ > 0);
+ assert(accumulated_file_size_ > 0);
+ return accumulated_raw_value_size_ / accumulated_num_non_deletions_ *
+ accumulated_file_size_ /
+ (accumulated_raw_key_size_ + accumulated_raw_value_size_);
+ }
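+  // Worked example for the estimate above (hypothetical numbers): with
+  // accumulated_raw_value_size_ = 800, accumulated_num_non_deletions_ = 100,
+  // accumulated_file_size_ = 500 and accumulated_raw_key_size_ = 200, the
+  // average raw value is 800 / 100 = 8 bytes, which is then scaled by the
+  // on-disk ratio 500 / (200 + 800), giving an estimated value size of 4.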
+
+ uint64_t GetEstimatedActiveKeys() const;
+
+ double GetEstimatedCompressionRatioAtLevel(int level) const;
+
+  // Re-initializes the index that is used to offset into
+  // files_by_compaction_pri_ to find the next compaction candidate file.
+ void ResetNextCompactionIndex(int level) {
+ next_file_to_compact_by_size_[level] = 0;
+ }
+
+ const InternalKeyComparator* InternalComparator() {
+ return internal_comparator_;
+ }
+
+ // Returns maximum total bytes of data on a given level.
+ uint64_t MaxBytesForLevel(int level) const;
+
+ // Must be called after any change to MutableCFOptions.
+ void CalculateBaseBytes(const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& options);
+
+ // Returns an estimate of the amount of live data in bytes.
+ uint64_t EstimateLiveDataSize() const;
+
+ uint64_t estimated_compaction_needed_bytes() const {
+ return estimated_compaction_needed_bytes_;
+ }
+
+ void TEST_set_estimated_compaction_needed_bytes(uint64_t v) {
+ estimated_compaction_needed_bytes_ = v;
+ }
+
+ bool force_consistency_checks() const { return force_consistency_checks_; }
+
+ SequenceNumber bottommost_files_mark_threshold() const {
+ return bottommost_files_mark_threshold_;
+ }
+
+ // Returns whether any key in [`smallest_key`, `largest_key`] could appear in
+ // an older L0 file than `last_l0_idx` or in a greater level than `last_level`
+ //
+ // @param last_level Level after which we check for overlap
+ // @param last_l0_idx If `last_level == 0`, index of L0 file after which we
+ // check for overlap; otherwise, must be -1
+ bool RangeMightExistAfterSortedRun(const Slice& smallest_user_key,
+ const Slice& largest_user_key,
+ int last_level, int last_l0_idx);
+
+ private:
+ const InternalKeyComparator* internal_comparator_;
+ const Comparator* user_comparator_;
+ int num_levels_; // Number of levels
+  int num_non_empty_levels_;  // Number of non-empty levels. Any level larger
+                              // than it is guaranteed to be empty.
+ // Per-level max bytes
+ std::vector<uint64_t> level_max_bytes_;
+
+ // A short brief metadata of files per level
+ autovector<ROCKSDB_NAMESPACE::LevelFilesBrief> level_files_brief_;
+ FileIndexer file_indexer_;
+ Arena arena_; // Used to allocate space for file_levels_
+
+ CompactionStyle compaction_style_;
+
+ // List of files per level, files in each level are arranged
+ // in increasing order of keys
+ std::vector<FileMetaData*>* files_;
+
+ // Level that L0 data should be compacted to. All levels < base_level_ should
+ // be empty. -1 if it is not level-compaction so it's not applicable.
+ int base_level_;
+
+ double level_multiplier_;
+
+ // A list for the same set of files that are stored in files_,
+ // but files in each level are now sorted based on file
+ // size. The file with the largest size is at the front.
+ // This vector stores the index of the file from files_.
+ std::vector<std::vector<int>> files_by_compaction_pri_;
+
+  // If true, files in L0 have keys with non-overlapping ranges
+ bool level0_non_overlapping_;
+
+ // An index into files_by_compaction_pri_ that specifies the first
+ // file that is not yet compacted
+ std::vector<int> next_file_to_compact_by_size_;
+
+ // Only the first few entries of files_by_compaction_pri_ are sorted.
+ // There is no need to sort all the files because it is likely
+ // that on a running system, we need to look at only the first
+ // few largest files because a new version is created every few
+ // seconds/minutes (because of concurrent compactions).
+ static const size_t number_of_files_to_sort_ = 50;
+
+ // This vector contains list of files marked for compaction and also not
+ // currently being compacted. It is protected by DB mutex. It is calculated in
+ // ComputeCompactionScore()
+ autovector<std::pair<int, FileMetaData*>> files_marked_for_compaction_;
+
+ autovector<std::pair<int, FileMetaData*>> expired_ttl_files_;
+
+ autovector<std::pair<int, FileMetaData*>>
+ files_marked_for_periodic_compaction_;
+
+ // These files are considered bottommost because none of their keys can exist
+ // at lower levels. They are not necessarily all in the same level. The marked
+ // ones are eligible for compaction because they contain duplicate key
+ // versions that are no longer protected by snapshot. These variables are
+ // protected by DB mutex and are calculated in `GenerateBottommostFiles()` and
+ // `ComputeBottommostFilesMarkedForCompaction()`.
+ autovector<std::pair<int, FileMetaData*>> bottommost_files_;
+ autovector<std::pair<int, FileMetaData*>>
+ bottommost_files_marked_for_compaction_;
+
+ // Threshold for needing to mark another bottommost file. Maintain it so we
+ // can quickly check when releasing a snapshot whether more bottommost files
+ // became eligible for compaction. It's defined as the min of the max nonzero
+ // seqnums of unmarked bottommost files.
+ SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+
+ // Monotonically increases as we release old snapshots. Zero indicates no
+ // snapshots have been released yet. When no snapshots remain we set it to the
+ // current seqnum, which needs to be protected as a snapshot can still be
+ // created that references it.
+ SequenceNumber oldest_snapshot_seqnum_ = 0;
+
+ // Level that should be compacted next and its compaction score.
+ // Score < 1 means compaction is not strictly needed. These fields
+ // are initialized by Finalize().
+  // The most critical level to be compacted is listed first.
+  // These are used to pick the best compaction level.
+ std::vector<double> compaction_score_;
+ std::vector<int> compaction_level_;
+ int l0_delay_trigger_count_ = 0; // Count used to trigger slow down and stop
+ // for number of L0 files.
+
+ // the following are the sampled temporary stats.
+ // the current accumulated size of sampled files.
+ uint64_t accumulated_file_size_;
+ // the current accumulated size of all raw keys based on the sampled files.
+ uint64_t accumulated_raw_key_size_;
+  // the current accumulated size of all raw values based on the sampled files.
+ uint64_t accumulated_raw_value_size_;
+ // total number of non-deletion entries
+ uint64_t accumulated_num_non_deletions_;
+ // total number of deletion entries
+ uint64_t accumulated_num_deletions_;
+ // current number of non_deletion entries
+ uint64_t current_num_non_deletions_;
+ // current number of deletion entries
+ uint64_t current_num_deletions_;
+ // current number of file samples
+ uint64_t current_num_samples_;
+ // Estimated bytes needed to be compacted until all levels' size is down to
+ // target sizes.
+ uint64_t estimated_compaction_needed_bytes_;
+
+ bool finalized_;
+
+ // If set to true, we will run consistency checks even if RocksDB
+ // is compiled in release mode
+ bool force_consistency_checks_;
+
+ friend class Version;
+ friend class VersionSet;
+};
+
+using MultiGetRange = MultiGetContext::Range;
+// A column family's version consists of the SST files owned by the column
+// family at a certain point in time.
+class Version {
+ public:
+ // Append to *iters a sequence of iterators that will
+ // yield the contents of this Version when merged together.
+ // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+ void AddIterators(const ReadOptions&, const FileOptions& soptions,
+ MergeIteratorBuilder* merger_iter_builder,
+ RangeDelAggregator* range_del_agg);
+
+ void AddIteratorsForLevel(const ReadOptions&, const FileOptions& soptions,
+ MergeIteratorBuilder* merger_iter_builder,
+ int level, RangeDelAggregator* range_del_agg);
+
+ Status OverlapWithLevelIterator(const ReadOptions&, const FileOptions&,
+ const Slice& smallest_user_key,
+ const Slice& largest_user_key,
+ int level, bool* overlap);
+
+ // Lookup the value for key or get all merge operands for key.
+ // If do_merge = true (default) then lookup value for key.
+ // Behavior if do_merge = true:
+ // If found, store it in *value and
+ // return OK. Else return a non-OK status.
+ // Uses *operands to store merge_operator operations to apply later.
+ //
+ // If the ReadOptions.read_tier is set to do a read-only fetch, then
+ // *value_found will be set to false if it cannot be determined whether
+ // this value exists without doing IO.
+ //
+ // If the key is Deleted, *status will be set to NotFound and
+ // *key_exists will be set to true.
+ // If no key was found, *status will be set to NotFound and
+ // *key_exists will be set to false.
+ // If seq is non-null, *seq will be set to the sequence number found
+ // for the key if a key was found.
+ // Behavior if do_merge = false
+ // If the key has any merge operands then store them in
+ // merge_context.operands_list and don't merge the operands
+ // REQUIRES: lock is not held
+ void Get(const ReadOptions&, const LookupKey& key, PinnableSlice* value,
+ Status* status, MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ bool* value_found = nullptr, bool* key_exists = nullptr,
+ SequenceNumber* seq = nullptr, ReadCallback* callback = nullptr,
+ bool* is_blob = nullptr, bool do_merge = true);
+
+ void MultiGet(const ReadOptions&, MultiGetRange* range,
+ ReadCallback* callback = nullptr, bool* is_blob = nullptr);
+
+ // Loads some stats information from files. Call without mutex held. It needs
+ // to be called before applying the version to the version set.
+ void PrepareApply(const MutableCFOptions& mutable_cf_options,
+ bool update_stats);
+
+ // Reference count management (so Versions do not disappear out from
+ // under live iterators)
+ void Ref();
+ // Decrease reference count. Delete the object if no reference left
+ // and return true. Otherwise, return false.
+ bool Unref();
+
+ // Add all files listed in the current version to *live.
+ void AddLiveFiles(std::vector<FileDescriptor>* live);
+
+ // Return a human readable string that describes this version's contents.
+ std::string DebugString(bool hex = false, bool print_stats = false) const;
+
+ // Returns the version number of this version
+ uint64_t GetVersionNumber() const { return version_number_; }
+
+ // REQUIRES: lock is held
+ // On success, "tp" will contains the table properties of the file
+ // specified in "file_meta". If the file name of "file_meta" is
+ // known ahead, passing it by a non-null "fname" can save a
+ // file-name conversion.
+ Status GetTableProperties(std::shared_ptr<const TableProperties>* tp,
+ const FileMetaData* file_meta,
+ const std::string* fname = nullptr) const;
+
+ // REQUIRES: lock is held
+ // On success, *props will be populated with all SSTables' table properties.
+ // The keys of `props` are the sst file name, the values of `props` are the
+ // tables' properties, represented as std::shared_ptr.
+ Status GetPropertiesOfAllTables(TablePropertiesCollection* props);
+ Status GetPropertiesOfAllTables(TablePropertiesCollection* props, int level);
+ Status GetPropertiesOfTablesInRange(const Range* range, std::size_t n,
+ TablePropertiesCollection* props) const;
+
+ // Print summary of range delete tombstones in SST files into out_str,
+ // with maximum max_entries_to_print entries printed out.
+ Status TablesRangeTombstoneSummary(int max_entries_to_print,
+ std::string* out_str);
+
+ // REQUIRES: lock is held
+ // On success, "tp" will contains the aggregated table property among
+ // the table properties of all sst files in this version.
+ Status GetAggregatedTableProperties(
+ std::shared_ptr<const TableProperties>* tp, int level = -1);
+
+ uint64_t GetEstimatedActiveKeys() {
+ return storage_info_.GetEstimatedActiveKeys();
+ }
+
+ size_t GetMemoryUsageByTableReaders();
+
+ ColumnFamilyData* cfd() const { return cfd_; }
+
+ // Return the next Version in the linked list. Used for debug only
+ Version* TEST_Next() const {
+ return next_;
+ }
+
+ int TEST_refs() const { return refs_; }
+
+ VersionStorageInfo* storage_info() { return &storage_info_; }
+
+ VersionSet* version_set() { return vset_; }
+
+ void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta);
+
+ uint64_t GetSstFilesSize();
+
+ // Retrieves the file_creation_time of the oldest file in the DB.
+ // Prerequisite for this API is max_open_files = -1
+ void GetCreationTimeOfOldestFile(uint64_t* creation_time);
+
+ const MutableCFOptions& GetMutableCFOptions() { return mutable_cf_options_; }
+
+ private:
+ Env* env_;
+ FileSystem* fs_;
+ friend class ReactiveVersionSet;
+ friend class VersionSet;
+
+ const InternalKeyComparator* internal_comparator() const {
+ return storage_info_.internal_comparator_;
+ }
+ const Comparator* user_comparator() const {
+ return storage_info_.user_comparator_;
+ }
+
+ bool PrefixMayMatch(const ReadOptions& read_options,
+ InternalIterator* level_iter,
+ const Slice& internal_prefix) const;
+
+ // Returns true if the filter blocks in the specified level will not be
+ // checked during read operations. In certain cases (trivial move or preload),
+  // the filter block may already be cached, but we still do not access it, so
+  // that it eventually expires from the cache.
+ bool IsFilterSkipped(int level, bool is_file_last_in_level = false);
+
+ // The helper function of UpdateAccumulatedStats, which may fill the missing
+ // fields of file_meta from its associated TableProperties.
+ // Returns true if it does initialize FileMetaData.
+ bool MaybeInitializeFileMetaData(FileMetaData* file_meta);
+
+ // Update the accumulated stats associated with the current version.
+ // This accumulated stats will be used in compaction.
+ void UpdateAccumulatedStats(bool update_stats);
+
+ // Sort all files for this version based on their file size and
+ // record results in files_by_compaction_pri_. The largest files are listed
+ // first.
+ void UpdateFilesByCompactionPri();
+
+ ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs
+ Logger* info_log_;
+ Statistics* db_statistics_;
+ TableCache* table_cache_;
+ const MergeOperator* merge_operator_;
+
+ VersionStorageInfo storage_info_;
+ VersionSet* vset_; // VersionSet to which this Version belongs
+ Version* next_; // Next version in linked list
+ Version* prev_; // Previous version in linked list
+ int refs_; // Number of live refs to this version
+ const FileOptions file_options_;
+ const MutableCFOptions mutable_cf_options_;
+
+ // A version number that uniquely represents this version. This is
+ // used for debugging and logging purposes only.
+ uint64_t version_number_;
+
+ Version(ColumnFamilyData* cfd, VersionSet* vset, const FileOptions& file_opt,
+ MutableCFOptions mutable_cf_options, uint64_t version_number = 0);
+
+ ~Version();
+
+ // No copying allowed
+ Version(const Version&) = delete;
+ void operator=(const Version&) = delete;
+};
+
+struct ObsoleteFileInfo {
+ FileMetaData* metadata;
+ std::string path;
+
+ ObsoleteFileInfo() noexcept : metadata(nullptr) {}
+ ObsoleteFileInfo(FileMetaData* f, const std::string& file_path)
+ : metadata(f), path(file_path) {}
+
+ ObsoleteFileInfo(const ObsoleteFileInfo&) = delete;
+ ObsoleteFileInfo& operator=(const ObsoleteFileInfo&) = delete;
+
+ ObsoleteFileInfo(ObsoleteFileInfo&& rhs) noexcept :
+ ObsoleteFileInfo() {
+ *this = std::move(rhs);
+ }
+
+ ObsoleteFileInfo& operator=(ObsoleteFileInfo&& rhs) noexcept {
+ path = std::move(rhs.path);
+ metadata = rhs.metadata;
+ rhs.metadata = nullptr;
+
+ return *this;
+ }
+
+ void DeleteMetadata() {
+ delete metadata;
+ metadata = nullptr;
+ }
+};
+
+class BaseReferencedVersionBuilder;
+
+class AtomicGroupReadBuffer {
+ public:
+ Status AddEdit(VersionEdit* edit);
+ void Clear();
+ bool IsFull() const;
+ bool IsEmpty() const;
+
+ uint64_t TEST_read_edits_in_atomic_group() const {
+ return read_edits_in_atomic_group_;
+ }
+ std::vector<VersionEdit>& replay_buffer() { return replay_buffer_; }
+
+ private:
+ uint64_t read_edits_in_atomic_group_ = 0;
+ std::vector<VersionEdit> replay_buffer_;
+};
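+// Minimal sketch of how a reader of the MANIFEST might drive the buffer above
+// (hypothetical pseudo-loop, error handling omitted):
+//
+//   AtomicGroupReadBuffer buffer;
+//   for every decoded VersionEdit edit:
+//     if edit belongs to an atomic group:
+//       buffer.AddEdit(&edit);
+//       if (buffer.IsFull()) { apply buffer.replay_buffer(); buffer.Clear(); }
+//     else:
+//       apply edit directly;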
+
+// VersionSet is the collection of versions of all the column families of the
+// database. Each database owns one VersionSet. A VersionSet has access to all
+// column families via ColumnFamilySet, i.e. set of the column families.
+class VersionSet {
+ public:
+ VersionSet(const std::string& dbname, const ImmutableDBOptions* db_options,
+ const FileOptions& file_options, Cache* table_cache,
+ WriteBufferManager* write_buffer_manager,
+ WriteController* write_controller,
+ BlockCacheTracer* const block_cache_tracer);
+ // No copying allowed
+ VersionSet(const VersionSet&) = delete;
+ void operator=(const VersionSet&) = delete;
+
+ virtual ~VersionSet();
+
+ // Apply *edit to the current version to form a new descriptor that
+ // is both saved to persistent state and installed as the new
+ // current version. Will release *mu while actually writing to the file.
+  // column_family_options has to be set if edit is a column family add.
+ // REQUIRES: *mu is held on entry.
+ // REQUIRES: no other thread concurrently calls LogAndApply()
+ Status LogAndApply(
+ ColumnFamilyData* column_family_data,
+ const MutableCFOptions& mutable_cf_options, VersionEdit* edit,
+ InstrumentedMutex* mu, Directory* db_directory = nullptr,
+ bool new_descriptor_log = false,
+ const ColumnFamilyOptions* column_family_options = nullptr) {
+ autovector<ColumnFamilyData*> cfds;
+ cfds.emplace_back(column_family_data);
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ mutable_cf_options_list.emplace_back(&mutable_cf_options);
+ autovector<autovector<VersionEdit*>> edit_lists;
+ autovector<VersionEdit*> edit_list;
+ edit_list.emplace_back(edit);
+ edit_lists.emplace_back(edit_list);
+ return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu,
+ db_directory, new_descriptor_log, column_family_options);
+ }
+  // The batch version. If edit_list.size() > 1, caller must ensure that
+  // no edit in the list is a column family add or drop.
+ Status LogAndApply(
+ ColumnFamilyData* column_family_data,
+ const MutableCFOptions& mutable_cf_options,
+ const autovector<VersionEdit*>& edit_list, InstrumentedMutex* mu,
+ Directory* db_directory = nullptr, bool new_descriptor_log = false,
+ const ColumnFamilyOptions* column_family_options = nullptr) {
+ autovector<ColumnFamilyData*> cfds;
+ cfds.emplace_back(column_family_data);
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ mutable_cf_options_list.emplace_back(&mutable_cf_options);
+ autovector<autovector<VersionEdit*>> edit_lists;
+ edit_lists.emplace_back(edit_list);
+ return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu,
+ db_directory, new_descriptor_log, column_family_options);
+ }
+
+  // The across-multi-cf batch version. If edit_lists contains more than
+  // one version edit, the caller must ensure that no edit in the list is a
+  // column family manipulation.
+ virtual Status LogAndApply(
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+ const autovector<autovector<VersionEdit*>>& edit_lists,
+ InstrumentedMutex* mu, Directory* db_directory = nullptr,
+ bool new_descriptor_log = false,
+ const ColumnFamilyOptions* new_cf_options = nullptr);
+
+ static Status GetCurrentManifestPath(const std::string& dbname,
+ FileSystem* fs,
+ std::string* manifest_filename,
+ uint64_t* manifest_file_number);
+
+ // Recover the last saved descriptor from persistent storage.
+ // If read_only == true, Recover() will not complain if some column families
+ // are not opened
+ Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
+ bool read_only = false, std::string* db_id = nullptr);
+
+ // Reads a manifest file and returns a list of column families in
+ // column_families.
+ static Status ListColumnFamilies(std::vector<std::string>* column_families,
+ const std::string& dbname, FileSystem* fs);
+
+#ifndef ROCKSDB_LITE
+  // Try to reduce the number of levels. This call is valid when
+  // only one level from the new max level to the old
+  // max level contains files.
+  // The call is static, since the number of levels is immutable during
+  // the lifetime of a RocksDB instance. It reduces the number of levels
+  // in a DB by applying changes to the manifest.
+  // For example, if a db currently has 7 levels [0-6], a call to
+  // reduce to 5 levels [0-4] can only be executed when only one level
+  // among [4-6] contains files.
+ static Status ReduceNumberOfLevels(const std::string& dbname,
+ const Options* options,
+ const FileOptions& file_options,
+ int new_levels);
+
+ // Get the checksum information of all live files
+ Status GetLiveFilesChecksumInfo(FileChecksumList* checksum_list);
+
+ // printf contents (for debugging)
+ Status DumpManifest(Options& options, std::string& manifestFileName,
+ bool verbose, bool hex = false, bool json = false);
+
+#endif // ROCKSDB_LITE
+
+ // Return the current manifest file number
+ uint64_t manifest_file_number() const { return manifest_file_number_; }
+
+ uint64_t options_file_number() const { return options_file_number_; }
+
+ uint64_t pending_manifest_file_number() const {
+ return pending_manifest_file_number_;
+ }
+
+ uint64_t current_next_file_number() const { return next_file_number_.load(); }
+
+ uint64_t min_log_number_to_keep_2pc() const {
+ return min_log_number_to_keep_2pc_.load();
+ }
+
+ // Allocate and return a new file number
+ uint64_t NewFileNumber() { return next_file_number_.fetch_add(1); }
+
+  // Fetch and add n new file numbers
+ uint64_t FetchAddFileNumber(uint64_t n) {
+ return next_file_number_.fetch_add(n);
+ }
+
+ // Return the last sequence number.
+ uint64_t LastSequence() const {
+ return last_sequence_.load(std::memory_order_acquire);
+ }
+
+ // Note: memory_order_acquire must be sufficient.
+ uint64_t LastAllocatedSequence() const {
+ return last_allocated_sequence_.load(std::memory_order_seq_cst);
+ }
+
+ // Note: memory_order_acquire must be sufficient.
+ uint64_t LastPublishedSequence() const {
+ return last_published_sequence_.load(std::memory_order_seq_cst);
+ }
+
+ // Set the last sequence number to s.
+ void SetLastSequence(uint64_t s) {
+ assert(s >= last_sequence_);
+ // Last visible sequence must always be less than last written seq
+ assert(!db_options_->two_write_queues || s <= last_allocated_sequence_);
+ last_sequence_.store(s, std::memory_order_release);
+ }
+
+ // Note: memory_order_release must be sufficient
+ void SetLastPublishedSequence(uint64_t s) {
+ assert(s >= last_published_sequence_);
+ last_published_sequence_.store(s, std::memory_order_seq_cst);
+ }
+
+ // Note: memory_order_release must be sufficient
+ void SetLastAllocatedSequence(uint64_t s) {
+ assert(s >= last_allocated_sequence_);
+ last_allocated_sequence_.store(s, std::memory_order_seq_cst);
+ }
+
+ // Note: memory_order_release must be sufficient
+ uint64_t FetchAddLastAllocatedSequence(uint64_t s) {
+ return last_allocated_sequence_.fetch_add(s, std::memory_order_seq_cst);
+ }
+
+ // Mark the specified file number as used.
+ // REQUIRED: this is only called during single-threaded recovery or repair.
+ void MarkFileNumberUsed(uint64_t number);
+
+ // Mark the specified log number as deleted
+ // REQUIRED: this is only called during single-threaded recovery or repair, or
+ // from ::LogAndApply where the global mutex is held.
+ void MarkMinLogNumberToKeep2PC(uint64_t number);
+
+ // Return the log file number for the log file that is currently
+ // being compacted, or zero if there is no such log file.
+ uint64_t prev_log_number() const { return prev_log_number_; }
+
+ // Returns the minimum log number which still has data not flushed to any SST
+ // file.
+ // In non-2PC mode, all the log numbers smaller than this number can be safely
+ // deleted.
+ uint64_t MinLogNumberWithUnflushedData() const {
+ return PreComputeMinLogNumberWithUnflushedData(nullptr);
+ }
+ // Returns the minimum log number which still has data not flushed to any SST
+ // file, except data from `cfd_to_skip`.
+ uint64_t PreComputeMinLogNumberWithUnflushedData(
+ const ColumnFamilyData* cfd_to_skip) const {
+ uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
+ for (auto cfd : *column_family_set_) {
+ if (cfd == cfd_to_skip) {
+ continue;
+ }
+ // It's safe to ignore dropped column families here:
+ // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST.
+ if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()) {
+ min_log_num = cfd->GetLogNumber();
+ }
+ }
+ return min_log_num;
+ }
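+  // Worked example for the two helpers above (hypothetical numbers): with two
+  // live column families whose log numbers are 10 and 15, the minimum is 10,
+  // so in non-2PC mode every WAL numbered strictly below 10 contains only
+  // flushed data and can be safely deleted.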
+
+ // Create an iterator that reads over the compaction inputs for "*c".
+ // The caller should delete the iterator when no longer needed.
+ InternalIterator* MakeInputIterator(
+ const Compaction* c, RangeDelAggregator* range_del_agg,
+ const FileOptions& file_options_compactions);
+
+ // Add all files listed in any live version to *live.
+ void AddLiveFiles(std::vector<FileDescriptor>* live_list);
+
+ // Return the approximate size of data to be scanned for range [start, end)
+ // in levels [start_level, end_level). If end_level == -1 it will search
+ // through all non-empty levels
+ uint64_t ApproximateSize(const SizeApproximationOptions& options, Version* v,
+ const Slice& start, const Slice& end,
+ int start_level, int end_level,
+ TableReaderCaller caller);
+
+ // Return the size of the current manifest file
+ uint64_t manifest_file_size() const { return manifest_file_size_; }
+
+ // verify that the files that we started with for a compaction
+ // still exist in the current version and in the same original level.
+ // This ensures that a concurrent compaction did not erroneously
+ // pick the same files to compact.
+ bool VerifyCompactionFileConsistency(Compaction* c);
+
+ Status GetMetadataForFile(uint64_t number, int* filelevel,
+ FileMetaData** metadata, ColumnFamilyData** cfd);
+
+ // This function doesn't support leveldb SST filenames
+ void GetLiveFilesMetaData(std::vector<LiveFileMetaData> *metadata);
+
+ void GetObsoleteFiles(std::vector<ObsoleteFileInfo>* files,
+ std::vector<std::string>* manifest_filenames,
+ uint64_t min_pending_output);
+
+ ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
+ const FileOptions& file_options() { return file_options_; }
+ void ChangeFileOptions(const MutableDBOptions& new_options) {
+ file_options_.writable_file_max_buffer_size =
+ new_options.writable_file_max_buffer_size;
+ }
+
+ const ImmutableDBOptions* db_options() const { return db_options_; }
+
+ static uint64_t GetNumLiveVersions(Version* dummy_versions);
+
+ static uint64_t GetTotalSstFilesSize(Version* dummy_versions);
+
+ protected:
+ struct ManifestWriter;
+
+ friend class Version;
+ friend class DBImpl;
+ friend class DBImplReadOnly;
+
+ struct LogReporter : public log::Reader::Reporter {
+ Status* status;
+ virtual void Corruption(size_t /*bytes*/, const Status& s) override {
+ if (this->status->ok()) *this->status = s;
+ }
+ };
+
+ // Returns approximated offset of a key in a file for a given version.
+ uint64_t ApproximateOffsetOf(Version* v, const FdWithKeyRange& f,
+ const Slice& key, TableReaderCaller caller);
+
+ // Returns approximated data size between start and end keys in a file
+ // for a given version.
+ uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f,
+ const Slice& start, const Slice& end,
+ TableReaderCaller caller);
+
+ struct MutableCFState {
+ uint64_t log_number;
+ };
+
+ // Save current contents to *log
+ Status WriteCurrentStateToManifest(
+ const std::unordered_map<uint32_t, MutableCFState>& curr_state,
+ log::Writer* log);
+
+ void AppendVersion(ColumnFamilyData* column_family_data, Version* v);
+
+ ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options,
+ VersionEdit* edit);
+
+ Status ReadAndRecover(
+ log::Reader* reader, AtomicGroupReadBuffer* read_buffer,
+ const std::unordered_map<std::string, ColumnFamilyOptions>&
+ name_to_options,
+ std::unordered_map<int, std::string>& column_families_not_found,
+ std::unordered_map<
+ uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>& builders,
+ VersionEditParams* version_edit, std::string* db_id = nullptr);
+
+ // REQUIRES db mutex
+ Status ApplyOneVersionEditToBuilder(
+ VersionEdit& edit,
+ const std::unordered_map<std::string, ColumnFamilyOptions>& name_to_opts,
+ std::unordered_map<int, std::string>& column_families_not_found,
+ std::unordered_map<
+ uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>& builders,
+ VersionEditParams* version_edit);
+
+ Status ExtractInfoFromVersionEdit(ColumnFamilyData* cfd,
+ const VersionEdit& from_edit,
+ VersionEditParams* version_edit_params);
+
+ std::unique_ptr<ColumnFamilySet> column_family_set_;
+
+ Env* const env_;
+ FileSystem* const fs_;
+ const std::string dbname_;
+ std::string db_id_;
+ const ImmutableDBOptions* const db_options_;
+ std::atomic<uint64_t> next_file_number_;
+  // Any log number equal to or lower than this should be ignored during
+  // recovery, and is eligible for deletion in 2PC mode. In non-2PC mode, this
+  // number is ignored.
+ std::atomic<uint64_t> min_log_number_to_keep_2pc_ = {0};
+ uint64_t manifest_file_number_;
+ uint64_t options_file_number_;
+ uint64_t pending_manifest_file_number_;
+ // The last seq visible to reads. It normally indicates the last sequence in
+ // the memtable but when using two write queues it could also indicate the
+ // last sequence in the WAL visible to reads.
+ std::atomic<uint64_t> last_sequence_;
+ // The last seq that is already allocated. It is applicable only when we have
+  // two write queues. In that case the seq might or might not have appeared in
+  // the memtable but it is expected to appear in the WAL.
+ // We have last_sequence <= last_allocated_sequence_
+ std::atomic<uint64_t> last_allocated_sequence_;
+ // The last allocated sequence that is also published to the readers. This is
+ // applicable only when last_seq_same_as_publish_seq_ is not set. Otherwise
+ // last_sequence_ also indicates the last published seq.
+ // We have last_sequence <= last_published_sequence_ <=
+ // last_allocated_sequence_
+ std::atomic<uint64_t> last_published_sequence_;
+ uint64_t prev_log_number_; // 0 or backing store for memtable being compacted
+
+ // Opened lazily
+ std::unique_ptr<log::Writer> descriptor_log_;
+
+  // generates an increasing version number for every new version
+ uint64_t current_version_number_;
+
+ // Queue of writers to the manifest file
+ std::deque<ManifestWriter*> manifest_writers_;
+
+ // Current size of manifest file
+ uint64_t manifest_file_size_;
+
+ std::vector<ObsoleteFileInfo> obsolete_files_;
+ std::vector<std::string> obsolete_manifests_;
+
+ // env options for all reads and writes except compactions
+ FileOptions file_options_;
+
+ BlockCacheTracer* const block_cache_tracer_;
+
+ private:
+ // REQUIRES db mutex at beginning. may release and re-acquire db mutex
+ Status ProcessManifestWrites(std::deque<ManifestWriter>& writers,
+ InstrumentedMutex* mu, Directory* db_directory,
+ bool new_descriptor_log,
+ const ColumnFamilyOptions* new_cf_options);
+
+ void LogAndApplyCFHelper(VersionEdit* edit);
+ Status LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b,
+ VersionEdit* edit, InstrumentedMutex* mu);
+};
+
+// ReactiveVersionSet represents a collection of versions of the column
+// families of the database. Users of ReactiveVersionSet, e.g. DBImplSecondary,
+// need to replay the MANIFEST (description log in older terms) in order to
+// reconstruct and install versions.
+class ReactiveVersionSet : public VersionSet {
+ public:
+ ReactiveVersionSet(const std::string& dbname,
+ const ImmutableDBOptions* _db_options,
+ const FileOptions& _file_options, Cache* table_cache,
+ WriteBufferManager* write_buffer_manager,
+ WriteController* write_controller);
+
+ ~ReactiveVersionSet() override;
+
+ Status ReadAndApply(
+ InstrumentedMutex* mu,
+ std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
+ std::unordered_set<ColumnFamilyData*>* cfds_changed);
+
+ Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
+ std::unique_ptr<log::Reader::Reporter>* manifest_reporter,
+ std::unique_ptr<Status>* manifest_reader_status);
+
+ uint64_t TEST_read_edits_in_atomic_group() const {
+ return read_buffer_.TEST_read_edits_in_atomic_group();
+ }
+ std::vector<VersionEdit>& replay_buffer() {
+ return read_buffer_.replay_buffer();
+ }
+
+ protected:
+ using VersionSet::ApplyOneVersionEditToBuilder;
+
+ // REQUIRES db mutex
+ Status ApplyOneVersionEditToBuilder(
+ VersionEdit& edit, std::unordered_set<ColumnFamilyData*>* cfds_changed,
+ VersionEdit* version_edit);
+
+ Status MaybeSwitchManifest(
+ log::Reader::Reporter* reporter,
+ std::unique_ptr<log::FragmentBufferedReader>* manifest_reader);
+
+ private:
+ std::unordered_map<uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>
+ active_version_builders_;
+ AtomicGroupReadBuffer read_buffer_;
+ // Number of version edits to skip by ReadAndApply at the beginning of a new
+ // MANIFEST created by primary.
+ int number_of_edits_to_skip_;
+
+ using VersionSet::LogAndApply;
+ using VersionSet::Recover;
+
+ Status LogAndApply(
+ const autovector<ColumnFamilyData*>& /*cfds*/,
+ const autovector<const MutableCFOptions*>& /*mutable_cf_options_list*/,
+ const autovector<autovector<VersionEdit*>>& /*edit_lists*/,
+ InstrumentedMutex* /*mu*/, Directory* /*db_directory*/,
+ bool /*new_descriptor_log*/,
+ const ColumnFamilyOptions* /*new_cf_option*/) override {
+ return Status::NotSupported("not supported in reactive mode");
+ }
+
+ // No copy allowed
+ ReactiveVersionSet(const ReactiveVersionSet&);
+ ReactiveVersionSet& operator=(const ReactiveVersionSet&);
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_set_test.cc b/src/rocksdb/db/version_set_test.cc
new file mode 100644
index 000000000..03e0e26d2
--- /dev/null
+++ b/src/rocksdb/db/version_set_test.cc
@@ -0,0 +1,1287 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_set.h"
+#include "db/db_impl/db_impl.h"
+#include "db/log_writer.h"
+#include "logging/logging.h"
+#include "table/mock_table.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class GenerateLevelFilesBriefTest : public testing::Test {
+ public:
+ std::vector<FileMetaData*> files_;
+ LevelFilesBrief file_level_;
+ Arena arena_;
+
+ GenerateLevelFilesBriefTest() { }
+
+ ~GenerateLevelFilesBriefTest() override {
+ for (size_t i = 0; i < files_.size(); i++) {
+ delete files_[i];
+ }
+ }
+
+ void Add(const char* smallest, const char* largest,
+ SequenceNumber smallest_seq = 100,
+ SequenceNumber largest_seq = 100) {
+ FileMetaData* f = new FileMetaData(
+ files_.size() + 1, 0, 0,
+ InternalKey(smallest, smallest_seq, kTypeValue),
+ InternalKey(largest, largest_seq, kTypeValue), smallest_seq,
+ largest_seq, /* marked_for_compact */ false, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+ files_.push_back(f);
+ }
+
+ int Compare() {
+ int diff = 0;
+ for (size_t i = 0; i < files_.size(); i++) {
+ if (file_level_.files[i].fd.GetNumber() != files_[i]->fd.GetNumber()) {
+ diff++;
+ }
+ }
+ return diff;
+ }
+};
+
+TEST_F(GenerateLevelFilesBriefTest, Empty) {
+ DoGenerateLevelFilesBrief(&file_level_, files_, &arena_);
+ ASSERT_EQ(0u, file_level_.num_files);
+ ASSERT_EQ(0, Compare());
+}
+
+TEST_F(GenerateLevelFilesBriefTest, Single) {
+ Add("p", "q");
+ DoGenerateLevelFilesBrief(&file_level_, files_, &arena_);
+ ASSERT_EQ(1u, file_level_.num_files);
+ ASSERT_EQ(0, Compare());
+}
+
+TEST_F(GenerateLevelFilesBriefTest, Multiple) {
+ Add("150", "200");
+ Add("200", "250");
+ Add("300", "350");
+ Add("400", "450");
+ DoGenerateLevelFilesBrief(&file_level_, files_, &arena_);
+ ASSERT_EQ(4u, file_level_.num_files);
+ ASSERT_EQ(0, Compare());
+}
+
+class CountingLogger : public Logger {
+ public:
+ CountingLogger() : log_count(0) {}
+ using Logger::Logv;
+ void Logv(const char* /*format*/, va_list /*ap*/) override { log_count++; }
+ int log_count;
+};
+
+Options GetOptionsWithNumLevels(int num_levels,
+ std::shared_ptr<CountingLogger> logger) {
+ Options opt;
+ opt.num_levels = num_levels;
+ opt.info_log = logger;
+ return opt;
+}
+
+class VersionStorageInfoTest : public testing::Test {
+ public:
+ const Comparator* ucmp_;
+ InternalKeyComparator icmp_;
+ std::shared_ptr<CountingLogger> logger_;
+ Options options_;
+ ImmutableCFOptions ioptions_;
+ MutableCFOptions mutable_cf_options_;
+ VersionStorageInfo vstorage_;
+
+ InternalKey GetInternalKey(const char* ukey,
+ SequenceNumber smallest_seq = 100) {
+ return InternalKey(ukey, smallest_seq, kTypeValue);
+ }
+
+ VersionStorageInfoTest()
+ : ucmp_(BytewiseComparator()),
+ icmp_(ucmp_),
+ logger_(new CountingLogger()),
+ options_(GetOptionsWithNumLevels(6, logger_)),
+ ioptions_(options_),
+ mutable_cf_options_(options_),
+ vstorage_(&icmp_, ucmp_, 6, kCompactionStyleLevel, nullptr, false) {}
+
+ ~VersionStorageInfoTest() override {
+ for (int i = 0; i < vstorage_.num_levels(); i++) {
+ for (auto* f : vstorage_.LevelFiles(i)) {
+ if (--f->refs == 0) {
+ delete f;
+ }
+ }
+ }
+ }
+
+ void Add(int level, uint32_t file_number, const char* smallest,
+ const char* largest, uint64_t file_size = 0) {
+ assert(level < vstorage_.num_levels());
+ FileMetaData* f = new FileMetaData(
+ file_number, 0, file_size, GetInternalKey(smallest, 0),
+ GetInternalKey(largest, 0), /* smallest_seq */ 0, /* largest_seq */ 0,
+ /* marked_for_compact */ false, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+ f->compensated_file_size = file_size;
+ vstorage_.AddFile(level, f);
+ }
+
+ void Add(int level, uint32_t file_number, const InternalKey& smallest,
+ const InternalKey& largest, uint64_t file_size = 0) {
+ assert(level < vstorage_.num_levels());
+ FileMetaData* f = new FileMetaData(
+ file_number, 0, file_size, smallest, largest, /* smallest_seq */ 0,
+ /* largest_seq */ 0, /* marked_for_compact */ false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ f->compensated_file_size = file_size;
+ vstorage_.AddFile(level, f);
+ }
+
+ std::string GetOverlappingFiles(int level, const InternalKey& begin,
+ const InternalKey& end) {
+ std::vector<FileMetaData*> inputs;
+ vstorage_.GetOverlappingInputs(level, &begin, &end, &inputs);
+
+ std::string result;
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ if (i > 0) {
+ result += ",";
+ }
+ AppendNumberTo(&result, inputs[i]->fd.GetNumber());
+ }
+ return result;
+ }
+};
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelStatic) {
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ mutable_cf_options_.max_bytes_for_level_base = 10;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+ Add(4, 100U, "1", "2");
+ Add(5, 101U, "1", "2");
+
+ vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(1), 10U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 50U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 250U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1250U);
+
+ ASSERT_EQ(0, logger_->log_count);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+ Add(5, 1U, "1", "2", 500U);
+
+ vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ ASSERT_EQ(0, logger_->log_count);
+ ASSERT_EQ(vstorage_.base_level(), 5);
+
+ Add(5, 2U, "3", "4", 550U);
+ vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ ASSERT_EQ(0, logger_->log_count);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1000U);
+ ASSERT_EQ(vstorage_.base_level(), 4);
+
+ Add(4, 3U, "3", "4", 550U);
+ vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ ASSERT_EQ(0, logger_->log_count);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1000U);
+ ASSERT_EQ(vstorage_.base_level(), 4);
+
+ Add(3, 4U, "3", "4", 250U);
+ Add(3, 5U, "5", "7", 300U);
+ vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ ASSERT_EQ(1, logger_->log_count);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1005U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 1000U);
+ ASSERT_EQ(vstorage_.base_level(), 3);
+
+ Add(1, 6U, "3", "4", 5U);
+ Add(1, 7U, "8", "9", 5U);
+ logger_->log_count = 0;
+ vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ ASSERT_EQ(1, logger_->log_count);
+ ASSERT_GT(vstorage_.MaxBytesForLevel(4), 1005U);
+ ASSERT_GT(vstorage_.MaxBytesForLevel(3), 1005U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 1005U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(1), 1000U);
+ ASSERT_EQ(vstorage_.base_level(), 1);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicLotsOfData) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 100;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 2;
+ Add(0, 1U, "1", "2", 50U);
+ Add(1, 2U, "1", "2", 50U);
+ Add(2, 3U, "1", "2", 500U);
+ Add(3, 4U, "1", "2", 500U);
+ Add(4, 5U, "1", "2", 1700U);
+ Add(5, 6U, "1", "2", 500U);
+
+ vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 800U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 400U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 200U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(1), 100U);
+ ASSERT_EQ(vstorage_.base_level(), 1);
+ ASSERT_EQ(0, logger_->log_count);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicLargeLevel) {
+ uint64_t kOneGB = 1000U * 1000U * 1000U;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 10U * kOneGB;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ Add(0, 1U, "1", "2", 50U);
+ Add(3, 4U, "1", "2", 32U * kOneGB);
+ Add(4, 5U, "1", "2", 500U * kOneGB);
+ Add(5, 6U, "1", "2", 3000U * kOneGB);
+
+ vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(5), 3000U * kOneGB);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 300U * kOneGB);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 30U * kOneGB);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 10U * kOneGB);
+ ASSERT_EQ(vstorage_.base_level(), 2);
+ ASSERT_EQ(0, logger_->log_count);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_1) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 40000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+
+ Add(0, 1U, "1", "2", 10000U);
+ Add(0, 2U, "1", "2", 10000U);
+ Add(0, 3U, "1", "2", 10000U);
+
+ Add(5, 4U, "1", "2", 1286250U);
+ Add(4, 5U, "1", "2", 200000U);
+ Add(3, 6U, "1", "2", 40000U);
+ Add(2, 7U, "1", "2", 8000U);
+
+ vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ ASSERT_EQ(0, logger_->log_count);
+ ASSERT_EQ(2, vstorage_.base_level());
+  // The level multiplier is expected to stay at 5.0 here.
+ ASSERT_EQ(vstorage_.level_multiplier(), 5.0);
+  // Level sizes should be 40,000, 51,450 and 257,250.
+ ASSERT_EQ(40000U, vstorage_.MaxBytesForLevel(2));
+ ASSERT_EQ(51450U, vstorage_.MaxBytesForLevel(3));
+ ASSERT_EQ(257250U, vstorage_.MaxBytesForLevel(4));
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_2) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 10000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+
+ Add(0, 11U, "1", "2", 10000U);
+ Add(0, 12U, "1", "2", 10000U);
+ Add(0, 13U, "1", "2", 10000U);
+
+ Add(5, 4U, "1", "2", 1286250U);
+ Add(4, 5U, "1", "2", 200000U);
+ Add(3, 6U, "1", "2", 40000U);
+ Add(2, 7U, "1", "2", 8000U);
+
+ vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ ASSERT_EQ(0, logger_->log_count);
+ ASSERT_EQ(2, vstorage_.base_level());
+ // level multiplier should be 3.5
+ ASSERT_LT(vstorage_.level_multiplier(), 3.6);
+ ASSERT_GT(vstorage_.level_multiplier(), 3.4);
+ // Level size should be around 30,000, 105,000, 367,500
+ ASSERT_EQ(30000U, vstorage_.MaxBytesForLevel(2));
+ ASSERT_LT(vstorage_.MaxBytesForLevel(3), 110000U);
+ ASSERT_GT(vstorage_.MaxBytesForLevel(3), 100000U);
+ ASSERT_LT(vstorage_.MaxBytesForLevel(4), 370000U);
+ ASSERT_GT(vstorage_.MaxBytesForLevel(4), 360000U);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_3) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 10000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+
+ Add(0, 11U, "1", "2", 5000U);
+ Add(0, 12U, "1", "2", 5000U);
+ Add(0, 13U, "1", "2", 5000U);
+ Add(0, 14U, "1", "2", 5000U);
+ Add(0, 15U, "1", "2", 5000U);
+ Add(0, 16U, "1", "2", 5000U);
+
+ Add(5, 4U, "1", "2", 1286250U);
+ Add(4, 5U, "1", "2", 200000U);
+ Add(3, 6U, "1", "2", 40000U);
+ Add(2, 7U, "1", "2", 8000U);
+
+ vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ ASSERT_EQ(0, logger_->log_count);
+ ASSERT_EQ(2, vstorage_.base_level());
+ // level multiplier should be 3.5
+ ASSERT_LT(vstorage_.level_multiplier(), 3.6);
+ ASSERT_GT(vstorage_.level_multiplier(), 3.4);
+ // Level size should be around 30,000, 105,000, 367,500
+ ASSERT_EQ(30000U, vstorage_.MaxBytesForLevel(2));
+ ASSERT_LT(vstorage_.MaxBytesForLevel(3), 110000U);
+ ASSERT_GT(vstorage_.MaxBytesForLevel(3), 100000U);
+ ASSERT_LT(vstorage_.MaxBytesForLevel(4), 370000U);
+ ASSERT_GT(vstorage_.MaxBytesForLevel(4), 360000U);
+}
+
+TEST_F(VersionStorageInfoTest, EstimateLiveDataSize) {
+ // Test whether the overlaps are detected as expected
+ Add(1, 1U, "4", "7", 1U); // Perfect overlap with last level
+ Add(2, 2U, "3", "5", 1U); // Partial overlap with last level
+ Add(2, 3U, "6", "8", 1U); // Partial overlap with last level
+ Add(3, 4U, "1", "9", 1U); // Contains range of last level
+ Add(4, 5U, "4", "5", 1U); // Inside range of last level
+ Add(4, 5U, "6", "7", 1U); // Inside range of last level
+ Add(5, 6U, "4", "7", 10U);
+ ASSERT_EQ(10U, vstorage_.EstimateLiveDataSize());
+}
+
+TEST_F(VersionStorageInfoTest, EstimateLiveDataSize2) {
+ Add(0, 1U, "9", "9", 1U); // Level 0 is not ordered
+ Add(0, 1U, "5", "6", 1U); // Ignored because of [5,6] in l1
+ Add(1, 1U, "1", "2", 1U); // Ignored because of [2,3] in l2
+ Add(1, 2U, "3", "4", 1U); // Ignored because of [2,3] in l2
+ Add(1, 3U, "5", "6", 1U);
+ Add(2, 4U, "2", "3", 1U);
+ Add(3, 5U, "7", "8", 1U);
+ ASSERT_EQ(4U, vstorage_.EstimateLiveDataSize());
+}
+
+TEST_F(VersionStorageInfoTest, GetOverlappingInputs) {
+ // Two files that overlap at the range deletion tombstone sentinel.
+ Add(1, 1U, {"a", 0, kTypeValue}, {"b", kMaxSequenceNumber, kTypeRangeDeletion}, 1);
+ Add(1, 2U, {"b", 0, kTypeValue}, {"c", 0, kTypeValue}, 1);
+ // Two files that overlap at the same user key.
+ Add(1, 3U, {"d", 0, kTypeValue}, {"e", kMaxSequenceNumber, kTypeValue}, 1);
+ Add(1, 4U, {"e", 0, kTypeValue}, {"f", 0, kTypeValue}, 1);
+ // Two files that do not overlap.
+ Add(1, 5U, {"g", 0, kTypeValue}, {"h", 0, kTypeValue}, 1);
+ Add(1, 6U, {"i", 0, kTypeValue}, {"j", 0, kTypeValue}, 1);
+ vstorage_.UpdateNumNonEmptyLevels();
+ vstorage_.GenerateLevelFilesBrief();
+
+ ASSERT_EQ("1,2", GetOverlappingFiles(
+ 1, {"a", 0, kTypeValue}, {"b", 0, kTypeValue}));
+ ASSERT_EQ("1", GetOverlappingFiles(
+ 1, {"a", 0, kTypeValue}, {"b", kMaxSequenceNumber, kTypeRangeDeletion}));
+ ASSERT_EQ("2", GetOverlappingFiles(
+ 1, {"b", kMaxSequenceNumber, kTypeValue}, {"c", 0, kTypeValue}));
+ ASSERT_EQ("3,4", GetOverlappingFiles(
+ 1, {"d", 0, kTypeValue}, {"e", 0, kTypeValue}));
+ ASSERT_EQ("3", GetOverlappingFiles(
+ 1, {"d", 0, kTypeValue}, {"e", kMaxSequenceNumber, kTypeRangeDeletion}));
+ ASSERT_EQ("3,4", GetOverlappingFiles(
+ 1, {"e", kMaxSequenceNumber, kTypeValue}, {"f", 0, kTypeValue}));
+ ASSERT_EQ("3,4", GetOverlappingFiles(
+ 1, {"e", 0, kTypeValue}, {"f", 0, kTypeValue}));
+ ASSERT_EQ("5", GetOverlappingFiles(
+ 1, {"g", 0, kTypeValue}, {"h", 0, kTypeValue}));
+ ASSERT_EQ("6", GetOverlappingFiles(
+ 1, {"i", 0, kTypeValue}, {"j", 0, kTypeValue}));
+}
+
+class FindLevelFileTest : public testing::Test {
+ public:
+ LevelFilesBrief file_level_;
+ bool disjoint_sorted_files_;
+ Arena arena_;
+
+ FindLevelFileTest() : disjoint_sorted_files_(true) { }
+
+ ~FindLevelFileTest() override {}
+
+ void LevelFileInit(size_t num = 0) {
+ char* mem = arena_.AllocateAligned(num * sizeof(FdWithKeyRange));
+ file_level_.files = new (mem)FdWithKeyRange[num];
+ file_level_.num_files = 0;
+ }
+
+ void Add(const char* smallest, const char* largest,
+ SequenceNumber smallest_seq = 100,
+ SequenceNumber largest_seq = 100) {
+ InternalKey smallest_key = InternalKey(smallest, smallest_seq, kTypeValue);
+ InternalKey largest_key = InternalKey(largest, largest_seq, kTypeValue);
+
+ Slice smallest_slice = smallest_key.Encode();
+ Slice largest_slice = largest_key.Encode();
+
+ char* mem = arena_.AllocateAligned(
+ smallest_slice.size() + largest_slice.size());
+ memcpy(mem, smallest_slice.data(), smallest_slice.size());
+ memcpy(mem + smallest_slice.size(), largest_slice.data(),
+ largest_slice.size());
+
+ // add to file_level_
+ size_t num = file_level_.num_files;
+ auto& file = file_level_.files[num];
+ file.fd = FileDescriptor(num + 1, 0, 0);
+ file.smallest_key = Slice(mem, smallest_slice.size());
+ file.largest_key = Slice(mem + smallest_slice.size(),
+ largest_slice.size());
+ file_level_.num_files++;
+ }
+
+ int Find(const char* key) {
+ InternalKey target(key, 100, kTypeValue);
+ InternalKeyComparator cmp(BytewiseComparator());
+ return FindFile(cmp, file_level_, target.Encode());
+ }
+
+ bool Overlaps(const char* smallest, const char* largest) {
+ InternalKeyComparator cmp(BytewiseComparator());
+ Slice s(smallest != nullptr ? smallest : "");
+ Slice l(largest != nullptr ? largest : "");
+ return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, file_level_,
+ (smallest != nullptr ? &s : nullptr),
+ (largest != nullptr ? &l : nullptr));
+ }
+};
+
+TEST_F(FindLevelFileTest, LevelEmpty) {
+ LevelFileInit(0);
+
+ ASSERT_EQ(0, Find("foo"));
+ ASSERT_TRUE(! Overlaps("a", "z"));
+ ASSERT_TRUE(! Overlaps(nullptr, "z"));
+ ASSERT_TRUE(! Overlaps("a", nullptr));
+ ASSERT_TRUE(! Overlaps(nullptr, nullptr));
+}
+
+TEST_F(FindLevelFileTest, LevelSingle) {
+ LevelFileInit(1);
+
+ Add("p", "q");
+ ASSERT_EQ(0, Find("a"));
+ ASSERT_EQ(0, Find("p"));
+ ASSERT_EQ(0, Find("p1"));
+ ASSERT_EQ(0, Find("q"));
+ ASSERT_EQ(1, Find("q1"));
+ ASSERT_EQ(1, Find("z"));
+
+ ASSERT_TRUE(! Overlaps("a", "b"));
+ ASSERT_TRUE(! Overlaps("z1", "z2"));
+ ASSERT_TRUE(Overlaps("a", "p"));
+ ASSERT_TRUE(Overlaps("a", "q"));
+ ASSERT_TRUE(Overlaps("a", "z"));
+ ASSERT_TRUE(Overlaps("p", "p1"));
+ ASSERT_TRUE(Overlaps("p", "q"));
+ ASSERT_TRUE(Overlaps("p", "z"));
+ ASSERT_TRUE(Overlaps("p1", "p2"));
+ ASSERT_TRUE(Overlaps("p1", "z"));
+ ASSERT_TRUE(Overlaps("q", "q"));
+ ASSERT_TRUE(Overlaps("q", "q1"));
+
+ ASSERT_TRUE(! Overlaps(nullptr, "j"));
+ ASSERT_TRUE(! Overlaps("r", nullptr));
+ ASSERT_TRUE(Overlaps(nullptr, "p"));
+ ASSERT_TRUE(Overlaps(nullptr, "p1"));
+ ASSERT_TRUE(Overlaps("q", nullptr));
+ ASSERT_TRUE(Overlaps(nullptr, nullptr));
+}
+
+TEST_F(FindLevelFileTest, LevelMultiple) {
+ LevelFileInit(4);
+
+ Add("150", "200");
+ Add("200", "250");
+ Add("300", "350");
+ Add("400", "450");
+ ASSERT_EQ(0, Find("100"));
+ ASSERT_EQ(0, Find("150"));
+ ASSERT_EQ(0, Find("151"));
+ ASSERT_EQ(0, Find("199"));
+ ASSERT_EQ(0, Find("200"));
+ ASSERT_EQ(1, Find("201"));
+ ASSERT_EQ(1, Find("249"));
+ ASSERT_EQ(1, Find("250"));
+ ASSERT_EQ(2, Find("251"));
+ ASSERT_EQ(2, Find("299"));
+ ASSERT_EQ(2, Find("300"));
+ ASSERT_EQ(2, Find("349"));
+ ASSERT_EQ(2, Find("350"));
+ ASSERT_EQ(3, Find("351"));
+ ASSERT_EQ(3, Find("400"));
+ ASSERT_EQ(3, Find("450"));
+ ASSERT_EQ(4, Find("451"));
+
+ ASSERT_TRUE(! Overlaps("100", "149"));
+ ASSERT_TRUE(! Overlaps("251", "299"));
+ ASSERT_TRUE(! Overlaps("451", "500"));
+ ASSERT_TRUE(! Overlaps("351", "399"));
+
+ ASSERT_TRUE(Overlaps("100", "150"));
+ ASSERT_TRUE(Overlaps("100", "200"));
+ ASSERT_TRUE(Overlaps("100", "300"));
+ ASSERT_TRUE(Overlaps("100", "400"));
+ ASSERT_TRUE(Overlaps("100", "500"));
+ ASSERT_TRUE(Overlaps("375", "400"));
+ ASSERT_TRUE(Overlaps("450", "450"));
+ ASSERT_TRUE(Overlaps("450", "500"));
+}
+
+TEST_F(FindLevelFileTest, LevelMultipleNullBoundaries) {
+ LevelFileInit(4);
+
+ Add("150", "200");
+ Add("200", "250");
+ Add("300", "350");
+ Add("400", "450");
+ ASSERT_TRUE(! Overlaps(nullptr, "149"));
+ ASSERT_TRUE(! Overlaps("451", nullptr));
+ ASSERT_TRUE(Overlaps(nullptr, nullptr));
+ ASSERT_TRUE(Overlaps(nullptr, "150"));
+ ASSERT_TRUE(Overlaps(nullptr, "199"));
+ ASSERT_TRUE(Overlaps(nullptr, "200"));
+ ASSERT_TRUE(Overlaps(nullptr, "201"));
+ ASSERT_TRUE(Overlaps(nullptr, "400"));
+ ASSERT_TRUE(Overlaps(nullptr, "800"));
+ ASSERT_TRUE(Overlaps("100", nullptr));
+ ASSERT_TRUE(Overlaps("200", nullptr));
+ ASSERT_TRUE(Overlaps("449", nullptr));
+ ASSERT_TRUE(Overlaps("450", nullptr));
+}
+
+TEST_F(FindLevelFileTest, LevelOverlapSequenceChecks) {
+ LevelFileInit(1);
+
+ Add("200", "200", 5000, 3000);
+ ASSERT_TRUE(! Overlaps("199", "199"));
+ ASSERT_TRUE(! Overlaps("201", "300"));
+ ASSERT_TRUE(Overlaps("200", "200"));
+ ASSERT_TRUE(Overlaps("190", "200"));
+ ASSERT_TRUE(Overlaps("200", "210"));
+}
+
+TEST_F(FindLevelFileTest, LevelOverlappingFiles) {
+ LevelFileInit(2);
+
+ Add("150", "600");
+ Add("400", "500");
+ disjoint_sorted_files_ = false;
+ ASSERT_TRUE(! Overlaps("100", "149"));
+ ASSERT_TRUE(! Overlaps("601", "700"));
+ ASSERT_TRUE(Overlaps("100", "150"));
+ ASSERT_TRUE(Overlaps("100", "200"));
+ ASSERT_TRUE(Overlaps("100", "300"));
+ ASSERT_TRUE(Overlaps("100", "400"));
+ ASSERT_TRUE(Overlaps("100", "500"));
+ ASSERT_TRUE(Overlaps("375", "400"));
+ ASSERT_TRUE(Overlaps("450", "450"));
+ ASSERT_TRUE(Overlaps("450", "500"));
+ ASSERT_TRUE(Overlaps("450", "700"));
+ ASSERT_TRUE(Overlaps("600", "700"));
+}
+
+class VersionSetTestBase {
+ public:
+ const static std::string kColumnFamilyName1;
+ const static std::string kColumnFamilyName2;
+ const static std::string kColumnFamilyName3;
+ int num_initial_edits_;
+
+ VersionSetTestBase()
+ : env_(Env::Default()),
+ fs_(std::make_shared<LegacyFileSystemWrapper>(env_)),
+ dbname_(test::PerThreadDBPath("version_set_test")),
+ db_options_(),
+ mutable_cf_options_(cf_options_),
+ table_cache_(NewLRUCache(50000, 16)),
+ write_buffer_manager_(db_options_.db_write_buffer_size),
+ shutting_down_(false),
+ mock_table_factory_(std::make_shared<mock::MockTableFactory>()) {
+ EXPECT_OK(env_->CreateDirIfMissing(dbname_));
+
+ db_options_.env = env_;
+ db_options_.fs = fs_;
+ versions_.reset(new VersionSet(dbname_, &db_options_, env_options_,
+ table_cache_.get(), &write_buffer_manager_,
+ &write_controller_,
+                                   /*block_cache_tracer=*/nullptr));
+ reactive_versions_ = std::make_shared<ReactiveVersionSet>(
+ dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_);
+ db_options_.db_paths.emplace_back(dbname_,
+ std::numeric_limits<uint64_t>::max());
+ }
+
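+  // Writes MANIFEST-000001 containing a new-DB edit plus one edit per extra
+  // column family, and returns the log writer so tests can append more edits.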
+ void PrepareManifest(std::vector<ColumnFamilyDescriptor>* column_families,
+ SequenceNumber* last_seqno,
+ std::unique_ptr<log::Writer>* log_writer) {
+ assert(column_families != nullptr);
+ assert(last_seqno != nullptr);
+ assert(log_writer != nullptr);
+ VersionEdit new_db;
+ if (db_options_.write_dbid_to_manifest) {
+ DBImpl* impl = new DBImpl(DBOptions(), dbname_);
+ std::string db_id;
+ impl->GetDbIdentityFromIdentityFile(&db_id);
+ new_db.SetDBId(db_id);
+ }
+ new_db.SetLogNumber(0);
+ new_db.SetNextFile(2);
+ new_db.SetLastSequence(0);
+
+ const std::vector<std::string> cf_names = {
+ kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2,
+ kColumnFamilyName3};
+ const int kInitialNumOfCfs = static_cast<int>(cf_names.size());
+ autovector<VersionEdit> new_cfs;
+ uint64_t last_seq = 1;
+ uint32_t cf_id = 1;
+ for (int i = 1; i != kInitialNumOfCfs; ++i) {
+ VersionEdit new_cf;
+ new_cf.AddColumnFamily(cf_names[i]);
+ new_cf.SetColumnFamily(cf_id++);
+ new_cf.SetLogNumber(0);
+ new_cf.SetNextFile(2);
+ new_cf.SetLastSequence(last_seq++);
+ new_cfs.emplace_back(new_cf);
+ }
+ *last_seqno = last_seq;
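+    // The initial MANIFEST holds one new-DB edit plus one edit per extra CF.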
+ num_initial_edits_ = static_cast<int>(new_cfs.size() + 1);
+ const std::string manifest = DescriptorFileName(dbname_, 1);
+ std::unique_ptr<WritableFile> file;
+ Status s = env_->NewWritableFile(
+ manifest, &file, env_->OptimizeForManifestWrite(env_options_));
+ ASSERT_OK(s);
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ NewLegacyWritableFileWrapper(std::move(file)), manifest, env_options_));
+ {
+ log_writer->reset(new log::Writer(std::move(file_writer), 0, false));
+ std::string record;
+ new_db.EncodeTo(&record);
+ s = (*log_writer)->AddRecord(record);
+ for (const auto& e : new_cfs) {
+ record.clear();
+ e.EncodeTo(&record);
+ s = (*log_writer)->AddRecord(record);
+ ASSERT_OK(s);
+ }
+ }
+ ASSERT_OK(s);
+
+ cf_options_.table_factory = mock_table_factory_;
+ for (const auto& cf_name : cf_names) {
+ column_families->emplace_back(cf_name, cf_options_);
+ }
+ }
+
+ // Create DB with 3 column families.
+ void NewDB() {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ SequenceNumber last_seqno;
+ std::unique_ptr<log::Writer> log_writer;
+ SetIdentityFile(env_, dbname_);
+ PrepareManifest(&column_families, &last_seqno, &log_writer);
+ log_writer.reset();
+ // Make "CURRENT" file point to the new manifest file.
+ Status s = SetCurrentFile(env_, dbname_, 1, nullptr);
+ ASSERT_OK(s);
+
+ EXPECT_OK(versions_->Recover(column_families, false));
+ EXPECT_EQ(column_families.size(),
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ }
+
+ Env* env_;
+ std::shared_ptr<FileSystem> fs_;
+ const std::string dbname_;
+ EnvOptions env_options_;
+ ImmutableDBOptions db_options_;
+ ColumnFamilyOptions cf_options_;
+ MutableCFOptions mutable_cf_options_;
+ std::shared_ptr<Cache> table_cache_;
+ WriteController write_controller_;
+ WriteBufferManager write_buffer_manager_;
+ std::shared_ptr<VersionSet> versions_;
+ std::shared_ptr<ReactiveVersionSet> reactive_versions_;
+ InstrumentedMutex mutex_;
+ std::atomic<bool> shutting_down_;
+ std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
+};
+
+const std::string VersionSetTestBase::kColumnFamilyName1 = "alice";
+const std::string VersionSetTestBase::kColumnFamilyName2 = "bob";
+const std::string VersionSetTestBase::kColumnFamilyName3 = "charles";
+
+class VersionSetTest : public VersionSetTestBase, public testing::Test {
+ public:
+ VersionSetTest() : VersionSetTestBase() {}
+};
+
+TEST_F(VersionSetTest, SameColumnFamilyGroupCommit) {
+ NewDB();
+ const int kGroupSize = 5;
+ autovector<VersionEdit> edits;
+ for (int i = 0; i != kGroupSize; ++i) {
+ edits.emplace_back(VersionEdit());
+ }
+ autovector<ColumnFamilyData*> cfds;
+ autovector<const MutableCFOptions*> all_mutable_cf_options;
+ autovector<autovector<VersionEdit*>> edit_lists;
+ for (int i = 0; i != kGroupSize; ++i) {
+ cfds.emplace_back(versions_->GetColumnFamilySet()->GetDefault());
+ all_mutable_cf_options.emplace_back(&mutable_cf_options_);
+ autovector<VersionEdit*> edit_list;
+ edit_list.emplace_back(&edits[i]);
+ edit_lists.emplace_back(edit_list);
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ int count = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::ProcessManifestWrites:SameColumnFamily", [&](void* arg) {
+ uint32_t* cf_id = reinterpret_cast<uint32_t*>(arg);
+ EXPECT_EQ(0u, *cf_id);
+ ++count;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ mutex_.Lock();
+ Status s =
+ versions_->LogAndApply(cfds, all_mutable_cf_options, edit_lists, &mutex_);
+ mutex_.Unlock();
+ EXPECT_OK(s);
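+  // The callback is expected to fire once for each of the kGroupSize - 1
+  // follower edits that target the same default column family.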
+ EXPECT_EQ(kGroupSize - 1, count);
+}
+
+class VersionSetAtomicGroupTest : public VersionSetTestBase,
+ public testing::Test {
+ public:
+ VersionSetAtomicGroupTest() : VersionSetTestBase() {}
+
+ void SetUp() override {
+ PrepareManifest(&column_families_, &last_seqno_, &log_writer_);
+ SetupTestSyncPoints();
+ }
+
+ void SetupValidAtomicGroup(int atomic_group_size) {
+ edits_.resize(atomic_group_size);
+ int remaining = atomic_group_size;
+ for (size_t i = 0; i != edits_.size(); ++i) {
+ edits_[i].SetLogNumber(0);
+ edits_[i].SetNextFile(2);
+ edits_[i].MarkAtomicGroup(--remaining);
+ edits_[i].SetLastSequence(last_seqno_++);
+ }
+ ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr));
+ }
+
+ void SetupIncompleteTrailingAtomicGroup(int atomic_group_size) {
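+    // Same edits as SetupValidAtomicGroup(); the group only becomes
+    // "incomplete" because callers persist fewer than atomic_group_size edits.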
+ edits_.resize(atomic_group_size);
+ int remaining = atomic_group_size;
+ for (size_t i = 0; i != edits_.size(); ++i) {
+ edits_[i].SetLogNumber(0);
+ edits_[i].SetNextFile(2);
+ edits_[i].MarkAtomicGroup(--remaining);
+ edits_[i].SetLastSequence(last_seqno_++);
+ }
+ ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr));
+ }
+
+ void SetupCorruptedAtomicGroup(int atomic_group_size) {
+ edits_.resize(atomic_group_size);
+ int remaining = atomic_group_size;
+ for (size_t i = 0; i != edits_.size(); ++i) {
+ edits_[i].SetLogNumber(0);
+ edits_[i].SetNextFile(2);
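+      // Leave the middle edit unmarked so a plain edit appears inside the
+      // atomic group, which recovery must report as corruption.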
+ if (i != ((size_t)atomic_group_size / 2)) {
+ edits_[i].MarkAtomicGroup(--remaining);
+ }
+ edits_[i].SetLastSequence(last_seqno_++);
+ }
+ ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr));
+ }
+
+ void SetupIncorrectAtomicGroup(int atomic_group_size) {
+ edits_.resize(atomic_group_size);
+ int remaining = atomic_group_size;
+ for (size_t i = 0; i != edits_.size(); ++i) {
+ edits_[i].SetLogNumber(0);
+ edits_[i].SetNextFile(2);
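+      // Record a wrong remaining-count for the second edit so recovery
+      // detects an inconsistent atomic group size.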
+ if (i != 1) {
+ edits_[i].MarkAtomicGroup(--remaining);
+ } else {
+ edits_[i].MarkAtomicGroup(remaining--);
+ }
+ edits_[i].SetLastSequence(last_seqno_++);
+ }
+ ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr));
+ }
+
+ void SetupTestSyncPoints() {
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "AtomicGroupReadBuffer::AddEdit:FirstInAtomicGroup", [&](void* arg) {
+ VersionEdit* e = reinterpret_cast<VersionEdit*>(arg);
+ EXPECT_EQ(edits_.front().DebugString(),
+ e->DebugString()); // compare based on value
+ first_in_atomic_group_ = true;
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "AtomicGroupReadBuffer::AddEdit:LastInAtomicGroup", [&](void* arg) {
+ VersionEdit* e = reinterpret_cast<VersionEdit*>(arg);
+ EXPECT_EQ(edits_.back().DebugString(),
+ e->DebugString()); // compare based on value
+ EXPECT_TRUE(first_in_atomic_group_);
+ last_in_atomic_group_ = true;
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::ReadAndRecover:RecoveredEdits", [&](void* arg) {
+ num_recovered_edits_ = *reinterpret_cast<int*>(arg);
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "ReactiveVersionSet::ReadAndApply:AppliedEdits",
+ [&](void* arg) { num_applied_edits_ = *reinterpret_cast<int*>(arg); });
+ SyncPoint::GetInstance()->SetCallBack(
+ "AtomicGroupReadBuffer::AddEdit:AtomicGroup",
+ [&](void* /* arg */) { ++num_edits_in_atomic_group_; });
+ SyncPoint::GetInstance()->SetCallBack(
+ "AtomicGroupReadBuffer::AddEdit:AtomicGroupMixedWithNormalEdits",
+ [&](void* arg) {
+ corrupted_edit_ = *reinterpret_cast<VersionEdit*>(arg);
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "AtomicGroupReadBuffer::AddEdit:IncorrectAtomicGroupSize",
+ [&](void* arg) {
+ edit_with_incorrect_group_size_ =
+ *reinterpret_cast<VersionEdit*>(arg);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ }
+
+ void AddNewEditsToLog(int num_edits) {
+ for (int i = 0; i < num_edits; i++) {
+ std::string record;
+ edits_[i].EncodeTo(&record);
+ ASSERT_OK(log_writer_->AddRecord(record));
+ }
+ }
+
+ void TearDown() override {
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ log_writer_.reset();
+ }
+
+ protected:
+ std::vector<ColumnFamilyDescriptor> column_families_;
+ SequenceNumber last_seqno_;
+ std::vector<VersionEdit> edits_;
+ bool first_in_atomic_group_ = false;
+ bool last_in_atomic_group_ = false;
+ int num_edits_in_atomic_group_ = 0;
+ int num_recovered_edits_ = 0;
+ int num_applied_edits_ = 0;
+ VersionEdit corrupted_edit_;
+ VersionEdit edit_with_incorrect_group_size_;
+ std::unique_ptr<log::Writer> log_writer_;
+};
+
+TEST_F(VersionSetAtomicGroupTest, HandleValidAtomicGroupWithVersionSetRecover) {
+ const int kAtomicGroupSize = 3;
+ SetupValidAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kAtomicGroupSize);
+ EXPECT_OK(versions_->Recover(column_families_, false));
+ EXPECT_EQ(column_families_.size(),
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_TRUE(first_in_atomic_group_);
+ EXPECT_TRUE(last_in_atomic_group_);
+ EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_);
+ EXPECT_EQ(0, num_applied_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleValidAtomicGroupWithReactiveVersionSetRecover) {
+ const int kAtomicGroupSize = 3;
+ SetupValidAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kAtomicGroupSize);
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ EXPECT_EQ(column_families_.size(),
+ reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_TRUE(first_in_atomic_group_);
+ EXPECT_TRUE(last_in_atomic_group_);
+  // Recover() should leave the replay buffer empty.
+ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0);
+ EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0);
+ EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_);
+ EXPECT_EQ(0, num_applied_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleValidAtomicGroupWithReactiveVersionSetReadAndApply) {
+ const int kAtomicGroupSize = 3;
+ SetupValidAtomicGroup(kAtomicGroupSize);
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ AddNewEditsToLog(kAtomicGroupSize);
+ InstrumentedMutex mu;
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ mu.Lock();
+ EXPECT_OK(
+ reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed));
+ mu.Unlock();
+ EXPECT_TRUE(first_in_atomic_group_);
+ EXPECT_TRUE(last_in_atomic_group_);
+  // Applying the complete atomic group should leave the replay buffer empty.
+ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0);
+ EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0);
+ EXPECT_EQ(num_initial_edits_, num_recovered_edits_);
+ EXPECT_EQ(kAtomicGroupSize, num_applied_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleIncompleteTrailingAtomicGroupWithVersionSetRecover) {
+ const int kAtomicGroupSize = 4;
+ const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1;
+ SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kNumberOfPersistedVersionEdits);
+ EXPECT_OK(versions_->Recover(column_families_, false));
+ EXPECT_EQ(column_families_.size(),
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_TRUE(first_in_atomic_group_);
+ EXPECT_FALSE(last_in_atomic_group_);
+ EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_);
+ EXPECT_EQ(num_initial_edits_, num_recovered_edits_);
+ EXPECT_EQ(0, num_applied_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleIncompleteTrailingAtomicGroupWithReactiveVersionSetRecover) {
+ const int kAtomicGroupSize = 4;
+ const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1;
+ SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kNumberOfPersistedVersionEdits);
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ EXPECT_EQ(column_families_.size(),
+ reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_TRUE(first_in_atomic_group_);
+ EXPECT_FALSE(last_in_atomic_group_);
+ EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_);
+ // Reactive version set should store the edits in the replay buffer.
+ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() ==
+ kNumberOfPersistedVersionEdits);
+ EXPECT_TRUE(reactive_versions_->replay_buffer().size() == kAtomicGroupSize);
+ // Write the last record. The reactive version set should now apply all
+ // edits.
+ std::string last_record;
+ edits_[kAtomicGroupSize - 1].EncodeTo(&last_record);
+ EXPECT_OK(log_writer_->AddRecord(last_record));
+ InstrumentedMutex mu;
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ mu.Lock();
+ EXPECT_OK(
+ reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed));
+ mu.Unlock();
+  // The replay buffer should be empty now.
+ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0);
+ EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0);
+ EXPECT_EQ(num_initial_edits_, num_recovered_edits_);
+ EXPECT_EQ(kAtomicGroupSize, num_applied_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleIncompleteTrailingAtomicGroupWithReactiveVersionSetReadAndApply) {
+ const int kAtomicGroupSize = 4;
+ const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1;
+ SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize);
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+  // At this point, no atomic-group edits have been written yet.
+ EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ EXPECT_EQ(column_families_.size(),
+ reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ // Write a few edits in an atomic group.
+ AddNewEditsToLog(kNumberOfPersistedVersionEdits);
+ InstrumentedMutex mu;
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ mu.Lock();
+ EXPECT_OK(
+ reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed));
+ mu.Unlock();
+ EXPECT_TRUE(first_in_atomic_group_);
+ EXPECT_FALSE(last_in_atomic_group_);
+ EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_);
+ // Reactive version set should store the edits in the replay buffer.
+ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() ==
+ kNumberOfPersistedVersionEdits);
+ EXPECT_TRUE(reactive_versions_->replay_buffer().size() == kAtomicGroupSize);
+ EXPECT_EQ(num_initial_edits_, num_recovered_edits_);
+ EXPECT_EQ(0, num_applied_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleCorruptedAtomicGroupWithVersionSetRecover) {
+ const int kAtomicGroupSize = 4;
+ SetupCorruptedAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kAtomicGroupSize);
+ EXPECT_NOK(versions_->Recover(column_families_, false));
+ EXPECT_EQ(column_families_.size(),
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(),
+ corrupted_edit_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleCorruptedAtomicGroupWithReactiveVersionSetRecover) {
+ const int kAtomicGroupSize = 4;
+ SetupCorruptedAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kAtomicGroupSize);
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ EXPECT_NOK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ EXPECT_EQ(column_families_.size(),
+ reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(),
+ corrupted_edit_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleCorruptedAtomicGroupWithReactiveVersionSetReadAndApply) {
+ const int kAtomicGroupSize = 4;
+ SetupCorruptedAtomicGroup(kAtomicGroupSize);
+ InstrumentedMutex mu;
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ // Write the corrupted edits.
+ AddNewEditsToLog(kAtomicGroupSize);
+ mu.Lock();
+ EXPECT_OK(
+ reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed));
+ mu.Unlock();
+ EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(),
+ corrupted_edit_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleIncorrectAtomicGroupSizeWithVersionSetRecover) {
+ const int kAtomicGroupSize = 4;
+ SetupIncorrectAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kAtomicGroupSize);
+ EXPECT_NOK(versions_->Recover(column_families_, false));
+ EXPECT_EQ(column_families_.size(),
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_EQ(edits_[1].DebugString(),
+ edit_with_incorrect_group_size_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleIncorrectAtomicGroupSizeWithReactiveVersionSetRecover) {
+ const int kAtomicGroupSize = 4;
+ SetupIncorrectAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kAtomicGroupSize);
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ EXPECT_NOK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ EXPECT_EQ(column_families_.size(),
+ reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_EQ(edits_[1].DebugString(),
+ edit_with_incorrect_group_size_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleIncorrectAtomicGroupSizeWithReactiveVersionSetReadAndApply) {
+ const int kAtomicGroupSize = 4;
+ SetupIncorrectAtomicGroup(kAtomicGroupSize);
+ InstrumentedMutex mu;
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ AddNewEditsToLog(kAtomicGroupSize);
+ mu.Lock();
+ EXPECT_OK(
+ reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed));
+ mu.Unlock();
+ EXPECT_EQ(edits_[1].DebugString(),
+ edit_with_incorrect_group_size_.DebugString());
+}
+
+class VersionSetTestDropOneCF : public VersionSetTestBase,
+ public testing::TestWithParam<std::string> {
+ public:
+ VersionSetTestDropOneCF() : VersionSetTestBase() {}
+};
+
+// This test simulates the following execution sequence
+// Time thread1 bg_flush_thr
+// | Prepare version edits (e1,e2,e3) for atomic
+// | flush cf1, cf2, cf3
+// | Enqueue e to drop cfi
+// | to manifest_writers_
+// | Enqueue (e1,e2,e3) to manifest_writers_
+// |
+// | Apply e,
+// | cfi.IsDropped() is true
+// | Apply (e1,e2,e3),
+// | since cfi.IsDropped() == true, we need to
+// | drop ei and write the rest to MANIFEST.
+// V
+//
+// Repeat the test for i = 1, 2, 3 to simulate dropping the first, middle and
+// last column family in an atomic group.
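+// The sync point below verifies that the dropped column family's edit is
+// removed while the remaining kAtomicGroupSize - 1 edits are still written
+// as a single atomic group.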
+TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ SequenceNumber last_seqno;
+ std::unique_ptr<log::Writer> log_writer;
+ PrepareManifest(&column_families, &last_seqno, &log_writer);
+ Status s = SetCurrentFile(env_, dbname_, 1, nullptr);
+ ASSERT_OK(s);
+
+ EXPECT_OK(versions_->Recover(column_families, false /* read_only */));
+ EXPECT_EQ(column_families.size(),
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+
+ const int kAtomicGroupSize = 3;
+ const std::vector<std::string> non_default_cf_names = {
+ kColumnFamilyName1, kColumnFamilyName2, kColumnFamilyName3};
+
+ // Drop one column family
+ VersionEdit drop_cf_edit;
+ drop_cf_edit.DropColumnFamily();
+ const std::string cf_to_drop_name(GetParam());
+ auto cfd_to_drop =
+ versions_->GetColumnFamilySet()->GetColumnFamily(cf_to_drop_name);
+ ASSERT_NE(nullptr, cfd_to_drop);
+ // Increase its refcount because cfd_to_drop is used later, and we need to
+ // prevent it from being deleted.
+ cfd_to_drop->Ref();
+ drop_cf_edit.SetColumnFamily(cfd_to_drop->GetID());
+ mutex_.Lock();
+ s = versions_->LogAndApply(cfd_to_drop,
+ *cfd_to_drop->GetLatestMutableCFOptions(),
+ &drop_cf_edit, &mutex_);
+ mutex_.Unlock();
+ ASSERT_OK(s);
+
+ std::vector<VersionEdit> edits(kAtomicGroupSize);
+ uint32_t remaining = kAtomicGroupSize;
+ size_t i = 0;
+ autovector<ColumnFamilyData*> cfds;
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ autovector<autovector<VersionEdit*>> edit_lists;
+ for (const auto& cf_name : non_default_cf_names) {
+ auto cfd = (cf_name != cf_to_drop_name)
+ ? versions_->GetColumnFamilySet()->GetColumnFamily(cf_name)
+ : cfd_to_drop;
+ ASSERT_NE(nullptr, cfd);
+ cfds.push_back(cfd);
+ mutable_cf_options_list.emplace_back(cfd->GetLatestMutableCFOptions());
+ edits[i].SetColumnFamily(cfd->GetID());
+ edits[i].SetLogNumber(0);
+ edits[i].SetNextFile(2);
+ edits[i].MarkAtomicGroup(--remaining);
+ edits[i].SetLastSequence(last_seqno++);
+ autovector<VersionEdit*> tmp_edits;
+ tmp_edits.push_back(&edits[i]);
+ edit_lists.emplace_back(tmp_edits);
+ ++i;
+ }
+ int called = 0;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::ProcessManifestWrites:CheckOneAtomicGroup", [&](void* arg) {
+ std::vector<VersionEdit*>* tmp_edits =
+ reinterpret_cast<std::vector<VersionEdit*>*>(arg);
+ EXPECT_EQ(kAtomicGroupSize - 1, tmp_edits->size());
+ for (const auto e : *tmp_edits) {
+ bool found = false;
+ for (const auto& e2 : edits) {
+ if (&e2 == e) {
+ found = true;
+ break;
+ }
+ }
+ ASSERT_TRUE(found);
+ }
+ ++called;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ mutex_.Lock();
+ s = versions_->LogAndApply(cfds, mutable_cf_options_list, edit_lists,
+ &mutex_);
+ mutex_.Unlock();
+ ASSERT_OK(s);
+ ASSERT_EQ(1, called);
+ if (cfd_to_drop->Unref()) {
+ delete cfd_to_drop;
+ cfd_to_drop = nullptr;
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(
+ AtomicGroup, VersionSetTestDropOneCF,
+ testing::Values(VersionSetTestBase::kColumnFamilyName1,
+ VersionSetTestBase::kColumnFamilyName2,
+ VersionSetTestBase::kColumnFamilyName3));
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/wal_manager.cc b/src/rocksdb/db/wal_manager.cc
new file mode 100644
index 000000000..5b699274c
--- /dev/null
+++ b/src/rocksdb/db/wal_manager.cc
@@ -0,0 +1,510 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/wal_manager.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <memory>
+#include <vector>
+
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/transaction_log_impl.h"
+#include "db/write_batch_internal.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "file/sequence_file_reader.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/write_batch.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/mutexlock.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+
+Status WalManager::DeleteFile(const std::string& fname, uint64_t number) {
+ auto s = env_->DeleteFile(db_options_.wal_dir + "/" + fname);
+ if (s.ok()) {
+ MutexLock l(&read_first_record_cache_mutex_);
+ read_first_record_cache_.erase(number);
+ }
+ return s;
+}
+
+Status WalManager::GetSortedWalFiles(VectorLogPtr& files) {
+ // First get sorted files in db dir, then get sorted files from archived
+ // dir, to avoid a race condition where a log file is moved to archived
+ // dir in between.
+ Status s;
+ // list wal files in main db dir.
+ VectorLogPtr logs;
+ s = GetSortedWalsOfType(db_options_.wal_dir, logs, kAliveLogFile);
+ if (!s.ok()) {
+ return s;
+ }
+
+ // Reproduce the race condition where a log file is moved
+ // to archived dir, between these two sync points, used in
+ // (DBTest,TransactionLogIteratorRace)
+ TEST_SYNC_POINT("WalManager::GetSortedWalFiles:1");
+ TEST_SYNC_POINT("WalManager::GetSortedWalFiles:2");
+
+ files.clear();
+ // list wal files in archive dir.
+ std::string archivedir = ArchivalDirectory(db_options_.wal_dir);
+ Status exists = env_->FileExists(archivedir);
+ if (exists.ok()) {
+ s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile);
+ if (!s.ok()) {
+ return s;
+ }
+ } else if (!exists.IsNotFound()) {
+    assert(exists.IsIOError());
+    return exists;
+ }
+
+ uint64_t latest_archived_log_number = 0;
+ if (!files.empty()) {
+ latest_archived_log_number = files.back()->LogNumber();
+ ROCKS_LOG_INFO(db_options_.info_log, "Latest Archived log: %" PRIu64,
+ latest_archived_log_number);
+ }
+
+ files.reserve(files.size() + logs.size());
+ for (auto& log : logs) {
+ if (log->LogNumber() > latest_archived_log_number) {
+ files.push_back(std::move(log));
+ } else {
+ // When the race condition happens, we could see the
+ // same log in both db dir and archived dir. Simply
+ // ignore the one in db dir. Note that, if we read
+ // archived dir first, we would have missed the log file.
+ ROCKS_LOG_WARN(db_options_.info_log, "%s already moved to archive",
+ log->PathName().c_str());
+ }
+ }
+
+ return s;
+}
+
+Status WalManager::GetUpdatesSince(
+ SequenceNumber seq, std::unique_ptr<TransactionLogIterator>* iter,
+ const TransactionLogIterator::ReadOptions& read_options,
+ VersionSet* version_set) {
+
+ // Get all sorted Wal Files.
+ // Do binary search and open files and find the seq number.
+
+ std::unique_ptr<VectorLogPtr> wal_files(new VectorLogPtr);
+ Status s = GetSortedWalFiles(*wal_files);
+ if (!s.ok()) {
+ return s;
+ }
+
+ s = RetainProbableWalFiles(*wal_files, seq);
+ if (!s.ok()) {
+ return s;
+ }
+ iter->reset(new TransactionLogIteratorImpl(
+ db_options_.wal_dir, &db_options_, read_options, file_options_, seq,
+ std::move(wal_files), version_set, seq_per_batch_));
+ return (*iter)->status();
+}
+
+// 1. Go through all archived files and
+//    a. if TTL is enabled, delete outdated files
+//    b. if the archive size limit is enabled, delete empty files and track
+//       the number of remaining files and the largest file size.
+// 2. If the size limit is enabled:
+//    a. compute how many files should be deleted
+//    b. get the sorted non-empty archived logs
+//    c. delete the excess files, oldest first
+void WalManager::PurgeObsoleteWALFiles() {
+ bool const ttl_enabled = db_options_.wal_ttl_seconds > 0;
+ bool const size_limit_enabled = db_options_.wal_size_limit_mb > 0;
+ if (!ttl_enabled && !size_limit_enabled) {
+ return;
+ }
+
+ int64_t current_time;
+ Status s = env_->GetCurrentTime(&current_time);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log, "Can't get current time: %s",
+ s.ToString().c_str());
+ assert(false);
+ return;
+ }
+ uint64_t const now_seconds = static_cast<uint64_t>(current_time);
+ uint64_t const time_to_check = (ttl_enabled && !size_limit_enabled)
+ ? db_options_.wal_ttl_seconds / 2
+ : kDefaultIntervalToDeleteObsoleteWAL;
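+  // Skip this run unless enough time has passed: half the TTL when only TTL
+  // is enabled, otherwise the default interval (10 minutes).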
+
+ if (purge_wal_files_last_run_ + time_to_check > now_seconds) {
+ return;
+ }
+
+ purge_wal_files_last_run_ = now_seconds;
+
+ std::string archival_dir = ArchivalDirectory(db_options_.wal_dir);
+ std::vector<std::string> files;
+ s = env_->GetChildren(archival_dir, &files);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log, "Can't get archive files: %s",
+ s.ToString().c_str());
+ assert(false);
+ return;
+ }
+
+ size_t log_files_num = 0;
+ uint64_t log_file_size = 0;
+
+ for (auto& f : files) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(f, &number, &type) && type == kLogFile) {
+ std::string const file_path = archival_dir + "/" + f;
+ if (ttl_enabled) {
+ uint64_t file_m_time;
+ s = env_->GetFileModificationTime(file_path, &file_m_time);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Can't get file mod time: %s: %s", file_path.c_str(),
+ s.ToString().c_str());
+ continue;
+ }
+ if (now_seconds - file_m_time > db_options_.wal_ttl_seconds) {
+ s = DeleteDBFile(&db_options_, file_path, archival_dir, false,
+ /*force_fg=*/!wal_in_db_path_);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log, "Can't delete file: %s: %s",
+ file_path.c_str(), s.ToString().c_str());
+ continue;
+ } else {
+ MutexLock l(&read_first_record_cache_mutex_);
+ read_first_record_cache_.erase(number);
+ }
+ continue;
+ }
+ }
+
+ if (size_limit_enabled) {
+ uint64_t file_size;
+ s = env_->GetFileSize(file_path, &file_size);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log,
+ "Unable to get file size: %s: %s", file_path.c_str(),
+ s.ToString().c_str());
+ return;
+ } else {
+ if (file_size > 0) {
+ log_file_size = std::max(log_file_size, file_size);
+ ++log_files_num;
+ } else {
+ s = DeleteDBFile(&db_options_, file_path, archival_dir, false,
+ /*force_fg=*/!wal_in_db_path_);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Unable to delete file: %s: %s", file_path.c_str(),
+ s.ToString().c_str());
+ continue;
+ } else {
+ MutexLock l(&read_first_record_cache_mutex_);
+ read_first_record_cache_.erase(number);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ if (0 == log_files_num || !size_limit_enabled) {
+ return;
+ }
+
+ size_t const files_keep_num =
+ static_cast<size_t>(db_options_.wal_size_limit_mb * 1024 * 1024 / log_file_size);
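+  // Illustrative example: a wal_size_limit_mb of 16 and a largest archived
+  // log of 4 MB keep at most 16 * 1024 * 1024 / 4194304 = 4 archived files.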
+ if (log_files_num <= files_keep_num) {
+ return;
+ }
+
+ size_t files_del_num = log_files_num - files_keep_num;
+ VectorLogPtr archived_logs;
+ GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile);
+
+ if (files_del_num > archived_logs.size()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Trying to delete more archived log files than "
+ "exist. Deleting all");
+ files_del_num = archived_logs.size();
+ }
+
+ for (size_t i = 0; i < files_del_num; ++i) {
+ std::string const file_path = archived_logs[i]->PathName();
+ s = DeleteDBFile(&db_options_, db_options_.wal_dir + "/" + file_path,
+ db_options_.wal_dir, false,
+ /*force_fg=*/!wal_in_db_path_);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log, "Unable to delete file: %s: %s",
+ file_path.c_str(), s.ToString().c_str());
+ continue;
+ } else {
+ MutexLock l(&read_first_record_cache_mutex_);
+ read_first_record_cache_.erase(archived_logs[i]->LogNumber());
+ }
+ }
+}
+
+void WalManager::ArchiveWALFile(const std::string& fname, uint64_t number) {
+ auto archived_log_name = ArchivedLogFileName(db_options_.wal_dir, number);
+ // The sync point below is used in (DBTest,TransactionLogIteratorRace)
+ TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:1");
+ Status s = env_->RenameFile(fname, archived_log_name);
+ // The sync point below is used in (DBTest,TransactionLogIteratorRace)
+ TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:2");
+ ROCKS_LOG_INFO(db_options_.info_log, "Move log file %s to %s -- %s\n",
+ fname.c_str(), archived_log_name.c_str(),
+ s.ToString().c_str());
+}
+
+Status WalManager::GetSortedWalsOfType(const std::string& path,
+ VectorLogPtr& log_files,
+ WalFileType log_type) {
+ std::vector<std::string> all_files;
+ const Status status = env_->GetChildren(path, &all_files);
+ if (!status.ok()) {
+ return status;
+ }
+ log_files.reserve(all_files.size());
+ for (const auto& f : all_files) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(f, &number, &type) && type == kLogFile) {
+ SequenceNumber sequence;
+ Status s = ReadFirstRecord(log_type, number, &sequence);
+ if (!s.ok()) {
+ return s;
+ }
+ if (sequence == 0) {
+ // empty file
+ continue;
+ }
+
+ // Reproduce the race condition where a log file is moved
+ // to archived dir, between these two sync points, used in
+ // (DBTest,TransactionLogIteratorRace)
+ TEST_SYNC_POINT("WalManager::GetSortedWalsOfType:1");
+ TEST_SYNC_POINT("WalManager::GetSortedWalsOfType:2");
+
+ uint64_t size_bytes;
+ s = env_->GetFileSize(LogFileName(path, number), &size_bytes);
+ // re-try in case the alive log file has been moved to archive.
+ if (!s.ok() && log_type == kAliveLogFile) {
+ std::string archived_file = ArchivedLogFileName(path, number);
+ if (env_->FileExists(archived_file).ok()) {
+ s = env_->GetFileSize(archived_file, &size_bytes);
+ if (!s.ok() && env_->FileExists(archived_file).IsNotFound()) {
+ // oops, the file just got deleted from archived dir! move on
+ s = Status::OK();
+ continue;
+ }
+ }
+ }
+ if (!s.ok()) {
+ return s;
+ }
+
+ log_files.push_back(std::unique_ptr<LogFile>(
+ new LogFileImpl(number, log_type, sequence, size_bytes)));
+ }
+ }
+ std::sort(
+ log_files.begin(), log_files.end(),
+ [](const std::unique_ptr<LogFile>& a, const std::unique_ptr<LogFile>& b) {
+ LogFileImpl* a_impl =
+ static_cast_with_check<LogFileImpl, LogFile>(a.get());
+ LogFileImpl* b_impl =
+ static_cast_with_check<LogFileImpl, LogFile>(b.get());
+ return *a_impl < *b_impl;
+ });
+ return status;
+}
+
+Status WalManager::RetainProbableWalFiles(VectorLogPtr& all_logs,
+ const SequenceNumber target) {
+ int64_t start = 0; // signed to avoid overflow when target is < first file.
+ int64_t end = static_cast<int64_t>(all_logs.size()) - 1;
+  // Binary search to avoid opening all the files.
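+  // Illustrative example: with start sequences {5, 20, 40} and target 25, the
+  // search ends with end == 1, so the files starting at 20 and 40 are kept;
+  // updates with sequence >= 25 can only live in those files.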
+ while (end >= start) {
+ int64_t mid = start + (end - start) / 2; // Avoid overflow.
+ SequenceNumber current_seq_num = all_logs.at(static_cast<size_t>(mid))->StartSequence();
+ if (current_seq_num == target) {
+ end = mid;
+ break;
+ } else if (current_seq_num < target) {
+ start = mid + 1;
+ } else {
+ end = mid - 1;
+ }
+ }
+  // end could be negative.
+ size_t start_index = static_cast<size_t>(std::max(static_cast<int64_t>(0), end));
+ // The last wal file is always included
+ all_logs.erase(all_logs.begin(), all_logs.begin() + start_index);
+ return Status::OK();
+}
+
+Status WalManager::ReadFirstRecord(const WalFileType type,
+ const uint64_t number,
+ SequenceNumber* sequence) {
+ *sequence = 0;
+ if (type != kAliveLogFile && type != kArchivedLogFile) {
+    ROCKS_LOG_ERROR(db_options_.info_log, "[WalManager] Unknown file type %s",
+ ToString(type).c_str());
+ return Status::NotSupported(
+ "File Type Not Known " + ToString(type));
+ }
+ {
+ MutexLock l(&read_first_record_cache_mutex_);
+ auto itr = read_first_record_cache_.find(number);
+ if (itr != read_first_record_cache_.end()) {
+ *sequence = itr->second;
+ return Status::OK();
+ }
+ }
+ Status s;
+ if (type == kAliveLogFile) {
+ std::string fname = LogFileName(db_options_.wal_dir, number);
+ s = ReadFirstLine(fname, number, sequence);
+ if (!s.ok() && env_->FileExists(fname).ok()) {
+ // return any error that is not caused by non-existing file
+ return s;
+ }
+ }
+
+ if (type == kArchivedLogFile || !s.ok()) {
+ // check if the file got moved to archive.
+ std::string archived_file =
+ ArchivedLogFileName(db_options_.wal_dir, number);
+ s = ReadFirstLine(archived_file, number, sequence);
+    // Maybe the file was deleted from the archive dir. If that's the case,
+    // return Status::OK(). The caller will identify this as an empty file
+    // because *sequence == 0.
+ if (!s.ok() && env_->FileExists(archived_file).IsNotFound()) {
+ return Status::OK();
+ }
+ }
+
+ if (s.ok() && *sequence != 0) {
+ MutexLock l(&read_first_record_cache_mutex_);
+ read_first_record_cache_.insert({number, *sequence});
+ }
+ return s;
+}
+
+Status WalManager::GetLiveWalFile(uint64_t number,
+ std::unique_ptr<LogFile>* log_file) {
+ if (!log_file) {
+ return Status::InvalidArgument("log_file not preallocated.");
+ }
+
+ if (!number) {
+ return Status::PathNotFound("log file not available");
+ }
+
+ Status s;
+
+ uint64_t size_bytes;
+ s = env_->GetFileSize(LogFileName(db_options_.wal_dir, number), &size_bytes);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ log_file->reset(new LogFileImpl(number, kAliveLogFile,
+ 0, // SequenceNumber
+ size_bytes));
+
+ return Status::OK();
+}
+
+// The function returns Status::OK() and sets *sequence to 0 if the file
+// exists but is empty.
+Status WalManager::ReadFirstLine(const std::string& fname,
+ const uint64_t number,
+ SequenceNumber* sequence) {
+ struct LogReporter : public log::Reader::Reporter {
+ Env* env;
+ Logger* info_log;
+ const char* fname;
+
+ Status* status;
+ bool ignore_error; // true if db_options_.paranoid_checks==false
+ void Corruption(size_t bytes, const Status& s) override {
+ ROCKS_LOG_WARN(info_log, "[WalManager] %s%s: dropping %d bytes; %s",
+ (this->ignore_error ? "(ignoring error) " : ""), fname,
+ static_cast<int>(bytes), s.ToString().c_str());
+ if (this->status->ok()) {
+ // only keep the first error
+ *this->status = s;
+ }
+ }
+ };
+
+ std::unique_ptr<FSSequentialFile> file;
+  Status status = fs_->NewSequentialFile(
+      fname, fs_->OptimizeForLogRead(file_options_), &file, nullptr);
+  if (!status.ok()) {
+    return status;
+  }
+  std::unique_ptr<SequentialFileReader> file_reader(
+      new SequentialFileReader(std::move(file), fname));
+
+ LogReporter reporter;
+ reporter.env = env_;
+ reporter.info_log = db_options_.info_log.get();
+ reporter.fname = fname.c_str();
+ reporter.status = &status;
+ reporter.ignore_error = !db_options_.paranoid_checks;
+ log::Reader reader(db_options_.info_log, std::move(file_reader), &reporter,
+ true /*checksum*/, number);
+ std::string scratch;
+ Slice record;
+
+ if (reader.ReadRecord(&record, &scratch) &&
+ (status.ok() || !db_options_.paranoid_checks)) {
+ if (record.size() < WriteBatchInternal::kHeader) {
+ reporter.Corruption(record.size(),
+ Status::Corruption("log record too small"));
+      // TODO: read records until the first non-corrupt entry?
+ } else {
+ WriteBatch batch;
+ WriteBatchInternal::SetContents(&batch, record);
+ *sequence = WriteBatchInternal::Sequence(&batch);
+ return Status::OK();
+ }
+ }
+
+  // ReadRecord returns false on EOF, which means the log file is empty. We
+  // return status.ok() in that case and set the sequence number to 0.
+ *sequence = 0;
+ return status;
+}
+
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/wal_manager.h b/src/rocksdb/db/wal_manager.h
new file mode 100644
index 000000000..783bfe99c
--- /dev/null
+++ b/src/rocksdb/db/wal_manager.h
@@ -0,0 +1,114 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <limits>
+#include <set>
+#include <utility>
+#include <vector>
+#include <string>
+#include <memory>
+
+#include "db/version_set.h"
+#include "file/file_util.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+
+// WAL manager provides the abstraction for reading the WAL files as a single
+// unit. Internally, it opens and reads the files using the Reader or Writer
+// abstraction.
+class WalManager {
+ public:
+ WalManager(const ImmutableDBOptions& db_options,
+ const FileOptions& file_options, const bool seq_per_batch = false)
+ : db_options_(db_options),
+ file_options_(file_options),
+ env_(db_options.env),
+ fs_(db_options.fs.get()),
+ purge_wal_files_last_run_(0),
+ seq_per_batch_(seq_per_batch),
+ wal_in_db_path_(IsWalDirSameAsDBPath(&db_options)) {}
+
+ Status GetSortedWalFiles(VectorLogPtr& files);
+
+  // Allow the user to tail the transaction log to find all recent changes to
+  // the database that are newer than `seq_number`.
+ Status GetUpdatesSince(
+ SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
+ const TransactionLogIterator::ReadOptions& read_options,
+ VersionSet* version_set);
+
+ void PurgeObsoleteWALFiles();
+
+ void ArchiveWALFile(const std::string& fname, uint64_t number);
+
+ Status DeleteFile(const std::string& fname, uint64_t number);
+
+ Status GetLiveWalFile(uint64_t number, std::unique_ptr<LogFile>* log_file);
+
+ Status TEST_ReadFirstRecord(const WalFileType type, const uint64_t number,
+ SequenceNumber* sequence) {
+ return ReadFirstRecord(type, number, sequence);
+ }
+
+ Status TEST_ReadFirstLine(const std::string& fname, const uint64_t number,
+ SequenceNumber* sequence) {
+ return ReadFirstLine(fname, number, sequence);
+ }
+
+ private:
+ Status GetSortedWalsOfType(const std::string& path, VectorLogPtr& log_files,
+ WalFileType type);
+  // Requires: all_logs should be sorted with the earliest log file first.
+  // Retains all log files in all_logs which contain updates with sequence
+  // numbers greater than or equal to the requested SequenceNumber.
+ Status RetainProbableWalFiles(VectorLogPtr& all_logs,
+ const SequenceNumber target);
+
+ Status ReadFirstRecord(const WalFileType type, const uint64_t number,
+ SequenceNumber* sequence);
+
+ Status ReadFirstLine(const std::string& fname, const uint64_t number,
+ SequenceNumber* sequence);
+
+ // ------- state from DBImpl ------
+ const ImmutableDBOptions& db_options_;
+ const FileOptions file_options_;
+ Env* env_;
+ FileSystem* fs_;
+
+ // ------- WalManager state -------
+ // cache for ReadFirstRecord() calls
+ std::unordered_map<uint64_t, SequenceNumber> read_first_record_cache_;
+ port::Mutex read_first_record_cache_mutex_;
+
+ // last time when PurgeObsoleteWALFiles ran.
+ uint64_t purge_wal_files_last_run_;
+
+ bool seq_per_batch_;
+
+ bool wal_in_db_path_;
+
+  // Obsolete WAL files will be deleted at this interval (in seconds) if TTL
+  // deletion is enabled and the archive size limit is disabled.
+ static const uint64_t kDefaultIntervalToDeleteObsoleteWAL = 600;
+};
+
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
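
A minimal usage sketch of the class above (not from the upstream sources; it
assumes an ImmutableDBOptions db_options, a FileOptions file_options and a
VersionSet* version_set are already set up, much like the test fixture in
wal_manager_test.cc below):

  using namespace ROCKSDB_NAMESPACE;

  WalManager wal_manager(db_options, file_options);

  // Enumerate live and archived WAL files (sorted).
  VectorLogPtr wal_files;
  Status s = wal_manager.GetSortedWalFiles(wal_files);

  // Tail every update with a sequence number >= 42.
  std::unique_ptr<TransactionLogIterator> iter;
  if (s.ok()) {
    s = wal_manager.GetUpdatesSince(42 /* seq_number */, &iter,
                                    TransactionLogIterator::ReadOptions(),
                                    version_set);
  }
  for (; s.ok() && iter->Valid(); iter->Next()) {
    BatchResult res = iter->GetBatch();  // res.sequence, res.writeBatchPtr
  }
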
diff --git a/src/rocksdb/db/wal_manager_test.cc b/src/rocksdb/db/wal_manager_test.cc
new file mode 100644
index 000000000..26bad368e
--- /dev/null
+++ b/src/rocksdb/db/wal_manager_test.cc
@@ -0,0 +1,338 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <map>
+#include <string>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/write_buffer_manager.h"
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/log_writer.h"
+#include "db/version_set.h"
+#include "db/wal_manager.h"
+#include "env/mock_env.h"
+#include "file/writable_file_writer.h"
+#include "table/mock_table.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO(icanadi) mock out VersionSet
+// TODO(icanadi) move other WalManager-specific tests from db_test here
+class WalManagerTest : public testing::Test {
+ public:
+ WalManagerTest()
+ : env_(new MockEnv(Env::Default())),
+ dbname_(test::PerThreadDBPath("wal_manager_test")),
+ db_options_(),
+ table_cache_(NewLRUCache(50000, 16)),
+ write_buffer_manager_(db_options_.db_write_buffer_size),
+ current_log_number_(0) {
+ DestroyDB(dbname_, Options());
+ }
+
+ void Init() {
+ ASSERT_OK(env_->CreateDirIfMissing(dbname_));
+ ASSERT_OK(env_->CreateDirIfMissing(ArchivalDirectory(dbname_)));
+ db_options_.db_paths.emplace_back(dbname_,
+ std::numeric_limits<uint64_t>::max());
+ db_options_.wal_dir = dbname_;
+ db_options_.env = env_.get();
+ fs_.reset(new LegacyFileSystemWrapper(env_.get()));
+ db_options_.fs = fs_;
+
+ versions_.reset(new VersionSet(dbname_, &db_options_, env_options_,
+ table_cache_.get(), &write_buffer_manager_,
+ &write_controller_,
+ /*block_cache_tracer=*/nullptr));
+
+ wal_manager_.reset(new WalManager(db_options_, env_options_));
+ }
+
+ void Reopen() {
+ wal_manager_.reset(new WalManager(db_options_, env_options_));
+ }
+
+ // NOT thread safe
+ void Put(const std::string& key, const std::string& value) {
+ assert(current_log_writer_.get() != nullptr);
+ uint64_t seq = versions_->LastSequence() + 1;
+ WriteBatch batch;
+ batch.Put(key, value);
+ WriteBatchInternal::SetSequence(&batch, seq);
+ current_log_writer_->AddRecord(WriteBatchInternal::Contents(&batch));
+ versions_->SetLastAllocatedSequence(seq);
+ versions_->SetLastPublishedSequence(seq);
+ versions_->SetLastSequence(seq);
+ }
+
+ // NOT thread safe
+ void RollTheLog(bool /*archived*/) {
+ current_log_number_++;
+ std::string fname = ArchivedLogFileName(dbname_, current_log_number_);
+ std::unique_ptr<WritableFile> file;
+ ASSERT_OK(env_->NewWritableFile(fname, &file, env_options_));
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ NewLegacyWritableFileWrapper(std::move(file)), fname, env_options_));
+ current_log_writer_.reset(new log::Writer(std::move(file_writer), 0, false));
+ }
+
+ void CreateArchiveLogs(int num_logs, int entries_per_log) {
+ for (int i = 1; i <= num_logs; ++i) {
+ RollTheLog(true);
+ for (int k = 0; k < entries_per_log; ++k) {
+ Put(ToString(k), std::string(1024, 'a'));
+ }
+ }
+ }
+
+ std::unique_ptr<TransactionLogIterator> OpenTransactionLogIter(
+ const SequenceNumber seq) {
+ std::unique_ptr<TransactionLogIterator> iter;
+ Status status = wal_manager_->GetUpdatesSince(
+ seq, &iter, TransactionLogIterator::ReadOptions(), versions_.get());
+ EXPECT_OK(status);
+ return iter;
+ }
+
+ std::unique_ptr<MockEnv> env_;
+ std::string dbname_;
+ ImmutableDBOptions db_options_;
+ WriteController write_controller_;
+ EnvOptions env_options_;
+ std::shared_ptr<Cache> table_cache_;
+ WriteBufferManager write_buffer_manager_;
+ std::unique_ptr<VersionSet> versions_;
+ std::unique_ptr<WalManager> wal_manager_;
+ std::shared_ptr<LegacyFileSystemWrapper> fs_;
+
+ std::unique_ptr<log::Writer> current_log_writer_;
+ uint64_t current_log_number_;
+};
+
+TEST_F(WalManagerTest, ReadFirstRecordCache) {
+ Init();
+ std::string path = dbname_ + "/000001.log";
+ std::unique_ptr<WritableFile> file;
+ ASSERT_OK(env_->NewWritableFile(path, &file, EnvOptions()));
+
+ SequenceNumber s;
+ ASSERT_OK(wal_manager_->TEST_ReadFirstLine(path, 1 /* number */, &s));
+ ASSERT_EQ(s, 0U);
+
+ ASSERT_OK(
+ wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1 /* number */, &s));
+ ASSERT_EQ(s, 0U);
+
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ NewLegacyWritableFileWrapper(std::move(file)), path, EnvOptions()));
+ log::Writer writer(std::move(file_writer), 1,
+ db_options_.recycle_log_file_num > 0);
+ WriteBatch batch;
+ batch.Put("foo", "bar");
+ WriteBatchInternal::SetSequence(&batch, 10);
+ writer.AddRecord(WriteBatchInternal::Contents(&batch));
+
+ // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here.
+ // Waiting for lei to finish with db_test
+ // env_->count_sequential_reads_ = true;
+ // sequential_read_counter_ sanity test
+ // ASSERT_EQ(env_->sequential_read_counter_.Read(), 0);
+
+ ASSERT_OK(wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1, &s));
+ ASSERT_EQ(s, 10U);
+ // did a read
+ // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here
+ // ASSERT_EQ(env_->sequential_read_counter_.Read(), 1);
+
+ ASSERT_OK(wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1, &s));
+ ASSERT_EQ(s, 10U);
+ // no new reads since the value is cached
+ // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here
+ // ASSERT_EQ(env_->sequential_read_counter_.Read(), 1);
+}
+
+namespace {
+uint64_t GetLogDirSize(std::string dir_path, Env* env) {
+ uint64_t dir_size = 0;
+ std::vector<std::string> files;
+ env->GetChildren(dir_path, &files);
+ for (auto& f : files) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(f, &number, &type) && type == kLogFile) {
+ std::string const file_path = dir_path + "/" + f;
+ uint64_t file_size;
+ env->GetFileSize(file_path, &file_size);
+ dir_size += file_size;
+ }
+ }
+ return dir_size;
+}
+std::vector<std::uint64_t> ListSpecificFiles(
+ Env* env, const std::string& path, const FileType expected_file_type) {
+ std::vector<std::string> files;
+ std::vector<uint64_t> file_numbers;
+ env->GetChildren(path, &files);
+ uint64_t number;
+ FileType type;
+ for (size_t i = 0; i < files.size(); ++i) {
+ if (ParseFileName(files[i], &number, &type)) {
+ if (type == expected_file_type) {
+ file_numbers.push_back(number);
+ }
+ }
+ }
+ return file_numbers;
+}
+
+int CountRecords(TransactionLogIterator* iter) {
+ int count = 0;
+ SequenceNumber lastSequence = 0;
+ BatchResult res;
+ while (iter->Valid()) {
+ res = iter->GetBatch();
+ EXPECT_TRUE(res.sequence > lastSequence);
+ ++count;
+ lastSequence = res.sequence;
+ EXPECT_OK(iter->status());
+ iter->Next();
+ }
+ return count;
+}
+} // namespace
+
+TEST_F(WalManagerTest, WALArchivalSizeLimit) {
+ db_options_.wal_ttl_seconds = 0;
+ db_options_.wal_size_limit_mb = 1000;
+ Init();
+
+ // TEST : Create WalManager with huge size limit and no ttl.
+ // Create some archived files and call PurgeObsoleteWALFiles().
+ // Count the archived log files that survived.
+ // Assert that all of them did.
+ // Change size limit. Re-open WalManager.
+ // Assert that archive is not greater than wal_size_limit_mb after
+ // PurgeObsoleteWALFiles()
+ // Set ttl and time_to_check_ to small values. Re-open db.
+ // Assert that there are no archived logs left.
+
+ std::string archive_dir = ArchivalDirectory(dbname_);
+ CreateArchiveLogs(20, 5000);
+
+ std::vector<std::uint64_t> log_files =
+ ListSpecificFiles(env_.get(), archive_dir, kLogFile);
+ ASSERT_EQ(log_files.size(), 20U);
+
+ db_options_.wal_size_limit_mb = 8;
+ Reopen();
+ wal_manager_->PurgeObsoleteWALFiles();
+
+ uint64_t archive_size = GetLogDirSize(archive_dir, env_.get());
+ ASSERT_TRUE(archive_size <= db_options_.wal_size_limit_mb * 1024 * 1024);
+
+ db_options_.wal_ttl_seconds = 1;
+ env_->FakeSleepForMicroseconds(2 * 1000 * 1000);
+ Reopen();
+ wal_manager_->PurgeObsoleteWALFiles();
+
+ log_files = ListSpecificFiles(env_.get(), archive_dir, kLogFile);
+ ASSERT_TRUE(log_files.empty());
+}
+
+TEST_F(WalManagerTest, WALArchivalTtl) {
+ db_options_.wal_ttl_seconds = 1000;
+ Init();
+
+  // TEST : Create WalManager with a ttl and no size limit.
+  // Create some archived log files and call PurgeObsoleteWALFiles().
+  // Assert that the files are not deleted.
+  // Reopen the db with a small ttl.
+  // Assert that all archived logs were removed.
+
+ std::string archive_dir = ArchivalDirectory(dbname_);
+ CreateArchiveLogs(20, 5000);
+
+ std::vector<uint64_t> log_files =
+ ListSpecificFiles(env_.get(), archive_dir, kLogFile);
+ ASSERT_GT(log_files.size(), 0U);
+
+ db_options_.wal_ttl_seconds = 1;
+ env_->FakeSleepForMicroseconds(3 * 1000 * 1000);
+ Reopen();
+ wal_manager_->PurgeObsoleteWALFiles();
+
+ log_files = ListSpecificFiles(env_.get(), archive_dir, kLogFile);
+ ASSERT_TRUE(log_files.empty());
+}
+
+TEST_F(WalManagerTest, TransactionLogIteratorMoveOverZeroFiles) {
+ Init();
+ RollTheLog(false);
+ Put("key1", std::string(1024, 'a'));
+ // Create a zero record WAL file.
+ RollTheLog(false);
+ RollTheLog(false);
+
+ Put("key2", std::string(1024, 'a'));
+
+ auto iter = OpenTransactionLogIter(0);
+ ASSERT_EQ(2, CountRecords(iter.get()));
+}
+
+TEST_F(WalManagerTest, TransactionLogIteratorJustEmptyFile) {
+ Init();
+ RollTheLog(false);
+ auto iter = OpenTransactionLogIter(0);
+ // Check that an empty iterator is returned
+ ASSERT_TRUE(!iter->Valid());
+}
+
+TEST_F(WalManagerTest, TransactionLogIteratorNewFileWhileScanning) {
+ Init();
+ CreateArchiveLogs(2, 100);
+ auto iter = OpenTransactionLogIter(0);
+ CreateArchiveLogs(1, 100);
+ int i = 0;
+ for (; iter->Valid(); iter->Next()) {
+ i++;
+ }
+ ASSERT_EQ(i, 200);
+ // A new log file was added after the iterator was created.
+ // TryAgain indicates a new iterator is needed to fetch the new data
+ ASSERT_TRUE(iter->status().IsTryAgain());
+
+ iter = OpenTransactionLogIter(0);
+ i = 0;
+ for (; iter->Valid(); iter->Next()) {
+ i++;
+ }
+ ASSERT_EQ(i, 300);
+ ASSERT_TRUE(iter->status().ok());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as WalManager is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
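
For reference, the archival behaviour exercised above is reached through the
public API via the WAL archival options and DB::GetUpdatesSince(). A rough
sketch, not part of this patch (the path and numbers are made up, error
handling trimmed):

  #include "rocksdb/db.h"
  #include "rocksdb/options.h"
  #include "rocksdb/transaction_log.h"

  rocksdb::Options options;
  options.create_if_missing = true;
  options.WAL_ttl_seconds = 3600;   // keep archived WALs for roughly an hour...
  options.WAL_size_limit_MB = 64;   // ...or until the archive exceeds 64 MB

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/wal_manager_example", &db);

  std::unique_ptr<rocksdb::TransactionLogIterator> iter;
  if (s.ok()) {
    s = db->GetUpdatesSince(1 /* seq_number */, &iter);  // served by WalManager
  }
  for (; s.ok() && iter->Valid(); iter->Next()) {
    rocksdb::BatchResult batch = iter->GetBatch();
    // batch.sequence and batch.writeBatchPtr hold the replayed updates.
  }
  delete db;
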
diff --git a/src/rocksdb/db/write_batch.cc b/src/rocksdb/db/write_batch.cc
new file mode 100644
index 000000000..d578db59b
--- /dev/null
+++ b/src/rocksdb/db/write_batch.cc
@@ -0,0 +1,2092 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBatch::rep_ :=
+// sequence: fixed64
+// count: fixed32
+// data: record[count]
+// record :=
+// kTypeValue varstring varstring
+// kTypeDeletion varstring
+// kTypeSingleDeletion varstring
+// kTypeRangeDeletion varstring varstring
+// kTypeMerge varstring varstring
+// kTypeColumnFamilyValue varint32 varstring varstring
+// kTypeColumnFamilyDeletion varint32 varstring
+// kTypeColumnFamilySingleDeletion varint32 varstring
+// kTypeColumnFamilyRangeDeletion varint32 varstring varstring
+// kTypeColumnFamilyMerge varint32 varstring varstring
+// kTypeBeginPrepareXID varstring
+// kTypeEndPrepareXID
+// kTypeCommitXID varstring
+// kTypeRollbackXID varstring
+// kTypeBeginPersistedPrepareXID varstring
+// kTypeBeginUnprepareXID varstring
+// kTypeNoop
+// varstring :=
+// len: varint32
+// data: uint8[len]
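
A worked example of this encoding (illustrative only): a batch holding a single
Put("foo", "bar") against the default column family, with sequence number 100,
serializes as

  offset 0..7    64 00 00 00 00 00 00 00   sequence = 100 (fixed64, little-endian)
  offset 8..11   01 00 00 00               count = 1 (fixed32)
  offset 12      01                        tag kTypeValue
  offset 13..16  03 66 6f 6f               varstring key   "foo" (len 3)
  offset 17..20  03 62 61 72               varstring value "bar" (len 3)

i.e. WriteBatchInternal::kHeader is the 12-byte sequence+count prefix, and each
record begins with its one-byte tag.
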
+
+#include "rocksdb/write_batch.h"
+
+#include <map>
+#include <stack>
+#include <stdexcept>
+#include <type_traits>
+#include <unordered_map>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/flush_scheduler.h"
+#include "db/memtable.h"
+#include "db/merge_context.h"
+#include "db/snapshot_impl.h"
+#include "db/trim_history_scheduler.h"
+#include "db/write_batch_internal.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/statistics.h"
+#include "rocksdb/merge_operator.h"
+#include "util/autovector.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/duplicate_detector.h"
+#include "util/string_util.h"
+#include "util/util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// anon namespace for file-local types
+namespace {
+
+enum ContentFlags : uint32_t {
+ DEFERRED = 1 << 0,
+ HAS_PUT = 1 << 1,
+ HAS_DELETE = 1 << 2,
+ HAS_SINGLE_DELETE = 1 << 3,
+ HAS_MERGE = 1 << 4,
+ HAS_BEGIN_PREPARE = 1 << 5,
+ HAS_END_PREPARE = 1 << 6,
+ HAS_COMMIT = 1 << 7,
+ HAS_ROLLBACK = 1 << 8,
+ HAS_DELETE_RANGE = 1 << 9,
+ HAS_BLOB_INDEX = 1 << 10,
+ HAS_BEGIN_UNPREPARE = 1 << 11,
+};
+
+struct BatchContentClassifier : public WriteBatch::Handler {
+ uint32_t content_flags = 0;
+
+ Status PutCF(uint32_t, const Slice&, const Slice&) override {
+ content_flags |= ContentFlags::HAS_PUT;
+ return Status::OK();
+ }
+
+ Status DeleteCF(uint32_t, const Slice&) override {
+ content_flags |= ContentFlags::HAS_DELETE;
+ return Status::OK();
+ }
+
+ Status SingleDeleteCF(uint32_t, const Slice&) override {
+ content_flags |= ContentFlags::HAS_SINGLE_DELETE;
+ return Status::OK();
+ }
+
+ Status DeleteRangeCF(uint32_t, const Slice&, const Slice&) override {
+ content_flags |= ContentFlags::HAS_DELETE_RANGE;
+ return Status::OK();
+ }
+
+ Status MergeCF(uint32_t, const Slice&, const Slice&) override {
+ content_flags |= ContentFlags::HAS_MERGE;
+ return Status::OK();
+ }
+
+ Status PutBlobIndexCF(uint32_t, const Slice&, const Slice&) override {
+ content_flags |= ContentFlags::HAS_BLOB_INDEX;
+ return Status::OK();
+ }
+
+ Status MarkBeginPrepare(bool unprepare) override {
+ content_flags |= ContentFlags::HAS_BEGIN_PREPARE;
+ if (unprepare) {
+ content_flags |= ContentFlags::HAS_BEGIN_UNPREPARE;
+ }
+ return Status::OK();
+ }
+
+ Status MarkEndPrepare(const Slice&) override {
+ content_flags |= ContentFlags::HAS_END_PREPARE;
+ return Status::OK();
+ }
+
+ Status MarkCommit(const Slice&) override {
+ content_flags |= ContentFlags::HAS_COMMIT;
+ return Status::OK();
+ }
+
+ Status MarkRollback(const Slice&) override {
+ content_flags |= ContentFlags::HAS_ROLLBACK;
+ return Status::OK();
+ }
+};
+
+class TimestampAssigner : public WriteBatch::Handler {
+ public:
+ explicit TimestampAssigner(const Slice& ts)
+ : timestamp_(ts), timestamps_(kEmptyTimestampList) {}
+ explicit TimestampAssigner(const std::vector<Slice>& ts_list)
+ : timestamps_(ts_list) {
+ SanityCheck();
+ }
+ ~TimestampAssigner() override {}
+
+ Status PutCF(uint32_t, const Slice& key, const Slice&) override {
+ AssignTimestamp(key);
+ ++idx_;
+ return Status::OK();
+ }
+
+ Status DeleteCF(uint32_t, const Slice& key) override {
+ AssignTimestamp(key);
+ ++idx_;
+ return Status::OK();
+ }
+
+ Status SingleDeleteCF(uint32_t, const Slice& key) override {
+ AssignTimestamp(key);
+ ++idx_;
+ return Status::OK();
+ }
+
+ Status DeleteRangeCF(uint32_t, const Slice& begin_key,
+ const Slice& end_key) override {
+ AssignTimestamp(begin_key);
+ AssignTimestamp(end_key);
+ ++idx_;
+ return Status::OK();
+ }
+
+ Status MergeCF(uint32_t, const Slice& key, const Slice&) override {
+ AssignTimestamp(key);
+ ++idx_;
+ return Status::OK();
+ }
+
+ Status PutBlobIndexCF(uint32_t, const Slice&, const Slice&) override {
+ // TODO (yanqin): support blob db in the future.
+ return Status::OK();
+ }
+
+ Status MarkBeginPrepare(bool) override {
+ // TODO (yanqin): support in the future.
+ return Status::OK();
+ }
+
+ Status MarkEndPrepare(const Slice&) override {
+ // TODO (yanqin): support in the future.
+ return Status::OK();
+ }
+
+ Status MarkCommit(const Slice&) override {
+ // TODO (yanqin): support in the future.
+ return Status::OK();
+ }
+
+ Status MarkRollback(const Slice&) override {
+ // TODO (yanqin): support in the future.
+ return Status::OK();
+ }
+
+ private:
+ void SanityCheck() const {
+ assert(!timestamps_.empty());
+#ifndef NDEBUG
+ const size_t ts_sz = timestamps_[0].size();
+ for (size_t i = 1; i != timestamps_.size(); ++i) {
+ assert(ts_sz == timestamps_[i].size());
+ }
+#endif // !NDEBUG
+ }
+
+ void AssignTimestamp(const Slice& key) {
+ assert(timestamps_.empty() || idx_ < timestamps_.size());
+ const Slice& ts = timestamps_.empty() ? timestamp_ : timestamps_[idx_];
+ size_t ts_sz = ts.size();
+ char* ptr = const_cast<char*>(key.data() + key.size() - ts_sz);
+ memcpy(ptr, ts.data(), ts_sz);
+ }
+
+ static const std::vector<Slice> kEmptyTimestampList;
+ const Slice timestamp_;
+ const std::vector<Slice>& timestamps_;
+ size_t idx_ = 0;
+
+ // No copy or move.
+ TimestampAssigner(const TimestampAssigner&) = delete;
+ TimestampAssigner(TimestampAssigner&&) = delete;
+ TimestampAssigner& operator=(const TimestampAssigner&) = delete;
+ TimestampAssigner&& operator=(TimestampAssigner&&) = delete;
+};
+const std::vector<Slice> TimestampAssigner::kEmptyTimestampList;
+
+} // anon namespace
+
+struct SavePoints {
+ std::stack<SavePoint, autovector<SavePoint>> stack;
+};
+
+WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes)
+ : content_flags_(0), max_bytes_(max_bytes), rep_(), timestamp_size_(0) {
+ rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader)
+ ? reserved_bytes
+ : WriteBatchInternal::kHeader);
+ rep_.resize(WriteBatchInternal::kHeader);
+}
+
+WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes, size_t ts_sz)
+ : content_flags_(0), max_bytes_(max_bytes), rep_(), timestamp_size_(ts_sz) {
+ rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader) ?
+ reserved_bytes : WriteBatchInternal::kHeader);
+ rep_.resize(WriteBatchInternal::kHeader);
+}
+
+WriteBatch::WriteBatch(const std::string& rep)
+ : content_flags_(ContentFlags::DEFERRED),
+ max_bytes_(0),
+ rep_(rep),
+ timestamp_size_(0) {}
+
+WriteBatch::WriteBatch(std::string&& rep)
+ : content_flags_(ContentFlags::DEFERRED),
+ max_bytes_(0),
+ rep_(std::move(rep)),
+ timestamp_size_(0) {}
+
+WriteBatch::WriteBatch(const WriteBatch& src)
+ : wal_term_point_(src.wal_term_point_),
+ content_flags_(src.content_flags_.load(std::memory_order_relaxed)),
+ max_bytes_(src.max_bytes_),
+ rep_(src.rep_),
+ timestamp_size_(src.timestamp_size_) {
+ if (src.save_points_ != nullptr) {
+ save_points_.reset(new SavePoints());
+ save_points_->stack = src.save_points_->stack;
+ }
+}
+
+WriteBatch::WriteBatch(WriteBatch&& src) noexcept
+ : save_points_(std::move(src.save_points_)),
+ wal_term_point_(std::move(src.wal_term_point_)),
+ content_flags_(src.content_flags_.load(std::memory_order_relaxed)),
+ max_bytes_(src.max_bytes_),
+ rep_(std::move(src.rep_)),
+ timestamp_size_(src.timestamp_size_) {}
+
+WriteBatch& WriteBatch::operator=(const WriteBatch& src) {
+ if (&src != this) {
+ this->~WriteBatch();
+ new (this) WriteBatch(src);
+ }
+ return *this;
+}
+
+WriteBatch& WriteBatch::operator=(WriteBatch&& src) {
+ if (&src != this) {
+ this->~WriteBatch();
+ new (this) WriteBatch(std::move(src));
+ }
+ return *this;
+}
+
+WriteBatch::~WriteBatch() { }
+
+WriteBatch::Handler::~Handler() { }
+
+void WriteBatch::Handler::LogData(const Slice& /*blob*/) {
+ // If the user has not specified something to do with blobs, then we ignore
+ // them.
+}
+
+bool WriteBatch::Handler::Continue() {
+ return true;
+}
+
+void WriteBatch::Clear() {
+ rep_.clear();
+ rep_.resize(WriteBatchInternal::kHeader);
+
+ content_flags_.store(0, std::memory_order_relaxed);
+
+ if (save_points_ != nullptr) {
+ while (!save_points_->stack.empty()) {
+ save_points_->stack.pop();
+ }
+ }
+
+ wal_term_point_.clear();
+}
+
+uint32_t WriteBatch::Count() const { return WriteBatchInternal::Count(this); }
+
+uint32_t WriteBatch::ComputeContentFlags() const {
+ auto rv = content_flags_.load(std::memory_order_relaxed);
+ if ((rv & ContentFlags::DEFERRED) != 0) {
+ BatchContentClassifier classifier;
+ Iterate(&classifier);
+ rv = classifier.content_flags;
+
+ // this method is conceptually const, because it is performing a lazy
+ // computation that doesn't affect the abstract state of the batch.
+ // content_flags_ is marked mutable so that we can perform the
+ // following assignment
+ content_flags_.store(rv, std::memory_order_relaxed);
+ }
+ return rv;
+}
+
+void WriteBatch::MarkWalTerminationPoint() {
+ wal_term_point_.size = GetDataSize();
+ wal_term_point_.count = Count();
+ wal_term_point_.content_flags = content_flags_;
+}
+
+bool WriteBatch::HasPut() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_PUT) != 0;
+}
+
+bool WriteBatch::HasDelete() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_DELETE) != 0;
+}
+
+bool WriteBatch::HasSingleDelete() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_SINGLE_DELETE) != 0;
+}
+
+bool WriteBatch::HasDeleteRange() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_DELETE_RANGE) != 0;
+}
+
+bool WriteBatch::HasMerge() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_MERGE) != 0;
+}
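
A small illustration of how these predicates interact with the DEFERRED flag
(a sketch using only the public WriteBatch API; not from the upstream file):

  rocksdb::WriteBatch built;
  built.Put("k", "v");                  // content flags updated eagerly
  bool p1 = built.HasPut();             // true, answered from content_flags_

  rocksdb::WriteBatch wrapped(built.Data());  // rep-string ctor => DEFERRED
  bool p2 = wrapped.HasPut();           // first call replays the batch through
                                        // BatchContentClassifier, then caches
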
+
+bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key, bool cf_record) {
+ assert(input != nullptr && key != nullptr);
+ // Skip tag byte
+ input->remove_prefix(1);
+
+ if (cf_record) {
+ // Skip column_family bytes
+ uint32_t cf;
+ if (!GetVarint32(input, &cf)) {
+ return false;
+ }
+ }
+
+ // Extract key
+ return GetLengthPrefixedSlice(input, key);
+}
+
+bool WriteBatch::HasBeginPrepare() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_BEGIN_PREPARE) != 0;
+}
+
+bool WriteBatch::HasEndPrepare() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_END_PREPARE) != 0;
+}
+
+bool WriteBatch::HasCommit() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_COMMIT) != 0;
+}
+
+bool WriteBatch::HasRollback() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_ROLLBACK) != 0;
+}
+
+Status ReadRecordFromWriteBatch(Slice* input, char* tag,
+ uint32_t* column_family, Slice* key,
+ Slice* value, Slice* blob, Slice* xid) {
+ assert(key != nullptr && value != nullptr);
+ *tag = (*input)[0];
+ input->remove_prefix(1);
+ *column_family = 0; // default
+ switch (*tag) {
+ case kTypeColumnFamilyValue:
+ if (!GetVarint32(input, column_family)) {
+ return Status::Corruption("bad WriteBatch Put");
+ }
+ FALLTHROUGH_INTENDED;
+ case kTypeValue:
+ if (!GetLengthPrefixedSlice(input, key) ||
+ !GetLengthPrefixedSlice(input, value)) {
+ return Status::Corruption("bad WriteBatch Put");
+ }
+ break;
+ case kTypeColumnFamilyDeletion:
+ case kTypeColumnFamilySingleDeletion:
+ if (!GetVarint32(input, column_family)) {
+ return Status::Corruption("bad WriteBatch Delete");
+ }
+ FALLTHROUGH_INTENDED;
+ case kTypeDeletion:
+ case kTypeSingleDeletion:
+ if (!GetLengthPrefixedSlice(input, key)) {
+ return Status::Corruption("bad WriteBatch Delete");
+ }
+ break;
+ case kTypeColumnFamilyRangeDeletion:
+ if (!GetVarint32(input, column_family)) {
+ return Status::Corruption("bad WriteBatch DeleteRange");
+ }
+ FALLTHROUGH_INTENDED;
+ case kTypeRangeDeletion:
+ // for range delete, "key" is begin_key, "value" is end_key
+ if (!GetLengthPrefixedSlice(input, key) ||
+ !GetLengthPrefixedSlice(input, value)) {
+ return Status::Corruption("bad WriteBatch DeleteRange");
+ }
+ break;
+ case kTypeColumnFamilyMerge:
+ if (!GetVarint32(input, column_family)) {
+ return Status::Corruption("bad WriteBatch Merge");
+ }
+ FALLTHROUGH_INTENDED;
+ case kTypeMerge:
+ if (!GetLengthPrefixedSlice(input, key) ||
+ !GetLengthPrefixedSlice(input, value)) {
+ return Status::Corruption("bad WriteBatch Merge");
+ }
+ break;
+ case kTypeColumnFamilyBlobIndex:
+ if (!GetVarint32(input, column_family)) {
+ return Status::Corruption("bad WriteBatch BlobIndex");
+ }
+ FALLTHROUGH_INTENDED;
+ case kTypeBlobIndex:
+ if (!GetLengthPrefixedSlice(input, key) ||
+ !GetLengthPrefixedSlice(input, value)) {
+ return Status::Corruption("bad WriteBatch BlobIndex");
+ }
+ break;
+ case kTypeLogData:
+ assert(blob != nullptr);
+ if (!GetLengthPrefixedSlice(input, blob)) {
+ return Status::Corruption("bad WriteBatch Blob");
+ }
+ break;
+ case kTypeNoop:
+ case kTypeBeginPrepareXID:
+ // This indicates that the prepared batch is also persisted in the db.
+ // This is used in WritePreparedTxn
+ case kTypeBeginPersistedPrepareXID:
+ // This is used in WriteUnpreparedTxn
+ case kTypeBeginUnprepareXID:
+ break;
+ case kTypeEndPrepareXID:
+ if (!GetLengthPrefixedSlice(input, xid)) {
+ return Status::Corruption("bad EndPrepare XID");
+ }
+ break;
+ case kTypeCommitXID:
+ if (!GetLengthPrefixedSlice(input, xid)) {
+ return Status::Corruption("bad Commit XID");
+ }
+ break;
+ case kTypeRollbackXID:
+ if (!GetLengthPrefixedSlice(input, xid)) {
+ return Status::Corruption("bad Rollback XID");
+ }
+ break;
+ default:
+ return Status::Corruption("unknown WriteBatch tag");
+ }
+ return Status::OK();
+}
+
+Status WriteBatch::Iterate(Handler* handler) const {
+ if (rep_.size() < WriteBatchInternal::kHeader) {
+ return Status::Corruption("malformed WriteBatch (too small)");
+ }
+
+ return WriteBatchInternal::Iterate(this, handler, WriteBatchInternal::kHeader,
+ rep_.size());
+}
+
+Status WriteBatchInternal::Iterate(const WriteBatch* wb,
+ WriteBatch::Handler* handler, size_t begin,
+ size_t end) {
+ if (begin > wb->rep_.size() || end > wb->rep_.size() || end < begin) {
+ return Status::Corruption("Invalid start/end bounds for Iterate");
+ }
+ assert(begin <= end);
+ Slice input(wb->rep_.data() + begin, static_cast<size_t>(end - begin));
+ bool whole_batch =
+ (begin == WriteBatchInternal::kHeader) && (end == wb->rep_.size());
+
+ Slice key, value, blob, xid;
+  // Sometimes a sub-batch starts with a Noop. We want to exclude such Noops
+  // from being treated as batch boundary markers, otherwise we would
+  // mis-count the number of batches. We do that by checking whether the
+  // accumulated batch is empty before seeing the next Noop.
+ bool empty_batch = true;
+ uint32_t found = 0;
+ Status s;
+ char tag = 0;
+ uint32_t column_family = 0; // default
+ bool last_was_try_again = false;
+ bool handler_continue = true;
+ while (((s.ok() && !input.empty()) || UNLIKELY(s.IsTryAgain()))) {
+ handler_continue = handler->Continue();
+ if (!handler_continue) {
+ break;
+ }
+
+ if (LIKELY(!s.IsTryAgain())) {
+ last_was_try_again = false;
+ tag = 0;
+ column_family = 0; // default
+
+ s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value,
+ &blob, &xid);
+ if (!s.ok()) {
+ return s;
+ }
+ } else {
+ assert(s.IsTryAgain());
+ assert(!last_was_try_again); // to detect infinite loop bugs
+ if (UNLIKELY(last_was_try_again)) {
+ return Status::Corruption(
+ "two consecutive TryAgain in WriteBatch handler; this is either a "
+ "software bug or data corruption.");
+ }
+ last_was_try_again = true;
+ s = Status::OK();
+ }
+
+ switch (tag) {
+ case kTypeColumnFamilyValue:
+ case kTypeValue:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_PUT));
+ s = handler->PutCF(column_family, key, value);
+ if (LIKELY(s.ok())) {
+ empty_batch = false;
+ found++;
+ }
+ break;
+ case kTypeColumnFamilyDeletion:
+ case kTypeDeletion:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_DELETE));
+ s = handler->DeleteCF(column_family, key);
+ if (LIKELY(s.ok())) {
+ empty_batch = false;
+ found++;
+ }
+ break;
+ case kTypeColumnFamilySingleDeletion:
+ case kTypeSingleDeletion:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_SINGLE_DELETE));
+ s = handler->SingleDeleteCF(column_family, key);
+ if (LIKELY(s.ok())) {
+ empty_batch = false;
+ found++;
+ }
+ break;
+ case kTypeColumnFamilyRangeDeletion:
+ case kTypeRangeDeletion:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_DELETE_RANGE));
+ s = handler->DeleteRangeCF(column_family, key, value);
+ if (LIKELY(s.ok())) {
+ empty_batch = false;
+ found++;
+ }
+ break;
+ case kTypeColumnFamilyMerge:
+ case kTypeMerge:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_MERGE));
+ s = handler->MergeCF(column_family, key, value);
+ if (LIKELY(s.ok())) {
+ empty_batch = false;
+ found++;
+ }
+ break;
+ case kTypeColumnFamilyBlobIndex:
+ case kTypeBlobIndex:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_BLOB_INDEX));
+ s = handler->PutBlobIndexCF(column_family, key, value);
+ if (LIKELY(s.ok())) {
+ found++;
+ }
+ break;
+ case kTypeLogData:
+ handler->LogData(blob);
+ // A batch might have nothing but LogData. It is still a batch.
+ empty_batch = false;
+ break;
+ case kTypeBeginPrepareXID:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE));
+ handler->MarkBeginPrepare();
+ empty_batch = false;
+ if (!handler->WriteAfterCommit()) {
+ s = Status::NotSupported(
+ "WriteCommitted txn tag when write_after_commit_ is disabled (in "
+ "WritePrepared/WriteUnprepared mode). If it is not due to "
+ "corruption, the WAL must be emptied before changing the "
+ "WritePolicy.");
+ }
+ if (handler->WriteBeforePrepare()) {
+ s = Status::NotSupported(
+ "WriteCommitted txn tag when write_before_prepare_ is enabled "
+ "(in WriteUnprepared mode). If it is not due to corruption, the "
+ "WAL must be emptied before changing the WritePolicy.");
+ }
+ break;
+ case kTypeBeginPersistedPrepareXID:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE));
+ handler->MarkBeginPrepare();
+ empty_batch = false;
+ if (handler->WriteAfterCommit()) {
+ s = Status::NotSupported(
+ "WritePrepared/WriteUnprepared txn tag when write_after_commit_ "
+ "is enabled (in default WriteCommitted mode). If it is not due "
+ "to corruption, the WAL must be emptied before changing the "
+ "WritePolicy.");
+ }
+ break;
+ case kTypeBeginUnprepareXID:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_UNPREPARE));
+ handler->MarkBeginPrepare(true /* unprepared */);
+ empty_batch = false;
+ if (handler->WriteAfterCommit()) {
+ s = Status::NotSupported(
+ "WriteUnprepared txn tag when write_after_commit_ is enabled (in "
+ "default WriteCommitted mode). If it is not due to corruption, "
+ "the WAL must be emptied before changing the WritePolicy.");
+ }
+ if (!handler->WriteBeforePrepare()) {
+ s = Status::NotSupported(
+ "WriteUnprepared txn tag when write_before_prepare_ is disabled "
+ "(in WriteCommitted/WritePrepared mode). If it is not due to "
+ "corruption, the WAL must be emptied before changing the "
+ "WritePolicy.");
+ }
+ break;
+ case kTypeEndPrepareXID:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_END_PREPARE));
+ handler->MarkEndPrepare(xid);
+ empty_batch = true;
+ break;
+ case kTypeCommitXID:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_COMMIT));
+ handler->MarkCommit(xid);
+ empty_batch = true;
+ break;
+ case kTypeRollbackXID:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_ROLLBACK));
+ handler->MarkRollback(xid);
+ empty_batch = true;
+ break;
+ case kTypeNoop:
+ handler->MarkNoop(empty_batch);
+ empty_batch = true;
+ break;
+ default:
+ return Status::Corruption("unknown WriteBatch tag");
+ }
+ }
+ if (!s.ok()) {
+ return s;
+ }
+ if (handler_continue && whole_batch &&
+ found != WriteBatchInternal::Count(wb)) {
+ return Status::Corruption("WriteBatch has wrong count");
+ } else {
+ return Status::OK();
+ }
+}
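
A minimal sketch of the Handler protocol that Iterate() drives (illustrative,
not from the upstream sources; it uses only the public WriteBatch interface):

  #include "rocksdb/slice.h"
  #include "rocksdb/status.h"
  #include "rocksdb/write_batch.h"

  // Counts the Put and Delete records replayed out of a batch.
  class CountingHandler : public rocksdb::WriteBatch::Handler {
   public:
    rocksdb::Status PutCF(uint32_t /*cf*/, const rocksdb::Slice& /*key*/,
                          const rocksdb::Slice& /*value*/) override {
      ++puts;
      return rocksdb::Status::OK();
    }
    rocksdb::Status DeleteCF(uint32_t /*cf*/,
                             const rocksdb::Slice& /*key*/) override {
      ++deletes;
      return rocksdb::Status::OK();
    }
    int puts = 0;
    int deletes = 0;
  };

  rocksdb::WriteBatch batch;
  batch.Put("k1", "v1");
  batch.Delete("k2");
  CountingHandler handler;
  rocksdb::Status s = batch.Iterate(&handler);
  // Each record is dispatched to exactly one callback, in insertion order,
  // so handler.puts == 1 and handler.deletes == 1 here.
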
+
+bool WriteBatchInternal::IsLatestPersistentState(const WriteBatch* b) {
+ return b->is_latest_persistent_state_;
+}
+
+void WriteBatchInternal::SetAsLastestPersistentState(WriteBatch* b) {
+ b->is_latest_persistent_state_ = true;
+}
+
+uint32_t WriteBatchInternal::Count(const WriteBatch* b) {
+ return DecodeFixed32(b->rep_.data() + 8);
+}
+
+void WriteBatchInternal::SetCount(WriteBatch* b, uint32_t n) {
+ EncodeFixed32(&b->rep_[8], n);
+}
+
+SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) {
+ return SequenceNumber(DecodeFixed64(b->rep_.data()));
+}
+
+void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
+ EncodeFixed64(&b->rep_[0], seq);
+}
+
+size_t WriteBatchInternal::GetFirstOffset(WriteBatch* /*b*/) {
+ return WriteBatchInternal::kHeader;
+}
+
+Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
+ const Slice& key, const Slice& value) {
+ if (key.size() > size_t{port::kMaxUint32}) {
+ return Status::InvalidArgument("key is too large");
+ }
+ if (value.size() > size_t{port::kMaxUint32}) {
+ return Status::InvalidArgument("value is too large");
+ }
+
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeValue));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyValue));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ if (0 == b->timestamp_size_) {
+ PutLengthPrefixedSlice(&b->rep_, key);
+ } else {
+ PutVarint32(&b->rep_,
+ static_cast<uint32_t>(key.size() + b->timestamp_size_));
+ b->rep_.append(key.data(), key.size());
+ b->rep_.append(b->timestamp_size_, '\0');
+ }
+ PutLengthPrefixedSlice(&b->rep_, value);
+ b->content_flags_.store(
+ b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT,
+ std::memory_order_relaxed);
+ return save.commit();
+}
+
+Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) {
+ return WriteBatchInternal::Put(this, GetColumnFamilyID(column_family), key,
+ value);
+}
+
+Status WriteBatchInternal::CheckSlicePartsLength(const SliceParts& key,
+ const SliceParts& value) {
+ size_t total_key_bytes = 0;
+ for (int i = 0; i < key.num_parts; ++i) {
+ total_key_bytes += key.parts[i].size();
+ }
+ if (total_key_bytes >= size_t{port::kMaxUint32}) {
+ return Status::InvalidArgument("key is too large");
+ }
+
+ size_t total_value_bytes = 0;
+ for (int i = 0; i < value.num_parts; ++i) {
+ total_value_bytes += value.parts[i].size();
+ }
+ if (total_value_bytes >= size_t{port::kMaxUint32}) {
+ return Status::InvalidArgument("value is too large");
+ }
+ return Status::OK();
+}
+
+Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
+ const SliceParts& key, const SliceParts& value) {
+ Status s = CheckSlicePartsLength(key, value);
+ if (!s.ok()) {
+ return s;
+ }
+
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeValue));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyValue));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ if (0 == b->timestamp_size_) {
+ PutLengthPrefixedSliceParts(&b->rep_, key);
+ } else {
+ PutLengthPrefixedSlicePartsWithPadding(&b->rep_, key, b->timestamp_size_);
+ }
+ PutLengthPrefixedSliceParts(&b->rep_, value);
+ b->content_flags_.store(
+ b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT,
+ std::memory_order_relaxed);
+ return save.commit();
+}
+
+Status WriteBatch::Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+ const SliceParts& value) {
+ return WriteBatchInternal::Put(this, GetColumnFamilyID(column_family), key,
+ value);
+}
+
+Status WriteBatchInternal::InsertNoop(WriteBatch* b) {
+ b->rep_.push_back(static_cast<char>(kTypeNoop));
+ return Status::OK();
+}
+
+Status WriteBatchInternal::MarkEndPrepare(WriteBatch* b, const Slice& xid,
+ bool write_after_commit,
+ bool unprepared_batch) {
+ // a manually constructed batch can only contain one prepare section
+ assert(b->rep_[12] == static_cast<char>(kTypeNoop));
+
+ // all savepoints up to this point are cleared
+ if (b->save_points_ != nullptr) {
+ while (!b->save_points_->stack.empty()) {
+ b->save_points_->stack.pop();
+ }
+ }
+
+ // rewrite noop as begin marker
+ b->rep_[12] = static_cast<char>(
+ write_after_commit ? kTypeBeginPrepareXID
+ : (unprepared_batch ? kTypeBeginUnprepareXID
+ : kTypeBeginPersistedPrepareXID));
+ b->rep_.push_back(static_cast<char>(kTypeEndPrepareXID));
+ PutLengthPrefixedSlice(&b->rep_, xid);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_END_PREPARE |
+ ContentFlags::HAS_BEGIN_PREPARE,
+ std::memory_order_relaxed);
+ if (unprepared_batch) {
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_BEGIN_UNPREPARE,
+ std::memory_order_relaxed);
+ }
+ return Status::OK();
+}
+
+Status WriteBatchInternal::MarkCommit(WriteBatch* b, const Slice& xid) {
+ b->rep_.push_back(static_cast<char>(kTypeCommitXID));
+ PutLengthPrefixedSlice(&b->rep_, xid);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_COMMIT,
+ std::memory_order_relaxed);
+ return Status::OK();
+}
+
+Status WriteBatchInternal::MarkRollback(WriteBatch* b, const Slice& xid) {
+ b->rep_.push_back(static_cast<char>(kTypeRollbackXID));
+ PutLengthPrefixedSlice(&b->rep_, xid);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_ROLLBACK,
+ std::memory_order_relaxed);
+ return Status::OK();
+}
+
+Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id,
+ const Slice& key) {
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeDeletion));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyDeletion));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSlice(&b->rep_, key);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_DELETE,
+ std::memory_order_relaxed);
+ return save.commit();
+}
+
+Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key) {
+ return WriteBatchInternal::Delete(this, GetColumnFamilyID(column_family),
+ key);
+}
+
+Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id,
+ const SliceParts& key) {
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeDeletion));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyDeletion));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSliceParts(&b->rep_, key);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_DELETE,
+ std::memory_order_relaxed);
+ return save.commit();
+}
+
+Status WriteBatch::Delete(ColumnFamilyHandle* column_family,
+ const SliceParts& key) {
+ return WriteBatchInternal::Delete(this, GetColumnFamilyID(column_family),
+ key);
+}
+
+Status WriteBatchInternal::SingleDelete(WriteBatch* b,
+ uint32_t column_family_id,
+ const Slice& key) {
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeSingleDeletion));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilySingleDeletion));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSlice(&b->rep_, key);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_SINGLE_DELETE,
+ std::memory_order_relaxed);
+ return save.commit();
+}
+
+Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
+ const Slice& key) {
+ return WriteBatchInternal::SingleDelete(
+ this, GetColumnFamilyID(column_family), key);
+}
+
+Status WriteBatchInternal::SingleDelete(WriteBatch* b,
+ uint32_t column_family_id,
+ const SliceParts& key) {
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeSingleDeletion));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilySingleDeletion));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSliceParts(&b->rep_, key);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_SINGLE_DELETE,
+ std::memory_order_relaxed);
+ return save.commit();
+}
+
+Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
+ const SliceParts& key) {
+ return WriteBatchInternal::SingleDelete(
+ this, GetColumnFamilyID(column_family), key);
+}
+
+Status WriteBatchInternal::DeleteRange(WriteBatch* b, uint32_t column_family_id,
+ const Slice& begin_key,
+ const Slice& end_key) {
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeRangeDeletion));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyRangeDeletion));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSlice(&b->rep_, begin_key);
+ PutLengthPrefixedSlice(&b->rep_, end_key);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_DELETE_RANGE,
+ std::memory_order_relaxed);
+ return save.commit();
+}
+
+Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
+ const Slice& begin_key, const Slice& end_key) {
+ return WriteBatchInternal::DeleteRange(this, GetColumnFamilyID(column_family),
+ begin_key, end_key);
+}
+
+Status WriteBatchInternal::DeleteRange(WriteBatch* b, uint32_t column_family_id,
+ const SliceParts& begin_key,
+ const SliceParts& end_key) {
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeRangeDeletion));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyRangeDeletion));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSliceParts(&b->rep_, begin_key);
+ PutLengthPrefixedSliceParts(&b->rep_, end_key);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_DELETE_RANGE,
+ std::memory_order_relaxed);
+ return save.commit();
+}
+
+Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
+ const SliceParts& begin_key,
+ const SliceParts& end_key) {
+ return WriteBatchInternal::DeleteRange(this, GetColumnFamilyID(column_family),
+ begin_key, end_key);
+}
+
+Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id,
+ const Slice& key, const Slice& value) {
+ if (key.size() > size_t{port::kMaxUint32}) {
+ return Status::InvalidArgument("key is too large");
+ }
+ if (value.size() > size_t{port::kMaxUint32}) {
+ return Status::InvalidArgument("value is too large");
+ }
+
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeMerge));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyMerge));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSlice(&b->rep_, key);
+ PutLengthPrefixedSlice(&b->rep_, value);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_MERGE,
+ std::memory_order_relaxed);
+ return save.commit();
+}
+
+Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) {
+ return WriteBatchInternal::Merge(this, GetColumnFamilyID(column_family), key,
+ value);
+}
+
+Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id,
+ const SliceParts& key,
+ const SliceParts& value) {
+ Status s = CheckSlicePartsLength(key, value);
+ if (!s.ok()) {
+ return s;
+ }
+
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeMerge));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyMerge));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSliceParts(&b->rep_, key);
+ PutLengthPrefixedSliceParts(&b->rep_, value);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_MERGE,
+ std::memory_order_relaxed);
+ return save.commit();
+}
+
+Status WriteBatch::Merge(ColumnFamilyHandle* column_family,
+ const SliceParts& key, const SliceParts& value) {
+ return WriteBatchInternal::Merge(this, GetColumnFamilyID(column_family), key,
+ value);
+}
+
+Status WriteBatchInternal::PutBlobIndex(WriteBatch* b,
+ uint32_t column_family_id,
+ const Slice& key, const Slice& value) {
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeBlobIndex));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyBlobIndex));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSlice(&b->rep_, key);
+ PutLengthPrefixedSlice(&b->rep_, value);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_BLOB_INDEX,
+ std::memory_order_relaxed);
+ return save.commit();
+}
+
+Status WriteBatch::PutLogData(const Slice& blob) {
+ LocalSavePoint save(this);
+ rep_.push_back(static_cast<char>(kTypeLogData));
+ PutLengthPrefixedSlice(&rep_, blob);
+ return save.commit();
+}
+
+void WriteBatch::SetSavePoint() {
+ if (save_points_ == nullptr) {
+ save_points_.reset(new SavePoints());
+ }
+ // Record length and count of current batch of writes.
+ save_points_->stack.push(SavePoint(
+ GetDataSize(), Count(), content_flags_.load(std::memory_order_relaxed)));
+}
+
+Status WriteBatch::RollbackToSavePoint() {
+ if (save_points_ == nullptr || save_points_->stack.size() == 0) {
+ return Status::NotFound();
+ }
+
+ // Pop the most recent savepoint off the stack
+ SavePoint savepoint = save_points_->stack.top();
+ save_points_->stack.pop();
+
+ assert(savepoint.size <= rep_.size());
+ assert(static_cast<uint32_t>(savepoint.count) <= Count());
+
+ if (savepoint.size == rep_.size()) {
+ // No changes to rollback
+ } else if (savepoint.size == 0) {
+ // Rollback everything
+ Clear();
+ } else {
+ rep_.resize(savepoint.size);
+ WriteBatchInternal::SetCount(this, savepoint.count);
+ content_flags_.store(savepoint.content_flags, std::memory_order_relaxed);
+ }
+
+ return Status::OK();
+}
+
+Status WriteBatch::PopSavePoint() {
+ if (save_points_ == nullptr || save_points_->stack.size() == 0) {
+ return Status::NotFound();
+ }
+
+ // Pop the most recent savepoint off the stack
+ save_points_->stack.pop();
+
+ return Status::OK();
+}
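
A minimal usage sketch of the savepoint API above (illustrative only; keys and
values are made up):

  rocksdb::WriteBatch batch;
  batch.Put("always", "kept");

  batch.SetSavePoint();
  batch.Put("maybe", "dropped");
  batch.Delete("always");

  // Undo everything appended since the matching SetSavePoint().
  rocksdb::Status s = batch.RollbackToSavePoint();
  // batch now holds only Put("always", "kept"). Calling PopSavePoint()
  // instead would have kept the writes and merely discarded the savepoint.
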
+
+Status WriteBatch::AssignTimestamp(const Slice& ts) {
+ TimestampAssigner ts_assigner(ts);
+ return Iterate(&ts_assigner);
+}
+
+Status WriteBatch::AssignTimestamps(const std::vector<Slice>& ts_list) {
+ TimestampAssigner ts_assigner(ts_list);
+ return Iterate(&ts_assigner);
+}
+
+class MemTableInserter : public WriteBatch::Handler {
+
+ SequenceNumber sequence_;
+ ColumnFamilyMemTables* const cf_mems_;
+ FlushScheduler* const flush_scheduler_;
+ TrimHistoryScheduler* const trim_history_scheduler_;
+ const bool ignore_missing_column_families_;
+ const uint64_t recovering_log_number_;
+ // log number that all Memtables inserted into should reference
+ uint64_t log_number_ref_;
+ DBImpl* db_;
+ const bool concurrent_memtable_writes_;
+ bool post_info_created_;
+
+ bool* has_valid_writes_;
+  // On some (!) platforms just default-constructing a map is too expensive in
+  // the Write() path, as it causes memory allocations even though the map may
+  // go unused. Make its creation optional, but without incurring the extra
+  // allocation of a std::unique_ptr.
+ using MemPostInfoMap = std::map<MemTable*, MemTablePostProcessInfo>;
+ using PostMapType = std::aligned_storage<sizeof(MemPostInfoMap)>::type;
+ PostMapType mem_post_info_map_;
+  // The recovered transaction we are currently rebuilding (during recovery)
+ WriteBatch* rebuilding_trx_;
+ SequenceNumber rebuilding_trx_seq_;
+  // Increase the seq number once per write batch. Otherwise increase it once
+  // per key.
+ bool seq_per_batch_;
+ // Whether the memtable write will be done only after the commit
+ bool write_after_commit_;
+ // Whether memtable write can be done before prepare
+ bool write_before_prepare_;
+ // Whether this batch was unprepared or not
+ bool unprepared_batch_;
+ using DupDetector = std::aligned_storage<sizeof(DuplicateDetector)>::type;
+ DupDetector duplicate_detector_;
+ bool dup_dectector_on_;
+
+ bool hint_per_batch_;
+ bool hint_created_;
+ // Hints for this batch
+ using HintMap = std::unordered_map<MemTable*, void*>;
+ using HintMapType = std::aligned_storage<sizeof(HintMap)>::type;
+ HintMapType hint_;
+
+ HintMap& GetHintMap() {
+ assert(hint_per_batch_);
+ if (!hint_created_) {
+ new (&hint_) HintMap();
+ hint_created_ = true;
+ }
+ return *reinterpret_cast<HintMap*>(&hint_);
+ }
+
+ MemPostInfoMap& GetPostMap() {
+ assert(concurrent_memtable_writes_);
+ if(!post_info_created_) {
+ new (&mem_post_info_map_) MemPostInfoMap();
+ post_info_created_ = true;
+ }
+ return *reinterpret_cast<MemPostInfoMap*>(&mem_post_info_map_);
+ }
+
+ bool IsDuplicateKeySeq(uint32_t column_family_id, const Slice& key) {
+ assert(!write_after_commit_);
+ assert(rebuilding_trx_ != nullptr);
+ if (!dup_dectector_on_) {
+ new (&duplicate_detector_) DuplicateDetector(db_);
+ dup_dectector_on_ = true;
+ }
+ return reinterpret_cast<DuplicateDetector*>
+ (&duplicate_detector_)->IsDuplicateKeySeq(column_family_id, key, sequence_);
+ }
+
+ protected:
+ bool WriteBeforePrepare() const override { return write_before_prepare_; }
+ bool WriteAfterCommit() const override { return write_after_commit_; }
+
+ public:
+ // cf_mems should not be shared with concurrent inserters
+ MemTableInserter(SequenceNumber _sequence, ColumnFamilyMemTables* cf_mems,
+ FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families,
+ uint64_t recovering_log_number, DB* db,
+ bool concurrent_memtable_writes,
+ bool* has_valid_writes = nullptr, bool seq_per_batch = false,
+ bool batch_per_txn = true, bool hint_per_batch = false)
+ : sequence_(_sequence),
+ cf_mems_(cf_mems),
+ flush_scheduler_(flush_scheduler),
+ trim_history_scheduler_(trim_history_scheduler),
+ ignore_missing_column_families_(ignore_missing_column_families),
+ recovering_log_number_(recovering_log_number),
+ log_number_ref_(0),
+ db_(static_cast_with_check<DBImpl, DB>(db)),
+ concurrent_memtable_writes_(concurrent_memtable_writes),
+ post_info_created_(false),
+ has_valid_writes_(has_valid_writes),
+ rebuilding_trx_(nullptr),
+ rebuilding_trx_seq_(0),
+ seq_per_batch_(seq_per_batch),
+ // Write after commit currently uses one seq per key (instead of per
+ // batch). So seq_per_batch being false indicates write_after_commit
+ // approach.
+ write_after_commit_(!seq_per_batch),
+ // WriteUnprepared can write WriteBatches per transaction, so
+ // batch_per_txn being false indicates write_before_prepare.
+ write_before_prepare_(!batch_per_txn),
+ unprepared_batch_(false),
+ duplicate_detector_(),
+ dup_dectector_on_(false),
+ hint_per_batch_(hint_per_batch),
+ hint_created_(false) {
+ assert(cf_mems_);
+ }
+
+ ~MemTableInserter() override {
+ if (dup_dectector_on_) {
+ reinterpret_cast<DuplicateDetector*>
+ (&duplicate_detector_)->~DuplicateDetector();
+ }
+ if (post_info_created_) {
+ reinterpret_cast<MemPostInfoMap*>
+ (&mem_post_info_map_)->~MemPostInfoMap();
+ }
+ if (hint_created_) {
+ for (auto iter : GetHintMap()) {
+ delete[] reinterpret_cast<char*>(iter.second);
+ }
+ reinterpret_cast<HintMap*>(&hint_)->~HintMap();
+ }
+ delete rebuilding_trx_;
+ }
+
+ MemTableInserter(const MemTableInserter&) = delete;
+ MemTableInserter& operator=(const MemTableInserter&) = delete;
+
+ // The batch seq is regularly restarted; In normal mode it is set when
+ // MemTableInserter is constructed in the write thread and in recovery mode it
+ // is set when a batch, which is tagged with seq, is read from the WAL.
+ // Within a sequenced batch, which could be a merge of multiple batches, we
+ // have two policies to advance the seq: i) seq_per_key (default) and ii)
+ // seq_per_batch. To implement the latter we need to mark the boundary between
+ // the individual batches. The approach is this: 1) Use the terminating
+ // markers to indicate the boundary (kTypeEndPrepareXID, kTypeCommitXID,
+ // kTypeRollbackXID) 2) Terminate a batch with kTypeNoop in the absence of a
+ // natural boundary marker.
+ void MaybeAdvanceSeq(bool batch_boundry = false) {
+ if (batch_boundry == seq_per_batch_) {
+ sequence_++;
+ }
+ }
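
A worked illustration of the two policies (a sketch based on the comment
above): suppose one WAL entry carries sub-batches B1 = {Put a, Put b} and
B2 = {Put c}, starting at sequence 100.

  seq_per_key   (seq_per_batch_ == false): a -> 100, b -> 101, c -> 102
  seq_per_batch (seq_per_batch_ == true) : a and b both get 100, c gets 101;
                                           MaybeAdvanceSeq(true) fires only at
                                           the boundary markers between them.
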
+
+ void set_log_number_ref(uint64_t log) { log_number_ref_ = log; }
+
+ SequenceNumber sequence() const { return sequence_; }
+
+ void PostProcess() {
+ assert(concurrent_memtable_writes_);
+ // If post info was not created, there is nothing to process and no need
+ // to create it on demand.
+ if (post_info_created_) {
+ for (auto& pair : GetPostMap()) {
+ pair.first->BatchPostProcess(pair.second);
+ }
+ }
+ }
+
+ bool SeekToColumnFamily(uint32_t column_family_id, Status* s) {
+ // If we are in concurrent mode, it is the caller's responsibility
+ // to clone the original ColumnFamilyMemTables so that each thread
+ // has its own instance. Otherwise, it must be guaranteed that there
+ // is no concurrent access.
+ bool found = cf_mems_->Seek(column_family_id);
+ if (!found) {
+ if (ignore_missing_column_families_) {
+ *s = Status::OK();
+ } else {
+ *s = Status::InvalidArgument(
+ "Invalid column family specified in write batch");
+ }
+ return false;
+ }
+ if (recovering_log_number_ != 0 &&
+ recovering_log_number_ < cf_mems_->GetLogNumber()) {
+ // This is true only in the recovery environment (recovering_log_number_
+ // is always 0 in the non-recovery, regular write code-path):
+ // * If recovering_log_number_ < cf_mems_->GetLogNumber(), the column
+ // family already contains updates from this log. We can't apply updates
+ // twice because of update-in-place or merge workloads -- ignore the
+ // update.
+ *s = Status::OK();
+ return false;
+ }
+
+ if (has_valid_writes_ != nullptr) {
+ *has_valid_writes_ = true;
+ }
+
+ if (log_number_ref_ > 0) {
+ cf_mems_->GetMemTable()->RefLogContainingPrepSection(log_number_ref_);
+ }
+
+ return true;
+ }
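
The recovery filter in SeekToColumnFamily boils down to a comparison of log numbers; a hedged standalone sketch of that rule (function and parameter names are illustrative, not RocksDB API):

    #include <cstdint>
    #include <iostream>

    // Returns true when a recovered WAL update should be applied to a column family.
    bool ShouldApplyRecoveredUpdate(uint64_t recovering_log_number,
                                    uint64_t cf_log_number) {
      if (recovering_log_number == 0) {
        return true;  // regular (non-recovery) write path: always apply
      }
      // Updates from logs older than the column family's log number were
      // already flushed; re-applying them could corrupt update-in-place or
      // merge workloads.
      return recovering_log_number >= cf_log_number;
    }

    int main() {
      std::cout << ShouldApplyRecoveredUpdate(5, 7) << "\n";  // 0: skip, already flushed
      std::cout << ShouldApplyRecoveredUpdate(9, 7) << "\n";  // 1: apply
      return 0;
    }
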
+
+ Status PutCFImpl(uint32_t column_family_id, const Slice& key,
+ const Slice& value, ValueType value_type) {
+ // optimize for non-recovery mode
+ if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+ WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value);
+ return Status::OK();
+ // else insert into the memtable right away
+ }
+
+ Status seek_status;
+ if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) {
+ bool batch_boundry = false;
+ if (rebuilding_trx_ != nullptr) {
+ assert(!write_after_commit_);
+ // The CF has probably been flushed, so there is no need to insert, but we
+ // still need to keep track of the keys for the upcoming rollback/commit.
+ WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value);
+ batch_boundry = IsDuplicateKeySeq(column_family_id, key);
+ }
+ MaybeAdvanceSeq(batch_boundry);
+ return seek_status;
+ }
+ Status ret_status;
+
+ MemTable* mem = cf_mems_->GetMemTable();
+ auto* moptions = mem->GetImmutableMemTableOptions();
+ // inplace_update_support is inconsistent with snapshots, and therefore with
+ // any kind of transactions including the ones that use seq_per_batch
+ assert(!seq_per_batch_ || !moptions->inplace_update_support);
+ if (!moptions->inplace_update_support) {
+ bool mem_res =
+ mem->Add(sequence_, value_type, key, value,
+ concurrent_memtable_writes_, get_post_process_info(mem),
+ hint_per_batch_ ? &GetHintMap()[mem] : nullptr);
+ if (UNLIKELY(!mem_res)) {
+ assert(seq_per_batch_);
+ ret_status = Status::TryAgain("key+seq exists");
+ const bool BATCH_BOUNDRY = true;
+ MaybeAdvanceSeq(BATCH_BOUNDRY);
+ }
+ } else if (moptions->inplace_callback == nullptr) {
+ assert(!concurrent_memtable_writes_);
+ mem->Update(sequence_, key, value);
+ } else {
+ assert(!concurrent_memtable_writes_);
+ if (mem->UpdateCallback(sequence_, key, value)) {
+ } else {
+ // key not found in memtable. Do sst get, update, add
+ SnapshotImpl read_from_snapshot;
+ read_from_snapshot.number_ = sequence_;
+ ReadOptions ropts;
+ // it's going to be overwritten for sure, so there is no point caching the
+ // data block containing the old version
+ ropts.fill_cache = false;
+ ropts.snapshot = &read_from_snapshot;
+
+ std::string prev_value;
+ std::string merged_value;
+
+ auto cf_handle = cf_mems_->GetColumnFamilyHandle();
+ Status s = Status::NotSupported();
+ if (db_ != nullptr && recovering_log_number_ == 0) {
+ if (cf_handle == nullptr) {
+ cf_handle = db_->DefaultColumnFamily();
+ }
+ s = db_->Get(ropts, cf_handle, key, &prev_value);
+ }
+
+ char* prev_buffer = const_cast<char*>(prev_value.c_str());
+ uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
+ auto status = moptions->inplace_callback(s.ok() ? prev_buffer : nullptr,
+ s.ok() ? &prev_size : nullptr,
+ value, &merged_value);
+ if (status == UpdateStatus::UPDATED_INPLACE) {
+ // prev_value is updated in-place with final value.
+ bool mem_res __attribute__((__unused__));
+ mem_res = mem->Add(
+ sequence_, value_type, key, Slice(prev_buffer, prev_size));
+ assert(mem_res);
+ RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
+ } else if (status == UpdateStatus::UPDATED) {
+ // merged_value contains the final value.
+ bool mem_res __attribute__((__unused__));
+ mem_res =
+ mem->Add(sequence_, value_type, key, Slice(merged_value));
+ assert(mem_res);
+ RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
+ }
+ }
+ }
+ // optimize for non-recovery mode
+ if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) {
+ assert(!write_after_commit_);
+ // If ret_status is TryAgain, let the next attempt add the key to the
+ // rebuilding transaction object.
+ WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value);
+ }
+ // Since all Puts are logged in the transaction logs (if enabled), always
+ // bump the sequence number, even if the update eventually fails and does
+ // not result in a memtable add/update.
+ MaybeAdvanceSeq();
+ CheckMemtableFull();
+ return ret_status;
+ }
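
For the inplace_callback branch above, the callback receives the existing value buffer (or nullptr), its size, the new value, and an output string, and reports UPDATED_INPLACE or UPDATED. Below is a hedged sketch of such a callback, consistent with the call site above but not the one RocksDB ships; OverwriteIfFits is an illustrative name, and the exact UpdateStatus declaration should be checked in the options headers. It would be installed via ColumnFamilyOptions::inplace_callback together with inplace_update_support = true.

    #include <cstdint>
    #include <cstring>
    #include <string>
    #include "rocksdb/options.h"  // UpdateStatus
    #include "rocksdb/slice.h"

    using ROCKSDB_NAMESPACE::Slice;
    using ROCKSDB_NAMESPACE::UpdateStatus;

    UpdateStatus OverwriteIfFits(char* existing_value,
                                 uint32_t* existing_value_size,
                                 Slice delta_value, std::string* merged_value) {
      if (existing_value != nullptr && existing_value_size != nullptr &&
          delta_value.size() <= *existing_value_size) {
        // The key exists and the new value fits: update the buffer in place.
        std::memcpy(existing_value, delta_value.data(), delta_value.size());
        *existing_value_size = static_cast<uint32_t>(delta_value.size());
        return UpdateStatus::UPDATED_INPLACE;
      }
      // Key missing or buffer too small: return the final value so the caller
      // adds a fresh memtable entry (the UPDATED path above).
      merged_value->assign(delta_value.data(), delta_value.size());
      return UpdateStatus::UPDATED;
    }
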
+
+ Status PutCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ return PutCFImpl(column_family_id, key, value, kTypeValue);
+ }
+
+ Status DeleteImpl(uint32_t /*column_family_id*/, const Slice& key,
+ const Slice& value, ValueType delete_type) {
+ Status ret_status;
+ MemTable* mem = cf_mems_->GetMemTable();
+ bool mem_res =
+ mem->Add(sequence_, delete_type, key, value,
+ concurrent_memtable_writes_, get_post_process_info(mem),
+ hint_per_batch_ ? &GetHintMap()[mem] : nullptr);
+ if (UNLIKELY(!mem_res)) {
+ assert(seq_per_batch_);
+ ret_status = Status::TryAgain("key+seq exists");
+ const bool BATCH_BOUNDRY = true;
+ MaybeAdvanceSeq(BATCH_BOUNDRY);
+ }
+ MaybeAdvanceSeq();
+ CheckMemtableFull();
+ return ret_status;
+ }
+
+ Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
+ // optimize for non-recovery mode
+ if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+ WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key);
+ return Status::OK();
+ // else insert into the memtable right away
+ }
+
+ Status seek_status;
+ if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) {
+ bool batch_boundry = false;
+ if (rebuilding_trx_ != nullptr) {
+ assert(!write_after_commit_);
+ // The CF has probably been flushed, so there is no need to insert, but we
+ // still need to keep track of the keys for the upcoming rollback/commit.
+ WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key);
+ batch_boundry = IsDuplicateKeySeq(column_family_id, key);
+ }
+ MaybeAdvanceSeq(batch_boundry);
+ return seek_status;
+ }
+
+ auto ret_status = DeleteImpl(column_family_id, key, Slice(), kTypeDeletion);
+ // optimize for non-recovery mode
+ if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) {
+ assert(!write_after_commit_);
+ // If ret_status is TryAgain, let the next attempt add the key to the
+ // rebuilding transaction object.
+ WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key);
+ }
+ return ret_status;
+ }
+
+ Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override {
+ // optimize for non-recovery mode
+ if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+ WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, key);
+ return Status::OK();
+ // else insert into the memtable right away
+ }
+
+ Status seek_status;
+ if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) {
+ bool batch_boundry = false;
+ if (rebuilding_trx_ != nullptr) {
+ assert(!write_after_commit_);
+ // The CF has probably been flushed, so there is no need to insert, but we
+ // still need to keep track of the keys for the upcoming rollback/commit.
+ WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id,
+ key);
+ batch_boundry = IsDuplicateKeySeq(column_family_id, key);
+ }
+ MaybeAdvanceSeq(batch_boundry);
+ return seek_status;
+ }
+
+ auto ret_status =
+ DeleteImpl(column_family_id, key, Slice(), kTypeSingleDeletion);
+ // optimize for non-recovery mode
+ if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) {
+ assert(!write_after_commit_);
+ // If ret_status is TryAgain, let the next attempt add the key to the
+ // rebuilding transaction object.
+ WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, key);
+ }
+ return ret_status;
+ }
+
+ Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key,
+ const Slice& end_key) override {
+ // optimize for non-recovery mode
+ if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+ WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id,
+ begin_key, end_key);
+ return Status::OK();
+ // else insert into the memtable right away
+ }
+
+ Status seek_status;
+ if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) {
+ bool batch_boundry = false;
+ if (rebuilding_trx_ != nullptr) {
+ assert(!write_after_commit_);
+ // The CF has probably been flushed, so there is no need to insert, but we
+ // still need to keep track of the keys for the upcoming rollback/commit.
+ WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id,
+ begin_key, end_key);
+ // TODO(myabandeh): when transactional DeleteRange support is added,
+ // check if end_key must also be added.
+ batch_boundry = IsDuplicateKeySeq(column_family_id, begin_key);
+ }
+ MaybeAdvanceSeq(batch_boundry);
+ return seek_status;
+ }
+ if (db_ != nullptr) {
+ auto cf_handle = cf_mems_->GetColumnFamilyHandle();
+ if (cf_handle == nullptr) {
+ cf_handle = db_->DefaultColumnFamily();
+ }
+ auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cf_handle)->cfd();
+ if (!cfd->is_delete_range_supported()) {
+ return Status::NotSupported(
+ std::string("DeleteRange not supported for table type ") +
+ cfd->ioptions()->table_factory->Name() + " in CF " +
+ cfd->GetName());
+ }
+ }
+
+ auto ret_status =
+ DeleteImpl(column_family_id, begin_key, end_key, kTypeRangeDeletion);
+ // optimize for non-recovery mode
+ if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) {
+ assert(!write_after_commit_);
+ // If ret_status is TryAgain, let the next attempt add the key to the
+ // rebuilding transaction object.
+ WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id,
+ begin_key, end_key);
+ }
+ return ret_status;
+ }
+
+ Status MergeCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ // optimize for non-recovery mode
+ if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+ WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, value);
+ return Status::OK();
+ // else insert into the memtable right away
+ }
+
+ Status seek_status;
+ if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) {
+ bool batch_boundry = false;
+ if (rebuilding_trx_ != nullptr) {
+ assert(!write_after_commit_);
+ // The CF has probably been flushed, so there is no need to insert, but we
+ // still need to keep track of the keys for the upcoming rollback/commit.
+ WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key,
+ value);
+ batch_boundry = IsDuplicateKeySeq(column_family_id, key);
+ }
+ MaybeAdvanceSeq(batch_boundry);
+ return seek_status;
+ }
+
+ Status ret_status;
+ MemTable* mem = cf_mems_->GetMemTable();
+ auto* moptions = mem->GetImmutableMemTableOptions();
+ bool perform_merge = false;
+ assert(!concurrent_memtable_writes_ ||
+ moptions->max_successive_merges == 0);
+
+ // If we pass the DB through and options.max_successive_merges is hit
+ // during recovery, a Get() will be issued, which will try to acquire the
+ // DB mutex and deadlock because that mutex is already held. So we disable
+ // merge during recovery.
+ if (moptions->max_successive_merges > 0 && db_ != nullptr &&
+ recovering_log_number_ == 0) {
+ assert(!concurrent_memtable_writes_);
+ LookupKey lkey(key, sequence_);
+
+ // Count the number of successive merges at the head
+ // of the key in the memtable
+ size_t num_merges = mem->CountSuccessiveMergeEntries(lkey);
+
+ if (num_merges >= moptions->max_successive_merges) {
+ perform_merge = true;
+ }
+ }
+
+ if (perform_merge) {
+ // 1) Get the existing value
+ std::string get_value;
+
+ // Pass in the sequence number so that we also include previous merge
+ // operations in the same batch.
+ SnapshotImpl read_from_snapshot;
+ read_from_snapshot.number_ = sequence_;
+ ReadOptions read_options;
+ read_options.snapshot = &read_from_snapshot;
+
+ auto cf_handle = cf_mems_->GetColumnFamilyHandle();
+ if (cf_handle == nullptr) {
+ cf_handle = db_->DefaultColumnFamily();
+ }
+ db_->Get(read_options, cf_handle, key, &get_value);
+ Slice get_value_slice = Slice(get_value);
+
+ // 2) Apply this merge
+ auto merge_operator = moptions->merge_operator;
+ assert(merge_operator);
+
+ std::string new_value;
+
+ Status merge_status = MergeHelper::TimedFullMerge(
+ merge_operator, key, &get_value_slice, {value}, &new_value,
+ moptions->info_log, moptions->statistics, Env::Default());
+
+ if (!merge_status.ok()) {
+ // Failed to merge!
+ // Store the delta in memtable
+ perform_merge = false;
+ } else {
+ // 3) Add value to memtable
+ assert(!concurrent_memtable_writes_);
+ bool mem_res = mem->Add(sequence_, kTypeValue, key, new_value);
+ if (UNLIKELY(!mem_res)) {
+ assert(seq_per_batch_);
+ ret_status = Status::TryAgain("key+seq exists");
+ const bool BATCH_BOUNDRY = true;
+ MaybeAdvanceSeq(BATCH_BOUNDRY);
+ }
+ }
+ }
+
+ if (!perform_merge) {
+ // Add merge operator to memtable
+ bool mem_res =
+ mem->Add(sequence_, kTypeMerge, key, value,
+ concurrent_memtable_writes_, get_post_process_info(mem));
+ if (UNLIKELY(!mem_res)) {
+ assert(seq_per_batch_);
+ ret_status = Status::TryAgain("key+seq exists");
+ const bool BATCH_BOUNDRY = true;
+ MaybeAdvanceSeq(BATCH_BOUNDRY);
+ }
+ }
+
+ // optimize for non-recovery mode
+ if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) {
+ assert(!write_after_commit_);
+ // If ret_status is TryAgain, let the next attempt add the key to the
+ // rebuilding transaction object.
+ WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, value);
+ }
+ MaybeAdvanceSeq();
+ CheckMemtableFull();
+ return ret_status;
+ }
+
+ Status PutBlobIndexCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ // Same as PutCF except for value type.
+ return PutCFImpl(column_family_id, key, value, kTypeBlobIndex);
+ }
+
+ void CheckMemtableFull() {
+ if (flush_scheduler_ != nullptr) {
+ auto* cfd = cf_mems_->current();
+ assert(cfd != nullptr);
+ if (cfd->mem()->ShouldScheduleFlush() &&
+ cfd->mem()->MarkFlushScheduled()) {
+ // MarkFlushScheduled only returns true if we are the one that
+ // should take action, so no need to dedup further
+ flush_scheduler_->ScheduleWork(cfd);
+ }
+ }
+ // check if memtable_list size exceeds max_write_buffer_size_to_maintain
+ if (trim_history_scheduler_ != nullptr) {
+ auto* cfd = cf_mems_->current();
+
+ assert(cfd);
+ assert(cfd->ioptions());
+
+ const size_t size_to_maintain = static_cast<size_t>(
+ cfd->ioptions()->max_write_buffer_size_to_maintain);
+
+ if (size_to_maintain > 0) {
+ MemTableList* const imm = cfd->imm();
+ assert(imm);
+
+ if (imm->HasHistory()) {
+ const MemTable* const mem = cfd->mem();
+ assert(mem);
+
+ if (mem->ApproximateMemoryUsageFast() +
+ imm->ApproximateMemoryUsageExcludingLast() >=
+ size_to_maintain &&
+ imm->MarkTrimHistoryNeeded()) {
+ trim_history_scheduler_->ScheduleWork(cfd);
+ }
+ }
+ }
+ }
+ }
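
A standalone sketch of the trim-history trigger above (illustrative names, not RocksDB API): trimming is scheduled once the active memtable plus the retained immutable history reach max_write_buffer_size_to_maintain.

    #include <cstddef>
    #include <iostream>

    bool ShouldScheduleTrimHistory(size_t active_mem_bytes,
                                   size_t imm_history_bytes_excluding_last,
                                   size_t max_write_buffer_size_to_maintain) {
      if (max_write_buffer_size_to_maintain == 0) {
        return false;  // history trimming disabled
      }
      return active_mem_bytes + imm_history_bytes_excluding_last >=
             max_write_buffer_size_to_maintain;
    }

    int main() {
      // 48MB active + 80MB retained history against a 128MB budget -> trim.
      std::cout << ShouldScheduleTrimHistory(48u << 20, 80u << 20, 128u << 20)
                << "\n";  // 1
      return 0;
    }
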
+
+ // The write batch handler calls MarkBeginPrepare with unprepare set to true
+ // if it encounters the kTypeBeginUnprepareXID marker.
+ Status MarkBeginPrepare(bool unprepare) override {
+ assert(rebuilding_trx_ == nullptr);
+ assert(db_);
+
+ if (recovering_log_number_ != 0) {
+ // during recovery we rebuild a hollow transaction
+ // from all encountered prepare sections of the wal
+ if (db_->allow_2pc() == false) {
+ return Status::NotSupported(
+ "WAL contains prepared transactions. Open with "
+ "TransactionDB::Open().");
+ }
+
+ // we are now iterating through a prepared section
+ rebuilding_trx_ = new WriteBatch();
+ rebuilding_trx_seq_ = sequence_;
+ // Verify that we have matching MarkBeginPrepare/MarkEndPrepare markers.
+ // unprepared_batch_ should be false because it is false by default, and
+ // gets reset to false in MarkEndPrepare.
+ assert(!unprepared_batch_);
+ unprepared_batch_ = unprepare;
+
+ if (has_valid_writes_ != nullptr) {
+ *has_valid_writes_ = true;
+ }
+ }
+
+ return Status::OK();
+ }
+
+ Status MarkEndPrepare(const Slice& name) override {
+ assert(db_);
+ assert((rebuilding_trx_ != nullptr) == (recovering_log_number_ != 0));
+
+ if (recovering_log_number_ != 0) {
+ assert(db_->allow_2pc());
+ size_t batch_cnt =
+ write_after_commit_
+ ? 0 // 0 will disable further checks
+ : static_cast<size_t>(sequence_ - rebuilding_trx_seq_ + 1);
+ db_->InsertRecoveredTransaction(recovering_log_number_, name.ToString(),
+ rebuilding_trx_, rebuilding_trx_seq_,
+ batch_cnt, unprepared_batch_);
+ unprepared_batch_ = false;
+ rebuilding_trx_ = nullptr;
+ } else {
+ assert(rebuilding_trx_ == nullptr);
+ }
+ const bool batch_boundry = true;
+ MaybeAdvanceSeq(batch_boundry);
+
+ return Status::OK();
+ }
+
+ Status MarkNoop(bool empty_batch) override {
+ // A hack in pessimistic transactions could result in a noop at the start
+ // of the write batch; that noop should be ignored.
+ if (!empty_batch) {
+ // In the absence of Prepare markers, a kTypeNoop tag indicates the end of
+ // a batch. This happens when a write batch commits while skipping the
+ // prepare phase.
+ const bool batch_boundry = true;
+ MaybeAdvanceSeq(batch_boundry);
+ }
+ return Status::OK();
+ }
+
+ Status MarkCommit(const Slice& name) override {
+ assert(db_);
+
+ Status s;
+
+ if (recovering_log_number_ != 0) {
+ // in recovery when we encounter a commit marker
+ // we lookup this transaction in our set of rebuilt transactions
+ // and commit.
+ auto trx = db_->GetRecoveredTransaction(name.ToString());
+
+ // the log containing the prepared section may have
+ // been released in the last incarnation because the
+ // data was flushed to L0
+ if (trx != nullptr) {
+ // at this point individual CF lognumbers will prevent
+ // duplicate re-insertion of values.
+ assert(log_number_ref_ == 0);
+ if (write_after_commit_) {
+ // write_after_commit_ can only have one batch in trx.
+ assert(trx->batches_.size() == 1);
+ const auto& batch_info = trx->batches_.begin()->second;
+ // all inserts must reference this trx log number
+ log_number_ref_ = batch_info.log_number_;
+ s = batch_info.batch_->Iterate(this);
+ log_number_ref_ = 0;
+ }
+ // else the values are already inserted before the commit
+
+ if (s.ok()) {
+ db_->DeleteRecoveredTransaction(name.ToString());
+ }
+ if (has_valid_writes_ != nullptr) {
+ *has_valid_writes_ = true;
+ }
+ }
+ } else {
+ // When writes are not delayed until commit, there is no disconnect between
+ // a memtable write and the WAL that supports it, so the commit does not
+ // need to reference any log as the one it depends on.
+ assert(!write_after_commit_ || log_number_ref_ > 0);
+ }
+ const bool batch_boundry = true;
+ MaybeAdvanceSeq(batch_boundry);
+
+ return s;
+ }
+
+ Status MarkRollback(const Slice& name) override {
+ assert(db_);
+
+ if (recovering_log_number_ != 0) {
+ auto trx = db_->GetRecoveredTransaction(name.ToString());
+
+ // the log containing the transaction's prep section
+ // may have been released in the previous incarnation
+ // because we knew it had been rolled back
+ if (trx != nullptr) {
+ db_->DeleteRecoveredTransaction(name.ToString());
+ }
+ } else {
+ // in non-recovery mode we simply ignore this tag
+ }
+
+ const bool batch_boundry = true;
+ MaybeAdvanceSeq(batch_boundry);
+
+ return Status::OK();
+ }
+
+ private:
+ MemTablePostProcessInfo* get_post_process_info(MemTable* mem) {
+ if (!concurrent_memtable_writes_) {
+ // No need to batch counters locally if we don't use concurrent mode.
+ return nullptr;
+ }
+ return &GetPostMap()[mem];
+ }
+};
+
+// This function can only be called under these conditions:
+// 1) During Recovery()
+// 2) During Write(), in a single-threaded write thread
+// 3) During Write(), in a concurrent context where the memtables have been
+//    cloned
+// The reason is that it calls memtables->Seek(), which has a stateful cache.
+Status WriteBatchInternal::InsertInto(
+ WriteThread::WriteGroup& write_group, SequenceNumber sequence,
+ ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families, uint64_t recovery_log_number, DB* db,
+ bool concurrent_memtable_writes, bool seq_per_batch, bool batch_per_txn) {
+ MemTableInserter inserter(
+ sequence, memtables, flush_scheduler, trim_history_scheduler,
+ ignore_missing_column_families, recovery_log_number, db,
+ concurrent_memtable_writes, nullptr /*has_valid_writes*/, seq_per_batch,
+ batch_per_txn);
+ for (auto w : write_group) {
+ if (w->CallbackFailed()) {
+ continue;
+ }
+ w->sequence = inserter.sequence();
+ if (!w->ShouldWriteToMemtable()) {
+ // In seq_per_batch_ mode this advances the seq by one.
+ inserter.MaybeAdvanceSeq(true);
+ continue;
+ }
+ SetSequence(w->batch, inserter.sequence());
+ inserter.set_log_number_ref(w->log_ref);
+ w->status = w->batch->Iterate(&inserter);
+ if (!w->status.ok()) {
+ return w->status;
+ }
+ assert(!seq_per_batch || w->batch_cnt != 0);
+ assert(!seq_per_batch || inserter.sequence() - w->sequence == w->batch_cnt);
+ }
+ return Status::OK();
+}
+
+Status WriteBatchInternal::InsertInto(
+ WriteThread::Writer* writer, SequenceNumber sequence,
+ ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families, uint64_t log_number, DB* db,
+ bool concurrent_memtable_writes, bool seq_per_batch, size_t batch_cnt,
+ bool batch_per_txn, bool hint_per_batch) {
+#ifdef NDEBUG
+ (void)batch_cnt;
+#endif
+ assert(writer->ShouldWriteToMemtable());
+ MemTableInserter inserter(
+ sequence, memtables, flush_scheduler, trim_history_scheduler,
+ ignore_missing_column_families, log_number, db,
+ concurrent_memtable_writes, nullptr /*has_valid_writes*/, seq_per_batch,
+ batch_per_txn, hint_per_batch);
+ SetSequence(writer->batch, sequence);
+ inserter.set_log_number_ref(writer->log_ref);
+ Status s = writer->batch->Iterate(&inserter);
+ assert(!seq_per_batch || batch_cnt != 0);
+ assert(!seq_per_batch || inserter.sequence() - sequence == batch_cnt);
+ if (concurrent_memtable_writes) {
+ inserter.PostProcess();
+ }
+ return s;
+}
+
+Status WriteBatchInternal::InsertInto(
+ const WriteBatch* batch, ColumnFamilyMemTables* memtables,
+ FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families, uint64_t log_number, DB* db,
+ bool concurrent_memtable_writes, SequenceNumber* next_seq,
+ bool* has_valid_writes, bool seq_per_batch, bool batch_per_txn) {
+ MemTableInserter inserter(Sequence(batch), memtables, flush_scheduler,
+ trim_history_scheduler,
+ ignore_missing_column_families, log_number, db,
+ concurrent_memtable_writes, has_valid_writes,
+ seq_per_batch, batch_per_txn);
+ Status s = batch->Iterate(&inserter);
+ if (next_seq != nullptr) {
+ *next_seq = inserter.sequence();
+ }
+ if (concurrent_memtable_writes) {
+ inserter.PostProcess();
+ }
+ return s;
+}
+
+Status WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
+ assert(contents.size() >= WriteBatchInternal::kHeader);
+ b->rep_.assign(contents.data(), contents.size());
+ b->content_flags_.store(ContentFlags::DEFERRED, std::memory_order_relaxed);
+ return Status::OK();
+}
+
+Status WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src,
+ const bool wal_only) {
+ size_t src_len;
+ int src_count;
+ uint32_t src_flags;
+
+ const SavePoint& batch_end = src->GetWalTerminationPoint();
+
+ if (wal_only && !batch_end.is_cleared()) {
+ src_len = batch_end.size - WriteBatchInternal::kHeader;
+ src_count = batch_end.count;
+ src_flags = batch_end.content_flags;
+ } else {
+ src_len = src->rep_.size() - WriteBatchInternal::kHeader;
+ src_count = Count(src);
+ src_flags = src->content_flags_.load(std::memory_order_relaxed);
+ }
+
+ SetCount(dst, Count(dst) + src_count);
+ assert(src->rep_.size() >= WriteBatchInternal::kHeader);
+ dst->rep_.append(src->rep_.data() + WriteBatchInternal::kHeader, src_len);
+ dst->content_flags_.store(
+ dst->content_flags_.load(std::memory_order_relaxed) | src_flags,
+ std::memory_order_relaxed);
+ return Status::OK();
+}
+
+size_t WriteBatchInternal::AppendedByteSize(size_t leftByteSize,
+ size_t rightByteSize) {
+ if (leftByteSize == 0 || rightByteSize == 0) {
+ return leftByteSize + rightByteSize;
+ } else {
+ return leftByteSize + rightByteSize - WriteBatchInternal::kHeader;
+ }
+}
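
The arithmetic above reflects that Append() copies only the record area of src, so two non-empty batches end up sharing a single 12-byte header; a small worked sketch:

    #include <cstddef>
    #include <iostream>

    static const size_t kHeader = 12;  // 8-byte sequence + 4-byte count

    size_t AppendedByteSizeSketch(size_t left, size_t right) {
      if (left == 0 || right == 0) {
        return left + right;            // nothing appended, or appending into empty
      }
      return left + right - kHeader;    // the second header is dropped
    }

    int main() {
      std::cout << AppendedByteSizeSketch(40, 30) << "\n";  // 58
      std::cout << AppendedByteSizeSketch(0, 30) << "\n";   // 30
      return 0;
    }
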
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_batch_base.cc b/src/rocksdb/db/write_batch_base.cc
new file mode 100644
index 000000000..e4c0e74bd
--- /dev/null
+++ b/src/rocksdb/db/write_batch_base.cc
@@ -0,0 +1,94 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/write_batch_base.h"
+
+#include <string>
+
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Simple implementations of the SliceParts variants of Put(), Delete(),
+// SingleDelete(), DeleteRange() and Merge(). Child classes can override
+// these methods with more performant solutions if they choose.
+Status WriteBatchBase::Put(ColumnFamilyHandle* column_family,
+ const SliceParts& key, const SliceParts& value) {
+ std::string key_buf, value_buf;
+ Slice key_slice(key, &key_buf);
+ Slice value_slice(value, &value_buf);
+
+ return Put(column_family, key_slice, value_slice);
+}
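
A small usage sketch of the SliceParts form (not part of this file; the key and value contents are made up): the fragments are gathered into contiguous buffers and forwarded to the single-Slice Put().

    #include "rocksdb/slice.h"
    #include "rocksdb/status.h"
    #include "rocksdb/write_batch.h"

    using namespace ROCKSDB_NAMESPACE;

    Status GatheredPut(WriteBatch* batch) {
      Slice key_parts[2] = {Slice("user:"), Slice("42")};
      Slice value_parts[3] = {Slice("name="), Slice("alice"), Slice(";")};
      // Equivalent to batch->Put("user:42", "name=alice;") without building
      // the concatenated strings at the call site.
      return batch->Put(SliceParts(key_parts, 2), SliceParts(value_parts, 3));
    }
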
+
+Status WriteBatchBase::Put(const SliceParts& key, const SliceParts& value) {
+ std::string key_buf, value_buf;
+ Slice key_slice(key, &key_buf);
+ Slice value_slice(value, &value_buf);
+
+ return Put(key_slice, value_slice);
+}
+
+Status WriteBatchBase::Delete(ColumnFamilyHandle* column_family,
+ const SliceParts& key) {
+ std::string key_buf;
+ Slice key_slice(key, &key_buf);
+ return Delete(column_family, key_slice);
+}
+
+Status WriteBatchBase::Delete(const SliceParts& key) {
+ std::string key_buf;
+ Slice key_slice(key, &key_buf);
+ return Delete(key_slice);
+}
+
+Status WriteBatchBase::SingleDelete(ColumnFamilyHandle* column_family,
+ const SliceParts& key) {
+ std::string key_buf;
+ Slice key_slice(key, &key_buf);
+ return SingleDelete(column_family, key_slice);
+}
+
+Status WriteBatchBase::SingleDelete(const SliceParts& key) {
+ std::string key_buf;
+ Slice key_slice(key, &key_buf);
+ return SingleDelete(key_slice);
+}
+
+Status WriteBatchBase::DeleteRange(ColumnFamilyHandle* column_family,
+ const SliceParts& begin_key,
+ const SliceParts& end_key) {
+ std::string begin_key_buf, end_key_buf;
+ Slice begin_key_slice(begin_key, &begin_key_buf);
+ Slice end_key_slice(end_key, &end_key_buf);
+ return DeleteRange(column_family, begin_key_slice, end_key_slice);
+}
+
+Status WriteBatchBase::DeleteRange(const SliceParts& begin_key,
+ const SliceParts& end_key) {
+ std::string begin_key_buf, end_key_buf;
+ Slice begin_key_slice(begin_key, &begin_key_buf);
+ Slice end_key_slice(end_key, &end_key_buf);
+ return DeleteRange(begin_key_slice, end_key_slice);
+}
+
+Status WriteBatchBase::Merge(ColumnFamilyHandle* column_family,
+ const SliceParts& key, const SliceParts& value) {
+ std::string key_buf, value_buf;
+ Slice key_slice(key, &key_buf);
+ Slice value_slice(value, &value_buf);
+
+ return Merge(column_family, key_slice, value_slice);
+}
+
+Status WriteBatchBase::Merge(const SliceParts& key, const SliceParts& value) {
+ std::string key_buf, value_buf;
+ Slice key_slice(key, &key_buf);
+ Slice value_slice(value, &value_buf);
+
+ return Merge(key_slice, value_slice);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_batch_internal.h b/src/rocksdb/db/write_batch_internal.h
new file mode 100644
index 000000000..30c489965
--- /dev/null
+++ b/src/rocksdb/db/write_batch_internal.h
@@ -0,0 +1,250 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <vector>
+#include "db/flush_scheduler.h"
+#include "db/trim_history_scheduler.h"
+#include "db/write_thread.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+#include "rocksdb/write_batch.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MemTable;
+class FlushScheduler;
+class ColumnFamilyData;
+
+class ColumnFamilyMemTables {
+ public:
+ virtual ~ColumnFamilyMemTables() {}
+ virtual bool Seek(uint32_t column_family_id) = 0;
+ // Returns the log number of the current column family; during recovery,
+ // updates from logs older than this number have already been applied and
+ // should be ignored.
+ virtual uint64_t GetLogNumber() const = 0;
+ virtual MemTable* GetMemTable() const = 0;
+ virtual ColumnFamilyHandle* GetColumnFamilyHandle() = 0;
+ virtual ColumnFamilyData* current() { return nullptr; }
+};
+
+class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables {
+ public:
+ explicit ColumnFamilyMemTablesDefault(MemTable* mem)
+ : ok_(false), mem_(mem) {}
+
+ bool Seek(uint32_t column_family_id) override {
+ ok_ = (column_family_id == 0);
+ return ok_;
+ }
+
+ uint64_t GetLogNumber() const override { return 0; }
+
+ MemTable* GetMemTable() const override {
+ assert(ok_);
+ return mem_;
+ }
+
+ ColumnFamilyHandle* GetColumnFamilyHandle() override { return nullptr; }
+
+ private:
+ bool ok_;
+ MemTable* mem_;
+};
+
+// WriteBatchInternal provides static methods for manipulating a
+// WriteBatch that we don't want in the public WriteBatch interface.
+class WriteBatchInternal {
+ public:
+
+ // WriteBatch header has an 8-byte sequence number followed by a 4-byte count.
+ static const size_t kHeader = 12;
+
+ // WriteBatch methods with column_family_id instead of ColumnFamilyHandle*
+ static Status Put(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key, const Slice& value);
+
+ static Status Put(WriteBatch* batch, uint32_t column_family_id,
+ const SliceParts& key, const SliceParts& value);
+
+ static Status Delete(WriteBatch* batch, uint32_t column_family_id,
+ const SliceParts& key);
+
+ static Status Delete(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key);
+
+ static Status SingleDelete(WriteBatch* batch, uint32_t column_family_id,
+ const SliceParts& key);
+
+ static Status SingleDelete(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key);
+
+ static Status DeleteRange(WriteBatch* b, uint32_t column_family_id,
+ const Slice& begin_key, const Slice& end_key);
+
+ static Status DeleteRange(WriteBatch* b, uint32_t column_family_id,
+ const SliceParts& begin_key,
+ const SliceParts& end_key);
+
+ static Status Merge(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key, const Slice& value);
+
+ static Status Merge(WriteBatch* batch, uint32_t column_family_id,
+ const SliceParts& key, const SliceParts& value);
+
+ static Status PutBlobIndex(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key, const Slice& value);
+
+ static Status MarkEndPrepare(WriteBatch* batch, const Slice& xid,
+ const bool write_after_commit = true,
+ const bool unprepared_batch = false);
+
+ static Status MarkRollback(WriteBatch* batch, const Slice& xid);
+
+ static Status MarkCommit(WriteBatch* batch, const Slice& xid);
+
+ static Status InsertNoop(WriteBatch* batch);
+
+ // Return the number of entries in the batch.
+ static uint32_t Count(const WriteBatch* batch);
+
+ // Set the count for the number of entries in the batch.
+ static void SetCount(WriteBatch* batch, uint32_t n);
+
+ // Return the sequence number for the start of this batch.
+ static SequenceNumber Sequence(const WriteBatch* batch);
+
+ // Store the specified number as the sequence number for the start of
+ // this batch.
+ static void SetSequence(WriteBatch* batch, SequenceNumber seq);
+
+ // Returns the offset of the first entry in the batch.
+ // This offset is only valid if the batch is not empty.
+ static size_t GetFirstOffset(WriteBatch* batch);
+
+ static Slice Contents(const WriteBatch* batch) {
+ return Slice(batch->rep_);
+ }
+
+ static size_t ByteSize(const WriteBatch* batch) {
+ return batch->rep_.size();
+ }
+
+ static Status SetContents(WriteBatch* batch, const Slice& contents);
+
+ static Status CheckSlicePartsLength(const SliceParts& key,
+ const SliceParts& value);
+
+ // Inserts each writer's batch in write_group into the memtable(s).
+ //
+ // If ignore_missing_column_families == true, a WriteBatch
+ // referencing a non-existing column family will be ignored.
+ // If ignore_missing_column_families == false, processing of the
+ // batches will be stopped if a reference to a non-existing
+ // column family is found and InvalidArgument() will be returned.
+ // The writes in the batches may be only partially applied at that point.
+ //
+ // If log_number is non-zero, the memtable will be updated only if
+ // memtables->GetLogNumber() >= log_number.
+ //
+ // If flush_scheduler is non-null, it will be invoked if the memtable
+ // should be flushed.
+ //
+ // Under concurrent use, the caller is responsible for making sure that
+ // the memtables object itself is thread-local.
+ static Status InsertInto(
+ WriteThread::WriteGroup& write_group, SequenceNumber sequence,
+ ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families = false, uint64_t log_number = 0,
+ DB* db = nullptr, bool concurrent_memtable_writes = false,
+ bool seq_per_batch = false, bool batch_per_txn = true);
+
+ // Convenience form of InsertInto when you have only one batch
+ // next_seq returns the seq after last sequence number used in MemTable insert
+ static Status InsertInto(
+ const WriteBatch* batch, ColumnFamilyMemTables* memtables,
+ FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families = false, uint64_t log_number = 0,
+ DB* db = nullptr, bool concurrent_memtable_writes = false,
+ SequenceNumber* next_seq = nullptr, bool* has_valid_writes = nullptr,
+ bool seq_per_batch = false, bool batch_per_txn = true);
+
+ static Status InsertInto(WriteThread::Writer* writer, SequenceNumber sequence,
+ ColumnFamilyMemTables* memtables,
+ FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families = false,
+ uint64_t log_number = 0, DB* db = nullptr,
+ bool concurrent_memtable_writes = false,
+ bool seq_per_batch = false, size_t batch_cnt = 0,
+ bool batch_per_txn = true,
+ bool hint_per_batch = false);
+
+ static Status Append(WriteBatch* dst, const WriteBatch* src,
+ const bool WAL_only = false);
+
+ // Returns the byte size of appending a WriteBatch with ByteSize
+ // leftByteSize and a WriteBatch with ByteSize rightByteSize
+ static size_t AppendedByteSize(size_t leftByteSize, size_t rightByteSize);
+
+ // Iterate over [begin, end) range of a write batch
+ static Status Iterate(const WriteBatch* wb, WriteBatch::Handler* handler,
+ size_t begin, size_t end);
+
+ // This write batch includes the latest state that should be persisted. Such
+ // state is meant to be used only during recovery.
+ static void SetAsLastestPersistentState(WriteBatch* b);
+ static bool IsLatestPersistentState(const WriteBatch* b);
+};
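
A short sketch tying the accessors above to the 12-byte header (a test-style illustration, not part of this header): the sequence and count set through WriteBatchInternal are exactly what Sequence() and Count() read back out of the first kHeader bytes of the serialized representation.

    #include <cassert>
    #include "db/write_batch_internal.h"
    #include "rocksdb/write_batch.h"

    using namespace ROCKSDB_NAMESPACE;

    void HeaderRoundTrip() {
      WriteBatch batch;
      batch.Put("k1", "v1");
      batch.Put("k2", "v2");
      WriteBatchInternal::SetSequence(&batch, 4242);

      assert(WriteBatchInternal::Sequence(&batch) == 4242);
      assert(WriteBatchInternal::Count(&batch) == 2);
      // The serialized size always includes the kHeader-byte prefix.
      assert(WriteBatchInternal::ByteSize(&batch) >= WriteBatchInternal::kHeader);
    }
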
+
+// LocalSavePoint is similar to a scope guard
+class LocalSavePoint {
+ public:
+ explicit LocalSavePoint(WriteBatch* batch)
+ : batch_(batch),
+ savepoint_(batch->GetDataSize(), batch->Count(),
+ batch->content_flags_.load(std::memory_order_relaxed))
+#ifndef NDEBUG
+ ,
+ committed_(false)
+#endif
+ {
+ }
+
+#ifndef NDEBUG
+ ~LocalSavePoint() { assert(committed_); }
+#endif
+ Status commit() {
+#ifndef NDEBUG
+ committed_ = true;
+#endif
+ if (batch_->max_bytes_ && batch_->rep_.size() > batch_->max_bytes_) {
+ batch_->rep_.resize(savepoint_.size);
+ WriteBatchInternal::SetCount(batch_, savepoint_.count);
+ batch_->content_flags_.store(savepoint_.content_flags,
+ std::memory_order_relaxed);
+ return Status::MemoryLimit();
+ }
+ return Status::OK();
+ }
+
+ private:
+ WriteBatch* batch_;
+ SavePoint savepoint_;
+#ifndef NDEBUG
+ bool committed_;
+#endif
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_batch_test.cc b/src/rocksdb/db/write_batch_test.cc
new file mode 100644
index 000000000..84f9a45ec
--- /dev/null
+++ b/src/rocksdb/db/write_batch_test.cc
@@ -0,0 +1,888 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/db.h"
+
+#include <memory>
+#include "db/column_family.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/scoped_arena_iterator.h"
+#include "test_util/testharness.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static std::string PrintContents(WriteBatch* b) {
+ InternalKeyComparator cmp(BytewiseComparator());
+ auto factory = std::make_shared<SkipListFactory>();
+ Options options;
+ options.memtable_factory = factory;
+ ImmutableCFOptions ioptions(options);
+ WriteBufferManager wb(options.db_write_buffer_size);
+ MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ mem->Ref();
+ std::string state;
+ ColumnFamilyMemTablesDefault cf_mems_default(mem);
+ Status s =
+ WriteBatchInternal::InsertInto(b, &cf_mems_default, nullptr, nullptr);
+ uint32_t count = 0;
+ int put_count = 0;
+ int delete_count = 0;
+ int single_delete_count = 0;
+ int delete_range_count = 0;
+ int merge_count = 0;
+ for (int i = 0; i < 2; ++i) {
+ Arena arena;
+ ScopedArenaIterator arena_iter_guard;
+ std::unique_ptr<InternalIterator> iter_guard;
+ InternalIterator* iter;
+ if (i == 0) {
+ iter = mem->NewIterator(ReadOptions(), &arena);
+ arena_iter_guard.set(iter);
+ } else {
+ iter = mem->NewRangeTombstoneIterator(ReadOptions(),
+ kMaxSequenceNumber /* read_seq */);
+ iter_guard.reset(iter);
+ }
+ if (iter == nullptr) {
+ continue;
+ }
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ParsedInternalKey ikey;
+ ikey.clear();
+ EXPECT_TRUE(ParseInternalKey(iter->key(), &ikey));
+ switch (ikey.type) {
+ case kTypeValue:
+ state.append("Put(");
+ state.append(ikey.user_key.ToString());
+ state.append(", ");
+ state.append(iter->value().ToString());
+ state.append(")");
+ count++;
+ put_count++;
+ break;
+ case kTypeDeletion:
+ state.append("Delete(");
+ state.append(ikey.user_key.ToString());
+ state.append(")");
+ count++;
+ delete_count++;
+ break;
+ case kTypeSingleDeletion:
+ state.append("SingleDelete(");
+ state.append(ikey.user_key.ToString());
+ state.append(")");
+ count++;
+ single_delete_count++;
+ break;
+ case kTypeRangeDeletion:
+ state.append("DeleteRange(");
+ state.append(ikey.user_key.ToString());
+ state.append(", ");
+ state.append(iter->value().ToString());
+ state.append(")");
+ count++;
+ delete_range_count++;
+ break;
+ case kTypeMerge:
+ state.append("Merge(");
+ state.append(ikey.user_key.ToString());
+ state.append(", ");
+ state.append(iter->value().ToString());
+ state.append(")");
+ count++;
+ merge_count++;
+ break;
+ default:
+ assert(false);
+ break;
+ }
+ state.append("@");
+ state.append(NumberToString(ikey.sequence));
+ }
+ }
+ EXPECT_EQ(b->HasPut(), put_count > 0);
+ EXPECT_EQ(b->HasDelete(), delete_count > 0);
+ EXPECT_EQ(b->HasSingleDelete(), single_delete_count > 0);
+ EXPECT_EQ(b->HasDeleteRange(), delete_range_count > 0);
+ EXPECT_EQ(b->HasMerge(), merge_count > 0);
+ if (!s.ok()) {
+ state.append(s.ToString());
+ } else if (count != WriteBatchInternal::Count(b)) {
+ state.append("CountMismatch()");
+ }
+ delete mem->Unref();
+ return state;
+}
+
+class WriteBatchTest : public testing::Test {};
+
+TEST_F(WriteBatchTest, Empty) {
+ WriteBatch batch;
+ ASSERT_EQ("", PrintContents(&batch));
+ ASSERT_EQ(0u, WriteBatchInternal::Count(&batch));
+ ASSERT_EQ(0u, batch.Count());
+}
+
+TEST_F(WriteBatchTest, Multiple) {
+ WriteBatch batch;
+ batch.Put(Slice("foo"), Slice("bar"));
+ batch.Delete(Slice("box"));
+ batch.DeleteRange(Slice("bar"), Slice("foo"));
+ batch.Put(Slice("baz"), Slice("boo"));
+ WriteBatchInternal::SetSequence(&batch, 100);
+ ASSERT_EQ(100U, WriteBatchInternal::Sequence(&batch));
+ ASSERT_EQ(4u, WriteBatchInternal::Count(&batch));
+ ASSERT_EQ(
+ "Put(baz, boo)@103"
+ "Delete(box)@101"
+ "Put(foo, bar)@100"
+ "DeleteRange(bar, foo)@102",
+ PrintContents(&batch));
+ ASSERT_EQ(4u, batch.Count());
+}
+
+TEST_F(WriteBatchTest, Corruption) {
+ WriteBatch batch;
+ batch.Put(Slice("foo"), Slice("bar"));
+ batch.Delete(Slice("box"));
+ WriteBatchInternal::SetSequence(&batch, 200);
+ Slice contents = WriteBatchInternal::Contents(&batch);
+ WriteBatchInternal::SetContents(&batch,
+ Slice(contents.data(),contents.size()-1));
+ ASSERT_EQ("Put(foo, bar)@200"
+ "Corruption: bad WriteBatch Delete",
+ PrintContents(&batch));
+}
+
+TEST_F(WriteBatchTest, Append) {
+ WriteBatch b1, b2;
+ WriteBatchInternal::SetSequence(&b1, 200);
+ WriteBatchInternal::SetSequence(&b2, 300);
+ WriteBatchInternal::Append(&b1, &b2);
+ ASSERT_EQ("",
+ PrintContents(&b1));
+ ASSERT_EQ(0u, b1.Count());
+ b2.Put("a", "va");
+ WriteBatchInternal::Append(&b1, &b2);
+ ASSERT_EQ("Put(a, va)@200",
+ PrintContents(&b1));
+ ASSERT_EQ(1u, b1.Count());
+ b2.Clear();
+ b2.Put("b", "vb");
+ WriteBatchInternal::Append(&b1, &b2);
+ ASSERT_EQ("Put(a, va)@200"
+ "Put(b, vb)@201",
+ PrintContents(&b1));
+ ASSERT_EQ(2u, b1.Count());
+ b2.Delete("foo");
+ WriteBatchInternal::Append(&b1, &b2);
+ ASSERT_EQ("Put(a, va)@200"
+ "Put(b, vb)@202"
+ "Put(b, vb)@201"
+ "Delete(foo)@203",
+ PrintContents(&b1));
+ ASSERT_EQ(4u, b1.Count());
+ b2.Clear();
+ b2.Put("c", "cc");
+ b2.Put("d", "dd");
+ b2.MarkWalTerminationPoint();
+ b2.Put("e", "ee");
+ WriteBatchInternal::Append(&b1, &b2, /*wal only*/ true);
+ ASSERT_EQ(
+ "Put(a, va)@200"
+ "Put(b, vb)@202"
+ "Put(b, vb)@201"
+ "Put(c, cc)@204"
+ "Put(d, dd)@205"
+ "Delete(foo)@203",
+ PrintContents(&b1));
+ ASSERT_EQ(6u, b1.Count());
+ ASSERT_EQ(
+ "Put(c, cc)@0"
+ "Put(d, dd)@1"
+ "Put(e, ee)@2",
+ PrintContents(&b2));
+ ASSERT_EQ(3u, b2.Count());
+}
+
+TEST_F(WriteBatchTest, SingleDeletion) {
+ WriteBatch batch;
+ WriteBatchInternal::SetSequence(&batch, 100);
+ ASSERT_EQ("", PrintContents(&batch));
+ ASSERT_EQ(0u, batch.Count());
+ batch.Put("a", "va");
+ ASSERT_EQ("Put(a, va)@100", PrintContents(&batch));
+ ASSERT_EQ(1u, batch.Count());
+ batch.SingleDelete("a");
+ ASSERT_EQ(
+ "SingleDelete(a)@101"
+ "Put(a, va)@100",
+ PrintContents(&batch));
+ ASSERT_EQ(2u, batch.Count());
+}
+
+namespace {
+ struct TestHandler : public WriteBatch::Handler {
+ std::string seen;
+ Status PutCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ if (column_family_id == 0) {
+ seen += "Put(" + key.ToString() + ", " + value.ToString() + ")";
+ } else {
+ seen += "PutCF(" + ToString(column_family_id) + ", " +
+ key.ToString() + ", " + value.ToString() + ")";
+ }
+ return Status::OK();
+ }
+ Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
+ if (column_family_id == 0) {
+ seen += "Delete(" + key.ToString() + ")";
+ } else {
+ seen += "DeleteCF(" + ToString(column_family_id) + ", " +
+ key.ToString() + ")";
+ }
+ return Status::OK();
+ }
+ Status SingleDeleteCF(uint32_t column_family_id,
+ const Slice& key) override {
+ if (column_family_id == 0) {
+ seen += "SingleDelete(" + key.ToString() + ")";
+ } else {
+ seen += "SingleDeleteCF(" + ToString(column_family_id) + ", " +
+ key.ToString() + ")";
+ }
+ return Status::OK();
+ }
+ Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key,
+ const Slice& end_key) override {
+ if (column_family_id == 0) {
+ seen += "DeleteRange(" + begin_key.ToString() + ", " +
+ end_key.ToString() + ")";
+ } else {
+ seen += "DeleteRangeCF(" + ToString(column_family_id) + ", " +
+ begin_key.ToString() + ", " + end_key.ToString() + ")";
+ }
+ return Status::OK();
+ }
+ Status MergeCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ if (column_family_id == 0) {
+ seen += "Merge(" + key.ToString() + ", " + value.ToString() + ")";
+ } else {
+ seen += "MergeCF(" + ToString(column_family_id) + ", " +
+ key.ToString() + ", " + value.ToString() + ")";
+ }
+ return Status::OK();
+ }
+ void LogData(const Slice& blob) override {
+ seen += "LogData(" + blob.ToString() + ")";
+ }
+ Status MarkBeginPrepare(bool unprepare) override {
+ seen +=
+ "MarkBeginPrepare(" + std::string(unprepare ? "true" : "false") + ")";
+ return Status::OK();
+ }
+ Status MarkEndPrepare(const Slice& xid) override {
+ seen += "MarkEndPrepare(" + xid.ToString() + ")";
+ return Status::OK();
+ }
+ Status MarkNoop(bool empty_batch) override {
+ seen += "MarkNoop(" + std::string(empty_batch ? "true" : "false") + ")";
+ return Status::OK();
+ }
+ Status MarkCommit(const Slice& xid) override {
+ seen += "MarkCommit(" + xid.ToString() + ")";
+ return Status::OK();
+ }
+ Status MarkRollback(const Slice& xid) override {
+ seen += "MarkRollback(" + xid.ToString() + ")";
+ return Status::OK();
+ }
+ };
+}
+
+TEST_F(WriteBatchTest, PutNotImplemented) {
+ WriteBatch batch;
+ batch.Put(Slice("k1"), Slice("v1"));
+ ASSERT_EQ(1u, batch.Count());
+ ASSERT_EQ("Put(k1, v1)@0", PrintContents(&batch));
+
+ WriteBatch::Handler handler;
+ ASSERT_OK(batch.Iterate(&handler));
+}
+
+TEST_F(WriteBatchTest, DeleteNotImplemented) {
+ WriteBatch batch;
+ batch.Delete(Slice("k2"));
+ ASSERT_EQ(1u, batch.Count());
+ ASSERT_EQ("Delete(k2)@0", PrintContents(&batch));
+
+ WriteBatch::Handler handler;
+ ASSERT_OK(batch.Iterate(&handler));
+}
+
+TEST_F(WriteBatchTest, SingleDeleteNotImplemented) {
+ WriteBatch batch;
+ batch.SingleDelete(Slice("k2"));
+ ASSERT_EQ(1u, batch.Count());
+ ASSERT_EQ("SingleDelete(k2)@0", PrintContents(&batch));
+
+ WriteBatch::Handler handler;
+ ASSERT_OK(batch.Iterate(&handler));
+}
+
+TEST_F(WriteBatchTest, MergeNotImplemented) {
+ WriteBatch batch;
+ batch.Merge(Slice("foo"), Slice("bar"));
+ ASSERT_EQ(1u, batch.Count());
+ ASSERT_EQ("Merge(foo, bar)@0", PrintContents(&batch));
+
+ WriteBatch::Handler handler;
+ ASSERT_OK(batch.Iterate(&handler));
+}
+
+TEST_F(WriteBatchTest, Blob) {
+ WriteBatch batch;
+ batch.Put(Slice("k1"), Slice("v1"));
+ batch.Put(Slice("k2"), Slice("v2"));
+ batch.Put(Slice("k3"), Slice("v3"));
+ batch.PutLogData(Slice("blob1"));
+ batch.Delete(Slice("k2"));
+ batch.SingleDelete(Slice("k3"));
+ batch.PutLogData(Slice("blob2"));
+ batch.Merge(Slice("foo"), Slice("bar"));
+ ASSERT_EQ(6u, batch.Count());
+ ASSERT_EQ(
+ "Merge(foo, bar)@5"
+ "Put(k1, v1)@0"
+ "Delete(k2)@3"
+ "Put(k2, v2)@1"
+ "SingleDelete(k3)@4"
+ "Put(k3, v3)@2",
+ PrintContents(&batch));
+
+ TestHandler handler;
+ batch.Iterate(&handler);
+ ASSERT_EQ(
+ "Put(k1, v1)"
+ "Put(k2, v2)"
+ "Put(k3, v3)"
+ "LogData(blob1)"
+ "Delete(k2)"
+ "SingleDelete(k3)"
+ "LogData(blob2)"
+ "Merge(foo, bar)",
+ handler.seen);
+}
+
+TEST_F(WriteBatchTest, PrepareCommit) {
+ WriteBatch batch;
+ WriteBatchInternal::InsertNoop(&batch);
+ batch.Put(Slice("k1"), Slice("v1"));
+ batch.Put(Slice("k2"), Slice("v2"));
+ batch.SetSavePoint();
+ WriteBatchInternal::MarkEndPrepare(&batch, Slice("xid1"));
+ Status s = batch.RollbackToSavePoint();
+ ASSERT_EQ(s, Status::NotFound());
+ WriteBatchInternal::MarkCommit(&batch, Slice("xid1"));
+ WriteBatchInternal::MarkRollback(&batch, Slice("xid1"));
+ ASSERT_EQ(2u, batch.Count());
+
+ TestHandler handler;
+ batch.Iterate(&handler);
+ ASSERT_EQ(
+ "MarkBeginPrepare(false)"
+ "Put(k1, v1)"
+ "Put(k2, v2)"
+ "MarkEndPrepare(xid1)"
+ "MarkCommit(xid1)"
+ "MarkRollback(xid1)",
+ handler.seen);
+}
+
+// Running this test requires more than 30GB of memory, with a single memory
+// allocation of more than 30GB. Not all platforms can run it, and it also
+// takes a long time, so it is disabled.
+TEST_F(WriteBatchTest, DISABLED_ManyUpdates) {
+ // Insert roughly three billion tiny (4-byte) key/value pairs to push the
+ // total batch size past 30GB.
+ static const size_t kKeyValueSize = 4u;
+ static const uint32_t kNumUpdates = uint32_t(3 << 30);
+ std::string raw(kKeyValueSize, 'A');
+ WriteBatch batch(kNumUpdates * (4 + kKeyValueSize * 2) + 1024u);
+ char c = 'A';
+ for (uint32_t i = 0; i < kNumUpdates; i++) {
+ if (c > 'Z') {
+ c = 'A';
+ }
+ raw[0] = c;
+ raw[raw.length() - 1] = c;
+ c++;
+ batch.Put(raw, raw);
+ }
+
+ ASSERT_EQ(kNumUpdates, batch.Count());
+
+ struct NoopHandler : public WriteBatch::Handler {
+ uint32_t num_seen = 0;
+ char expected_char = 'A';
+ Status PutCF(uint32_t /*column_family_id*/, const Slice& key,
+ const Slice& value) override {
+ EXPECT_EQ(kKeyValueSize, key.size());
+ EXPECT_EQ(kKeyValueSize, value.size());
+ EXPECT_EQ(expected_char, key[0]);
+ EXPECT_EQ(expected_char, value[0]);
+ EXPECT_EQ(expected_char, key[kKeyValueSize - 1]);
+ EXPECT_EQ(expected_char, value[kKeyValueSize - 1]);
+ expected_char++;
+ if (expected_char > 'Z') {
+ expected_char = 'A';
+ }
+ ++num_seen;
+ return Status::OK();
+ }
+ Status DeleteCF(uint32_t /*column_family_id*/,
+ const Slice& /*key*/) override {
+ ADD_FAILURE();
+ return Status::OK();
+ }
+ Status SingleDeleteCF(uint32_t /*column_family_id*/,
+ const Slice& /*key*/) override {
+ ADD_FAILURE();
+ return Status::OK();
+ }
+ Status MergeCF(uint32_t /*column_family_id*/, const Slice& /*key*/,
+ const Slice& /*value*/) override {
+ ADD_FAILURE();
+ return Status::OK();
+ }
+ void LogData(const Slice& /*blob*/) override { ADD_FAILURE(); }
+ bool Continue() override { return num_seen < kNumUpdates; }
+ } handler;
+
+ batch.Iterate(&handler);
+ ASSERT_EQ(kNumUpdates, handler.num_seen);
+}
+
+// This test requires more than 18GB of memory to run, with a single memory
+// allocation of more than 12GB. Not all platforms can run it, so it is
+// disabled.
+TEST_F(WriteBatchTest, DISABLED_LargeKeyValue) {
+ // Insert key and value of 3GB and push total batch size to 12GB.
+ static const size_t kKeyValueSize = 3221225472u;
+ std::string raw(kKeyValueSize, 'A');
+ WriteBatch batch(size_t(12884901888ull + 1024u));
+ for (char i = 0; i < 2; i++) {
+ raw[0] = 'A' + i;
+ raw[raw.length() - 1] = 'A' - i;
+ batch.Put(raw, raw);
+ }
+
+ ASSERT_EQ(2u, batch.Count());
+
+ struct NoopHandler : public WriteBatch::Handler {
+ int num_seen = 0;
+ Status PutCF(uint32_t /*column_family_id*/, const Slice& key,
+ const Slice& value) override {
+ EXPECT_EQ(kKeyValueSize, key.size());
+ EXPECT_EQ(kKeyValueSize, value.size());
+ EXPECT_EQ('A' + num_seen, key[0]);
+ EXPECT_EQ('A' + num_seen, value[0]);
+ EXPECT_EQ('A' - num_seen, key[kKeyValueSize - 1]);
+ EXPECT_EQ('A' - num_seen, value[kKeyValueSize - 1]);
+ ++num_seen;
+ return Status::OK();
+ }
+ Status DeleteCF(uint32_t /*column_family_id*/,
+ const Slice& /*key*/) override {
+ ADD_FAILURE();
+ return Status::OK();
+ }
+ Status SingleDeleteCF(uint32_t /*column_family_id*/,
+ const Slice& /*key*/) override {
+ ADD_FAILURE();
+ return Status::OK();
+ }
+ Status MergeCF(uint32_t /*column_family_id*/, const Slice& /*key*/,
+ const Slice& /*value*/) override {
+ ADD_FAILURE();
+ return Status::OK();
+ }
+ void LogData(const Slice& /*blob*/) override { ADD_FAILURE(); }
+ bool Continue() override { return num_seen < 2; }
+ } handler;
+
+ batch.Iterate(&handler);
+ ASSERT_EQ(2, handler.num_seen);
+}
+
+TEST_F(WriteBatchTest, Continue) {
+ WriteBatch batch;
+
+ struct Handler : public TestHandler {
+ int num_seen = 0;
+ Status PutCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ ++num_seen;
+ return TestHandler::PutCF(column_family_id, key, value);
+ }
+ Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
+ ++num_seen;
+ return TestHandler::DeleteCF(column_family_id, key);
+ }
+ Status SingleDeleteCF(uint32_t column_family_id,
+ const Slice& key) override {
+ ++num_seen;
+ return TestHandler::SingleDeleteCF(column_family_id, key);
+ }
+ Status MergeCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ ++num_seen;
+ return TestHandler::MergeCF(column_family_id, key, value);
+ }
+ void LogData(const Slice& blob) override {
+ ++num_seen;
+ TestHandler::LogData(blob);
+ }
+ bool Continue() override { return num_seen < 5; }
+ } handler;
+
+ batch.Put(Slice("k1"), Slice("v1"));
+ batch.Put(Slice("k2"), Slice("v2"));
+ batch.PutLogData(Slice("blob1"));
+ batch.Delete(Slice("k1"));
+ batch.SingleDelete(Slice("k2"));
+ batch.PutLogData(Slice("blob2"));
+ batch.Merge(Slice("foo"), Slice("bar"));
+ batch.Iterate(&handler);
+ ASSERT_EQ(
+ "Put(k1, v1)"
+ "Put(k2, v2)"
+ "LogData(blob1)"
+ "Delete(k1)"
+ "SingleDelete(k2)",
+ handler.seen);
+}
+
+TEST_F(WriteBatchTest, PutGatherSlices) {
+ WriteBatch batch;
+ batch.Put(Slice("foo"), Slice("bar"));
+
+ {
+ // Try a write where the key is one slice but the value is two
+ Slice key_slice("baz");
+ Slice value_slices[2] = { Slice("header"), Slice("payload") };
+ batch.Put(SliceParts(&key_slice, 1),
+ SliceParts(value_slices, 2));
+ }
+
+ {
+ // One where the key is composite but the value is a single slice
+ Slice key_slices[3] = { Slice("key"), Slice("part2"), Slice("part3") };
+ Slice value_slice("value");
+ batch.Put(SliceParts(key_slices, 3),
+ SliceParts(&value_slice, 1));
+ }
+
+ WriteBatchInternal::SetSequence(&batch, 100);
+ ASSERT_EQ("Put(baz, headerpayload)@101"
+ "Put(foo, bar)@100"
+ "Put(keypart2part3, value)@102",
+ PrintContents(&batch));
+ ASSERT_EQ(3u, batch.Count());
+}
+
+namespace {
+class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl {
+ public:
+ explicit ColumnFamilyHandleImplDummy(int id)
+ : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), id_(id) {}
+ uint32_t GetID() const override { return id_; }
+ const Comparator* GetComparator() const override {
+ return BytewiseComparator();
+ }
+
+ private:
+ uint32_t id_;
+};
+} // namespace anonymous
+
+TEST_F(WriteBatchTest, ColumnFamiliesBatchTest) {
+ WriteBatch batch;
+ ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8);
+ batch.Put(&zero, Slice("foo"), Slice("bar"));
+ batch.Put(&two, Slice("twofoo"), Slice("bar2"));
+ batch.Put(&eight, Slice("eightfoo"), Slice("bar8"));
+ batch.Delete(&eight, Slice("eightfoo"));
+ batch.SingleDelete(&two, Slice("twofoo"));
+ batch.DeleteRange(&two, Slice("3foo"), Slice("4foo"));
+ batch.Merge(&three, Slice("threethree"), Slice("3three"));
+ batch.Put(&zero, Slice("foo"), Slice("bar"));
+ batch.Merge(Slice("omom"), Slice("nom"));
+
+ TestHandler handler;
+ batch.Iterate(&handler);
+ ASSERT_EQ(
+ "Put(foo, bar)"
+ "PutCF(2, twofoo, bar2)"
+ "PutCF(8, eightfoo, bar8)"
+ "DeleteCF(8, eightfoo)"
+ "SingleDeleteCF(2, twofoo)"
+ "DeleteRangeCF(2, 3foo, 4foo)"
+ "MergeCF(3, threethree, 3three)"
+ "Put(foo, bar)"
+ "Merge(omom, nom)",
+ handler.seen);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(WriteBatchTest, ColumnFamiliesBatchWithIndexTest) {
+ WriteBatchWithIndex batch;
+ ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8);
+ batch.Put(&zero, Slice("foo"), Slice("bar"));
+ batch.Put(&two, Slice("twofoo"), Slice("bar2"));
+ batch.Put(&eight, Slice("eightfoo"), Slice("bar8"));
+ batch.Delete(&eight, Slice("eightfoo"));
+ batch.SingleDelete(&two, Slice("twofoo"));
+ batch.Merge(&three, Slice("threethree"), Slice("3three"));
+ batch.Put(&zero, Slice("foo"), Slice("bar"));
+ batch.Merge(Slice("omom"), Slice("nom"));
+
+ std::unique_ptr<WBWIIterator> iter;
+
+ iter.reset(batch.NewIterator(&eight));
+ iter->Seek("eightfoo");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+ ASSERT_EQ("eightfoo", iter->Entry().key.ToString());
+ ASSERT_EQ("bar8", iter->Entry().value.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kDeleteRecord, iter->Entry().type);
+ ASSERT_EQ("eightfoo", iter->Entry().key.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter.reset(batch.NewIterator(&two));
+ iter->Seek("twofoo");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+ ASSERT_EQ("twofoo", iter->Entry().key.ToString());
+ ASSERT_EQ("bar2", iter->Entry().value.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kSingleDeleteRecord, iter->Entry().type);
+ ASSERT_EQ("twofoo", iter->Entry().key.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter.reset(batch.NewIterator());
+ iter->Seek("gggg");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kMergeRecord, iter->Entry().type);
+ ASSERT_EQ("omom", iter->Entry().key.ToString());
+ ASSERT_EQ("nom", iter->Entry().value.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter.reset(batch.NewIterator(&zero));
+ iter->Seek("foo");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+ ASSERT_EQ("foo", iter->Entry().key.ToString());
+ ASSERT_EQ("bar", iter->Entry().value.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+ ASSERT_EQ("foo", iter->Entry().key.ToString());
+ ASSERT_EQ("bar", iter->Entry().value.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kMergeRecord, iter->Entry().type);
+ ASSERT_EQ("omom", iter->Entry().key.ToString());
+ ASSERT_EQ("nom", iter->Entry().value.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ TestHandler handler;
+ batch.GetWriteBatch()->Iterate(&handler);
+ ASSERT_EQ(
+ "Put(foo, bar)"
+ "PutCF(2, twofoo, bar2)"
+ "PutCF(8, eightfoo, bar8)"
+ "DeleteCF(8, eightfoo)"
+ "SingleDeleteCF(2, twofoo)"
+ "MergeCF(3, threethree, 3three)"
+ "Put(foo, bar)"
+ "Merge(omom, nom)",
+ handler.seen);
+}
+#endif // !ROCKSDB_LITE
+
+TEST_F(WriteBatchTest, SavePointTest) {
+ Status s;
+ WriteBatch batch;
+ batch.SetSavePoint();
+
+ batch.Put("A", "a");
+ batch.Put("B", "b");
+ batch.SetSavePoint();
+
+ batch.Put("C", "c");
+ batch.Delete("A");
+ batch.SetSavePoint();
+ batch.SetSavePoint();
+
+ ASSERT_OK(batch.RollbackToSavePoint());
+ ASSERT_EQ(
+ "Delete(A)@3"
+ "Put(A, a)@0"
+ "Put(B, b)@1"
+ "Put(C, c)@2",
+ PrintContents(&batch));
+
+ ASSERT_OK(batch.RollbackToSavePoint());
+ ASSERT_OK(batch.RollbackToSavePoint());
+ ASSERT_EQ(
+ "Put(A, a)@0"
+ "Put(B, b)@1",
+ PrintContents(&batch));
+
+ batch.Delete("A");
+ batch.Put("B", "bb");
+
+ ASSERT_OK(batch.RollbackToSavePoint());
+ ASSERT_EQ("", PrintContents(&batch));
+
+ s = batch.RollbackToSavePoint();
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ("", PrintContents(&batch));
+
+ batch.Put("D", "d");
+ batch.Delete("A");
+
+ batch.SetSavePoint();
+
+ batch.Put("A", "aaa");
+
+ ASSERT_OK(batch.RollbackToSavePoint());
+ ASSERT_EQ(
+ "Delete(A)@1"
+ "Put(D, d)@0",
+ PrintContents(&batch));
+
+ batch.SetSavePoint();
+
+ batch.Put("D", "d");
+ batch.Delete("A");
+
+ ASSERT_OK(batch.RollbackToSavePoint());
+ ASSERT_EQ(
+ "Delete(A)@1"
+ "Put(D, d)@0",
+ PrintContents(&batch));
+
+ s = batch.RollbackToSavePoint();
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ(
+ "Delete(A)@1"
+ "Put(D, d)@0",
+ PrintContents(&batch));
+
+ WriteBatch batch2;
+
+ s = batch2.RollbackToSavePoint();
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ("", PrintContents(&batch2));
+
+ batch2.Delete("A");
+ batch2.SetSavePoint();
+
+ s = batch2.RollbackToSavePoint();
+ ASSERT_OK(s);
+ ASSERT_EQ("Delete(A)@0", PrintContents(&batch2));
+
+ batch2.Clear();
+ ASSERT_EQ("", PrintContents(&batch2));
+
+ batch2.SetSavePoint();
+
+ batch2.Delete("B");
+ ASSERT_EQ("Delete(B)@0", PrintContents(&batch2));
+
+ batch2.SetSavePoint();
+ s = batch2.RollbackToSavePoint();
+ ASSERT_OK(s);
+ ASSERT_EQ("Delete(B)@0", PrintContents(&batch2));
+
+ s = batch2.RollbackToSavePoint();
+ ASSERT_OK(s);
+ ASSERT_EQ("", PrintContents(&batch2));
+
+ s = batch2.RollbackToSavePoint();
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ("", PrintContents(&batch2));
+
+ WriteBatch batch3;
+
+ s = batch3.PopSavePoint();
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ("", PrintContents(&batch3));
+
+ batch3.SetSavePoint();
+ batch3.Delete("A");
+
+ s = batch3.PopSavePoint();
+ ASSERT_OK(s);
+ ASSERT_EQ("Delete(A)@0", PrintContents(&batch3));
+}
+
+TEST_F(WriteBatchTest, MemoryLimitTest) {
+ Status s;
+  // The header size is 12 bytes. The two Puts take 8 bytes each, which gives
+  // a total of 12 + 8 * 2 = 28 bytes.
+ WriteBatch batch(0, 28);
+
+ ASSERT_OK(batch.Put("a", "...."));
+ ASSERT_OK(batch.Put("b", "...."));
+ s = batch.Put("c", "....");
+ ASSERT_TRUE(s.IsMemoryLimit());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/write_callback.h b/src/rocksdb/db/write_callback.h
new file mode 100644
index 000000000..106d02041
--- /dev/null
+++ b/src/rocksdb/db/write_callback.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DB;
+
+class WriteCallback {
+ public:
+ virtual ~WriteCallback() {}
+
+ // Will be called while on the write thread before the write executes. If
+ // this function returns a non-OK status, the write will be aborted and this
+ // status will be returned to the caller of DB::Write().
+ virtual Status Callback(DB* db) = 0;
+
+  // Returns true if writes with this callback can be batched with other
+  // writes.
+ virtual bool AllowWriteBatching() = 0;
+};
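+
+// Illustrative usage sketch (the class name and conflict_detected_ member are
+// hypothetical; compare the subclasses exercised in write_callback_test.cc):
+//
+//   class ConflictCheckCallback : public WriteCallback {
+//    public:
+//     Status Callback(DB* /*db*/) override {
+//       return conflict_detected_ ? Status::Busy() : Status::OK();
+//     }
+//     bool AllowWriteBatching() override { return true; }
+//     bool conflict_detected_ = false;
+//   };
+//
+// A non-OK status returned from Callback() is propagated to the caller of
+// DB::Write() (or DBImpl::WriteWithCallback()) and the batch is not applied.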
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_callback_test.cc b/src/rocksdb/db/write_callback_test.cc
new file mode 100644
index 000000000..df7d673aa
--- /dev/null
+++ b/src/rocksdb/db/write_callback_test.cc
@@ -0,0 +1,452 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <atomic>
+#include <functional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/write_callback.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/write_batch.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/random.h"
+
+using std::string;
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteCallbackTest : public testing::Test {
+ public:
+ string dbname;
+
+ WriteCallbackTest() {
+ dbname = test::PerThreadDBPath("write_callback_testdb");
+ }
+};
+
+class WriteCallbackTestWriteCallback1 : public WriteCallback {
+ public:
+ bool was_called = false;
+
+ Status Callback(DB *db) override {
+ was_called = true;
+
+ // Make sure db is a DBImpl
+ DBImpl* db_impl = dynamic_cast<DBImpl*> (db);
+ if (db_impl == nullptr) {
+ return Status::InvalidArgument("");
+ }
+
+ return Status::OK();
+ }
+
+ bool AllowWriteBatching() override { return true; }
+};
+
+class WriteCallbackTestWriteCallback2 : public WriteCallback {
+ public:
+ Status Callback(DB* /*db*/) override { return Status::Busy(); }
+ bool AllowWriteBatching() override { return true; }
+};
+
+class MockWriteCallback : public WriteCallback {
+ public:
+ bool should_fail_ = false;
+ bool allow_batching_ = false;
+ std::atomic<bool> was_called_{false};
+
+ MockWriteCallback() {}
+
+ MockWriteCallback(const MockWriteCallback& other) {
+ should_fail_ = other.should_fail_;
+ allow_batching_ = other.allow_batching_;
+ was_called_.store(other.was_called_.load());
+ }
+
+ Status Callback(DB* /*db*/) override {
+ was_called_.store(true);
+ if (should_fail_) {
+ return Status::Busy();
+ } else {
+ return Status::OK();
+ }
+ }
+
+ bool AllowWriteBatching() override { return allow_batching_; }
+};
+
+TEST_F(WriteCallbackTest, WriteWithCallbackTest) {
+ struct WriteOP {
+ WriteOP(bool should_fail = false) { callback_.should_fail_ = should_fail; }
+
+ void Put(const string& key, const string& val) {
+ kvs_.push_back(std::make_pair(key, val));
+ write_batch_.Put(key, val);
+ }
+
+ void Clear() {
+ kvs_.clear();
+ write_batch_.Clear();
+ callback_.was_called_.store(false);
+ }
+
+ MockWriteCallback callback_;
+ WriteBatch write_batch_;
+ std::vector<std::pair<string, string>> kvs_;
+ };
+
+  // In each scenario we'll launch multiple threads to write.
+  // The size of each inner vector equals the number of threads, and each
+  // boolean in it denotes whether the callback of the corresponding thread
+  // should fail (true) or succeed (false). For example, {true, false} means
+  // the first thread's callback fails while the second thread's succeeds.
+ std::vector<std::vector<WriteOP>> write_scenarios = {
+ {true},
+ {false},
+ {false, false},
+ {true, true},
+ {true, false},
+ {false, true},
+ {false, false, false},
+ {true, true, true},
+ {false, true, false},
+ {true, false, true},
+ {true, false, false, false, false},
+ {false, false, false, false, true},
+ {false, false, true, false, true},
+ };
+
+ for (auto& unordered_write : {true, false}) {
+ for (auto& seq_per_batch : {true, false}) {
+ for (auto& two_queues : {true, false}) {
+ for (auto& allow_parallel : {true, false}) {
+ for (auto& allow_batching : {true, false}) {
+ for (auto& enable_WAL : {true, false}) {
+ for (auto& enable_pipelined_write : {true, false}) {
+ for (auto& write_group : write_scenarios) {
+ Options options;
+ options.create_if_missing = true;
+ options.unordered_write = unordered_write;
+ options.allow_concurrent_memtable_write = allow_parallel;
+ options.enable_pipelined_write = enable_pipelined_write;
+ options.two_write_queues = two_queues;
+ // Skip unsupported combinations
+ if (options.enable_pipelined_write && seq_per_batch) {
+ continue;
+ }
+ if (options.enable_pipelined_write && options.two_write_queues) {
+ continue;
+ }
+ if (options.unordered_write &&
+ !options.allow_concurrent_memtable_write) {
+ continue;
+ }
+ if (options.unordered_write && options.enable_pipelined_write) {
+ continue;
+ }
+
+ ReadOptions read_options;
+ DB* db;
+ DBImpl* db_impl;
+
+ DestroyDB(dbname, options);
+
+ DBOptions db_options(options);
+ ColumnFamilyOptions cf_options(options);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+ std::vector<ColumnFamilyHandle*> handles;
+ auto open_s =
+ DBImpl::Open(db_options, dbname, column_families, &handles,
+ &db, seq_per_batch, true /* batch_per_txn */);
+ ASSERT_OK(open_s);
+ assert(handles.size() == 1);
+ delete handles[0];
+
+ db_impl = dynamic_cast<DBImpl*>(db);
+ ASSERT_TRUE(db_impl);
+
+ // Writers that have called JoinBatchGroup.
+ std::atomic<uint64_t> threads_joining(0);
+ // Writers that have linked to the queue
+ std::atomic<uint64_t> threads_linked(0);
+ // Writers that pass WriteThread::JoinBatchGroup:Wait sync-point.
+ std::atomic<uint64_t> threads_verified(0);
+
+ std::atomic<uint64_t> seq(db_impl->GetLatestSequenceNumber());
+ ASSERT_EQ(db_impl->GetLatestSequenceNumber(), 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Start", [&](void*) {
+ uint64_t cur_threads_joining = threads_joining.fetch_add(1);
+ // Wait for the last joined writer to link to the queue.
+ // In this way the writers link to the queue one by one.
+ // This allows us to confidently detect the first writer
+ // who increases threads_linked as the leader.
+ while (threads_linked.load() < cur_threads_joining) {
+ }
+ });
+
+ // Verification once writers call JoinBatchGroup.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Wait", [&](void* arg) {
+ uint64_t cur_threads_linked = threads_linked.fetch_add(1);
+ bool is_leader = false;
+ bool is_last = false;
+
+ // who am i
+ is_leader = (cur_threads_linked == 0);
+ is_last = (cur_threads_linked == write_group.size() - 1);
+
+ // check my state
+ auto* writer = reinterpret_cast<WriteThread::Writer*>(arg);
+
+ if (is_leader) {
+ ASSERT_TRUE(writer->state ==
+ WriteThread::State::STATE_GROUP_LEADER);
+ } else {
+ ASSERT_TRUE(writer->state ==
+ WriteThread::State::STATE_INIT);
+ }
+
+ // (meta test) the first WriteOP should indeed be the first
+ // and the last should be the last (all others can be out of
+ // order)
+ if (is_leader) {
+ ASSERT_TRUE(writer->callback->Callback(nullptr).ok() ==
+ !write_group.front().callback_.should_fail_);
+ } else if (is_last) {
+ ASSERT_TRUE(writer->callback->Callback(nullptr).ok() ==
+ !write_group.back().callback_.should_fail_);
+ }
+
+ threads_verified.fetch_add(1);
+ // Wait here until all verification in this sync-point
+ // callback finish for all writers.
+ while (threads_verified.load() < write_group.size()) {
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:DoneWaiting", [&](void* arg) {
+ // check my state
+ auto* writer = reinterpret_cast<WriteThread::Writer*>(arg);
+
+ if (!allow_batching) {
+ // no batching so everyone should be a leader
+ ASSERT_TRUE(writer->state ==
+ WriteThread::State::STATE_GROUP_LEADER);
+ } else if (!allow_parallel) {
+ ASSERT_TRUE(writer->state ==
+ WriteThread::State::STATE_COMPLETED ||
+ (enable_pipelined_write &&
+ writer->state ==
+ WriteThread::State::
+ STATE_MEMTABLE_WRITER_LEADER));
+ }
+ });
+
+ std::atomic<uint32_t> thread_num(0);
+ std::atomic<char> dummy_key(0);
+
+ // Each write thread create a random write batch and write to DB
+ // with a write callback.
+ std::function<void()> write_with_callback_func = [&]() {
+ uint32_t i = thread_num.fetch_add(1);
+ Random rnd(i);
+
+ // leaders gotta lead
+ while (i > 0 && threads_verified.load() < 1) {
+ }
+
+ // loser has to lose
+ while (i == write_group.size() - 1 &&
+ threads_verified.load() < write_group.size() - 1) {
+ }
+
+ auto& write_op = write_group.at(i);
+ write_op.Clear();
+ write_op.callback_.allow_batching_ = allow_batching;
+
+ // insert some keys
+ for (uint32_t j = 0; j < rnd.Next() % 50; j++) {
+ // grab unique key
+ char my_key = dummy_key.fetch_add(1);
+
+ string skey(5, my_key);
+ string sval(10, my_key);
+ write_op.Put(skey, sval);
+
+ if (!write_op.callback_.should_fail_ && !seq_per_batch) {
+ seq.fetch_add(1);
+ }
+ }
+ if (!write_op.callback_.should_fail_ && seq_per_batch) {
+ seq.fetch_add(1);
+ }
+
+ WriteOptions woptions;
+ woptions.disableWAL = !enable_WAL;
+ woptions.sync = enable_WAL;
+ Status s;
+ if (seq_per_batch) {
+ class PublishSeqCallback : public PreReleaseCallback {
+ public:
+ PublishSeqCallback(DBImpl* db_impl_in)
+ : db_impl_(db_impl_in) {}
+ Status Callback(SequenceNumber last_seq, bool /*not used*/,
+ uint64_t, size_t /*index*/,
+ size_t /*total*/) override {
+ db_impl_->SetLastPublishedSequence(last_seq);
+ return Status::OK();
+ }
+ DBImpl* db_impl_;
+ } publish_seq_callback(db_impl);
+ // seq_per_batch requires a natural batch separator or Noop
+ WriteBatchInternal::InsertNoop(&write_op.write_batch_);
+ const size_t ONE_BATCH = 1;
+ s = db_impl->WriteImpl(
+ woptions, &write_op.write_batch_, &write_op.callback_,
+ nullptr, 0, false, nullptr, ONE_BATCH,
+ two_queues ? &publish_seq_callback : nullptr);
+ } else {
+ s = db_impl->WriteWithCallback(
+ woptions, &write_op.write_batch_, &write_op.callback_);
+ }
+
+ if (write_op.callback_.should_fail_) {
+ ASSERT_TRUE(s.IsBusy());
+ } else {
+ ASSERT_OK(s);
+ }
+ };
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // do all the writes
+ std::vector<port::Thread> threads;
+ for (uint32_t i = 0; i < write_group.size(); i++) {
+ threads.emplace_back(write_with_callback_func);
+ }
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ // check for keys
+ string value;
+ for (auto& w : write_group) {
+ ASSERT_TRUE(w.callback_.was_called_.load());
+ for (auto& kvp : w.kvs_) {
+ if (w.callback_.should_fail_) {
+ ASSERT_TRUE(
+ db->Get(read_options, kvp.first, &value).IsNotFound());
+ } else {
+ ASSERT_OK(db->Get(read_options, kvp.first, &value));
+ ASSERT_EQ(value, kvp.second);
+ }
+ }
+ }
+
+ ASSERT_EQ(seq.load(), db_impl->TEST_GetLastVisibleSequence());
+
+ delete db;
+ DestroyDB(dbname, options);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+TEST_F(WriteCallbackTest, WriteCallBackTest) {
+ Options options;
+ WriteOptions write_options;
+ ReadOptions read_options;
+ string value;
+ DB* db;
+ DBImpl* db_impl;
+
+ DestroyDB(dbname, options);
+
+ options.create_if_missing = true;
+ Status s = DB::Open(options, dbname, &db);
+ ASSERT_OK(s);
+
+ db_impl = dynamic_cast<DBImpl*> (db);
+ ASSERT_TRUE(db_impl);
+
+ WriteBatch wb;
+
+ wb.Put("a", "value.a");
+ wb.Delete("x");
+
+ // Test a simple Write
+ s = db->Write(write_options, &wb);
+ ASSERT_OK(s);
+
+ s = db->Get(read_options, "a", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("value.a", value);
+
+ // Test WriteWithCallback
+ WriteCallbackTestWriteCallback1 callback1;
+ WriteBatch wb2;
+
+ wb2.Put("a", "value.a2");
+
+ s = db_impl->WriteWithCallback(write_options, &wb2, &callback1);
+ ASSERT_OK(s);
+ ASSERT_TRUE(callback1.was_called);
+
+ s = db->Get(read_options, "a", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("value.a2", value);
+
+ // Test WriteWithCallback for a callback that fails
+ WriteCallbackTestWriteCallback2 callback2;
+ WriteBatch wb3;
+
+ wb3.Put("a", "value.a3");
+
+ s = db_impl->WriteWithCallback(write_options, &wb3, &callback2);
+ ASSERT_NOK(s);
+
+ s = db->Get(read_options, "a", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("value.a2", value);
+
+ delete db;
+ DestroyDB(dbname, options);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as WriteWithCallback is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/write_controller.cc b/src/rocksdb/db/write_controller.cc
new file mode 100644
index 000000000..5480aabd1
--- /dev/null
+++ b/src/rocksdb/db/write_controller.cc
@@ -0,0 +1,128 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/write_controller.h"
+
+#include <atomic>
+#include <cassert>
+#include <ratio>
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+std::unique_ptr<WriteControllerToken> WriteController::GetStopToken() {
+ ++total_stopped_;
+ return std::unique_ptr<WriteControllerToken>(new StopWriteToken(this));
+}
+
+std::unique_ptr<WriteControllerToken> WriteController::GetDelayToken(
+ uint64_t write_rate) {
+ total_delayed_++;
+ // Reset counters.
+ last_refill_time_ = 0;
+ bytes_left_ = 0;
+ set_delayed_write_rate(write_rate);
+ return std::unique_ptr<WriteControllerToken>(new DelayWriteToken(this));
+}
+
+std::unique_ptr<WriteControllerToken>
+WriteController::GetCompactionPressureToken() {
+ ++total_compaction_pressure_;
+ return std::unique_ptr<WriteControllerToken>(
+ new CompactionPressureToken(this));
+}
+
+bool WriteController::IsStopped() const {
+ return total_stopped_.load(std::memory_order_relaxed) > 0;
+}
+// This is called while holding the DB mutex, so we can't sleep and need to
+// minimize how often we read the clock.
+// If it turns out to be a performance issue, we can redesign the thread
+// synchronization model here.
+// The function trusts the caller to sleep for the number of microseconds
+// returned.
+uint64_t WriteController::GetDelay(Env* env, uint64_t num_bytes) {
+ if (total_stopped_.load(std::memory_order_relaxed) > 0) {
+ return 0;
+ }
+ if (total_delayed_.load(std::memory_order_relaxed) == 0) {
+ return 0;
+ }
+
+ const uint64_t kMicrosPerSecond = 1000000;
+ const uint64_t kRefillInterval = 1024U;
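+  // For example, at a delayed write rate of 10,000,000 bytes/sec (as used in
+  // write_controller_test.cc), one 1024-microsecond refill interval grants
+  // 10,000,000 * 1024 / 1,000,000 = 10,240 bytes of write budget.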
+
+ if (bytes_left_ >= num_bytes) {
+ bytes_left_ -= num_bytes;
+ return 0;
+ }
+  // The clock is read inside the DB mutex less than once per refill
+  // interval.
+ auto time_now = NowMicrosMonotonic(env);
+
+ uint64_t sleep_debt = 0;
+ uint64_t time_since_last_refill = 0;
+ if (last_refill_time_ != 0) {
+ if (last_refill_time_ > time_now) {
+ sleep_debt = last_refill_time_ - time_now;
+ } else {
+ time_since_last_refill = time_now - last_refill_time_;
+ bytes_left_ +=
+ static_cast<uint64_t>(static_cast<double>(time_since_last_refill) /
+ kMicrosPerSecond * delayed_write_rate_);
+ if (time_since_last_refill >= kRefillInterval &&
+ bytes_left_ > num_bytes) {
+        // If a refill interval has already passed and we have enough bytes,
+        // return without extra sleeping.
+ last_refill_time_ = time_now;
+ bytes_left_ -= num_bytes;
+ return 0;
+ }
+ }
+ }
+
+ uint64_t single_refill_amount =
+ delayed_write_rate_ * kRefillInterval / kMicrosPerSecond;
+ if (bytes_left_ + single_refill_amount >= num_bytes) {
+    // Wait for one refill interval.
+    // Never schedule the refill for less than one full interval, so that we
+    // do not have to read the clock again sooner than necessary.
+ bytes_left_ = bytes_left_ + single_refill_amount - num_bytes;
+ last_refill_time_ = time_now + kRefillInterval;
+ return kRefillInterval + sleep_debt;
+ }
+
+  // Need to refill more than one interval, so we need to sleep longer.
+
+ // Sleep just until `num_bytes` is allowed.
+ uint64_t sleep_amount =
+ static_cast<uint64_t>(num_bytes /
+ static_cast<long double>(delayed_write_rate_) *
+ kMicrosPerSecond) +
+ sleep_debt;
+ last_refill_time_ = time_now + sleep_amount;
+ return sleep_amount;
+}
+
+uint64_t WriteController::NowMicrosMonotonic(Env* env) {
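+  // std::milli::den is 1000, so this converts nanoseconds to microseconds.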
+ return env->NowNanos() / std::milli::den;
+}
+
+StopWriteToken::~StopWriteToken() {
+ assert(controller_->total_stopped_ >= 1);
+ --controller_->total_stopped_;
+}
+
+DelayWriteToken::~DelayWriteToken() {
+ controller_->total_delayed_--;
+ assert(controller_->total_delayed_.load() >= 0);
+}
+
+CompactionPressureToken::~CompactionPressureToken() {
+ controller_->total_compaction_pressure_--;
+ assert(controller_->total_compaction_pressure_ >= 0);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_controller.h b/src/rocksdb/db/write_controller.h
new file mode 100644
index 000000000..785ae6896
--- /dev/null
+++ b/src/rocksdb/db/write_controller.h
@@ -0,0 +1,144 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+
+#include <atomic>
+#include <memory>
+#include "rocksdb/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Env;
+class WriteControllerToken;
+
+// WriteController controls write stalls in our write code-path. Write stalls
+// happen when compaction can't keep up with the write rate.
+// All of the methods here (including WriteControllerToken's destructors) need
+// to be called while holding the DB mutex.
+class WriteController {
+ public:
+ explicit WriteController(uint64_t _delayed_write_rate = 1024u * 1024u * 32u,
+ int64_t low_pri_rate_bytes_per_sec = 1024 * 1024)
+ : total_stopped_(0),
+ total_delayed_(0),
+ total_compaction_pressure_(0),
+ bytes_left_(0),
+ last_refill_time_(0),
+ low_pri_rate_limiter_(
+ NewGenericRateLimiter(low_pri_rate_bytes_per_sec)) {
+ set_max_delayed_write_rate(_delayed_write_rate);
+ }
+ ~WriteController() = default;
+
+ // When an actor (column family) requests a stop token, all writes will be
+ // stopped until the stop token is released (deleted)
+ std::unique_ptr<WriteControllerToken> GetStopToken();
+  // When an actor (column family) requests a delay token, the total delay for
+  // all writes to the DB will be controlled under the delayed write rate.
+  // Every write needs to call GetDelay() with the number of bytes it is
+  // writing to the DB, which returns the number of microseconds to sleep.
+ std::unique_ptr<WriteControllerToken> GetDelayToken(
+ uint64_t delayed_write_rate);
+  // When an actor (column family) requests a compaction pressure token, the
+  // number of compaction threads will be increased.
+ std::unique_ptr<WriteControllerToken> GetCompactionPressureToken();
+
+  // These three methods query the state of the WriteController.
+ bool IsStopped() const;
+ bool NeedsDelay() const { return total_delayed_.load() > 0; }
+ bool NeedSpeedupCompaction() const {
+ return IsStopped() || NeedsDelay() || total_compaction_pressure_ > 0;
+ }
+  // Returns how many microseconds the caller needs to sleep after the call.
+  // num_bytes: the number of bytes to put into the DB.
+  // Prerequisite: DB mutex held.
+ uint64_t GetDelay(Env* env, uint64_t num_bytes);
+ void set_delayed_write_rate(uint64_t write_rate) {
+    // avoid dividing by 0
+ if (write_rate == 0) {
+ write_rate = 1u;
+ } else if (write_rate > max_delayed_write_rate()) {
+ write_rate = max_delayed_write_rate();
+ }
+ delayed_write_rate_ = write_rate;
+ }
+
+ void set_max_delayed_write_rate(uint64_t write_rate) {
+    // avoid dividing by 0
+ if (write_rate == 0) {
+ write_rate = 1u;
+ }
+ max_delayed_write_rate_ = write_rate;
+ // update delayed_write_rate_ as well
+ delayed_write_rate_ = write_rate;
+ }
+
+ uint64_t delayed_write_rate() const { return delayed_write_rate_; }
+
+ uint64_t max_delayed_write_rate() const { return max_delayed_write_rate_; }
+
+ RateLimiter* low_pri_rate_limiter() { return low_pri_rate_limiter_.get(); }
+
+ private:
+ uint64_t NowMicrosMonotonic(Env* env);
+
+ friend class WriteControllerToken;
+ friend class StopWriteToken;
+ friend class DelayWriteToken;
+ friend class CompactionPressureToken;
+
+ std::atomic<int> total_stopped_;
+ std::atomic<int> total_delayed_;
+ std::atomic<int> total_compaction_pressure_;
+ uint64_t bytes_left_;
+ uint64_t last_refill_time_;
+  // write rate set at initialization or by `DBImpl::SetDBOptions`
+ uint64_t max_delayed_write_rate_;
+ // current write rate
+ uint64_t delayed_write_rate_;
+
+ std::unique_ptr<RateLimiter> low_pri_rate_limiter_;
+};
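+
+// Illustrative usage sketch (write_controller, env, and batch_bytes are
+// placeholder names): while a delay token is alive, the write path asks
+// GetDelay() how long to stall and sleeps for that many microseconds.
+//
+//   auto token = write_controller.GetDelayToken(16 << 20 /* bytes per sec */);
+//   uint64_t delay_us = write_controller.GetDelay(env, batch_bytes);
+//   if (delay_us > 0) {
+//     env->SleepForMicroseconds(static_cast<int>(delay_us));
+//   }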
+
+class WriteControllerToken {
+ public:
+ explicit WriteControllerToken(WriteController* controller)
+ : controller_(controller) {}
+ virtual ~WriteControllerToken() {}
+
+ protected:
+ WriteController* controller_;
+
+ private:
+ // no copying allowed
+ WriteControllerToken(const WriteControllerToken&) = delete;
+ void operator=(const WriteControllerToken&) = delete;
+};
+
+class StopWriteToken : public WriteControllerToken {
+ public:
+ explicit StopWriteToken(WriteController* controller)
+ : WriteControllerToken(controller) {}
+ virtual ~StopWriteToken();
+};
+
+class DelayWriteToken : public WriteControllerToken {
+ public:
+ explicit DelayWriteToken(WriteController* controller)
+ : WriteControllerToken(controller) {}
+ virtual ~DelayWriteToken();
+};
+
+class CompactionPressureToken : public WriteControllerToken {
+ public:
+ explicit CompactionPressureToken(WriteController* controller)
+ : WriteControllerToken(controller) {}
+ virtual ~CompactionPressureToken();
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_controller_test.cc b/src/rocksdb/db/write_controller_test.cc
new file mode 100644
index 000000000..72d116798
--- /dev/null
+++ b/src/rocksdb/db/write_controller_test.cc
@@ -0,0 +1,135 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include <ratio>
+
+#include "db/write_controller.h"
+
+#include "rocksdb/env.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteControllerTest : public testing::Test {};
+
+class TimeSetEnv : public EnvWrapper {
+ public:
+ explicit TimeSetEnv() : EnvWrapper(nullptr) {}
+ uint64_t now_micros_ = 6666;
+ uint64_t NowNanos() override { return now_micros_ * std::milli::den; }
+};
+
+TEST_F(WriteControllerTest, ChangeDelayRateTest) {
+ TimeSetEnv env;
+ WriteController controller(40000000u); // also set max delayed rate
+ controller.set_delayed_write_rate(10000000u);
+ auto delay_token_0 =
+ controller.GetDelayToken(controller.delayed_write_rate());
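+  // At 10,000,000 bytes/sec, writing 20,000,000 bytes should be delayed by
+  // 20,000,000 / 10,000,000 = 2 seconds, i.e. 2,000,000 microseconds.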
+ ASSERT_EQ(static_cast<uint64_t>(2000000),
+ controller.GetDelay(&env, 20000000u));
+ auto delay_token_1 = controller.GetDelayToken(2000000u);
+ ASSERT_EQ(static_cast<uint64_t>(10000000),
+ controller.GetDelay(&env, 20000000u));
+ auto delay_token_2 = controller.GetDelayToken(1000000u);
+ ASSERT_EQ(static_cast<uint64_t>(20000000),
+ controller.GetDelay(&env, 20000000u));
+ auto delay_token_3 = controller.GetDelayToken(20000000u);
+ ASSERT_EQ(static_cast<uint64_t>(1000000),
+ controller.GetDelay(&env, 20000000u));
+ // This is more than max rate. Max delayed rate will be used.
+ auto delay_token_4 =
+ controller.GetDelayToken(controller.delayed_write_rate() * 3);
+ ASSERT_EQ(static_cast<uint64_t>(500000),
+ controller.GetDelay(&env, 20000000u));
+}
+
+TEST_F(WriteControllerTest, SanityTest) {
+ WriteController controller(10000000u);
+ auto stop_token_1 = controller.GetStopToken();
+ auto stop_token_2 = controller.GetStopToken();
+
+ ASSERT_TRUE(controller.IsStopped());
+ stop_token_1.reset();
+ ASSERT_TRUE(controller.IsStopped());
+ stop_token_2.reset();
+ ASSERT_FALSE(controller.IsStopped());
+
+ TimeSetEnv env;
+
+ auto delay_token_1 = controller.GetDelayToken(10000000u);
+ ASSERT_EQ(static_cast<uint64_t>(2000000),
+ controller.GetDelay(&env, 20000000u));
+
+ env.now_micros_ += 1999900u; // sleep debt 1000
+
+ auto delay_token_2 = controller.GetDelayToken(10000000u);
+ // Rate reset after changing the token.
+ ASSERT_EQ(static_cast<uint64_t>(2000000),
+ controller.GetDelay(&env, 20000000u));
+
+ env.now_micros_ += 1999900u; // sleep debt 1000
+
+ // One refill: 10240 bytes allowed, 1000 used, 9240 left
+ ASSERT_EQ(static_cast<uint64_t>(1124), controller.GetDelay(&env, 1000u));
+ env.now_micros_ += 1124u; // sleep debt 0
+
+ delay_token_2.reset();
+ // 1000 used, 8240 left
+ ASSERT_EQ(static_cast<uint64_t>(0), controller.GetDelay(&env, 1000u));
+
+ env.now_micros_ += 100u; // sleep credit 100
+ // 1000 used, 7240 left
+ ASSERT_EQ(static_cast<uint64_t>(0), controller.GetDelay(&env, 1000u));
+
+ env.now_micros_ += 100u; // sleep credit 200
+  // One refill: 10240 filled, sleep credit generates 2000. 8000 used
+ // 7240 + 10240 + 2000 - 8000 = 11480 left
+ ASSERT_EQ(static_cast<uint64_t>(1024u), controller.GetDelay(&env, 8000u));
+
+ env.now_micros_ += 200u; // sleep debt 824
+ // 1000 used, 10480 left.
+ ASSERT_EQ(static_cast<uint64_t>(0), controller.GetDelay(&env, 1000u));
+
+ env.now_micros_ += 200u; // sleep debt 624
+ // Out of bound sleep, still 10480 left
+ ASSERT_EQ(static_cast<uint64_t>(3000624u),
+ controller.GetDelay(&env, 30000000u));
+
+ env.now_micros_ += 3000724u; // sleep credit 100
+ // 6000 used, 4480 left.
+ ASSERT_EQ(static_cast<uint64_t>(0), controller.GetDelay(&env, 6000u));
+
+ env.now_micros_ += 200u; // sleep credit 300
+ // One refill, credit 4480 balance + 3000 credit + 10240 refill
+ // Use 8000, 9720 left
+ ASSERT_EQ(static_cast<uint64_t>(1024u), controller.GetDelay(&env, 8000u));
+
+ env.now_micros_ += 3024u; // sleep credit 2000
+
+ // 1720 left
+ ASSERT_EQ(static_cast<uint64_t>(0u), controller.GetDelay(&env, 8000u));
+
+ // 1720 balance + 20000 credit = 20170 left
+ // Use 8000, 12170 left
+ ASSERT_EQ(static_cast<uint64_t>(0u), controller.GetDelay(&env, 8000u));
+
+ // 4170 left
+ ASSERT_EQ(static_cast<uint64_t>(0u), controller.GetDelay(&env, 8000u));
+
+ // Need a refill
+ ASSERT_EQ(static_cast<uint64_t>(1024u), controller.GetDelay(&env, 9000u));
+
+ delay_token_1.reset();
+ ASSERT_EQ(static_cast<uint64_t>(0), controller.GetDelay(&env, 30000000u));
+ delay_token_1.reset();
+ ASSERT_FALSE(controller.IsStopped());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/write_thread.cc b/src/rocksdb/db/write_thread.cc
new file mode 100644
index 000000000..5f50bba63
--- /dev/null
+++ b/src/rocksdb/db/write_thread.cc
@@ -0,0 +1,777 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/write_thread.h"
+#include <chrono>
+#include <thread>
+#include "db/column_family.h"
+#include "monitoring/perf_context_imp.h"
+#include "port/port.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+WriteThread::WriteThread(const ImmutableDBOptions& db_options)
+ : max_yield_usec_(db_options.enable_write_thread_adaptive_yield
+ ? db_options.write_thread_max_yield_usec
+ : 0),
+ slow_yield_usec_(db_options.write_thread_slow_yield_usec),
+ allow_concurrent_memtable_write_(
+ db_options.allow_concurrent_memtable_write),
+ enable_pipelined_write_(db_options.enable_pipelined_write),
+ max_write_batch_group_size_bytes(
+ db_options.max_write_batch_group_size_bytes),
+ newest_writer_(nullptr),
+ newest_memtable_writer_(nullptr),
+ last_sequence_(0),
+ write_stall_dummy_(),
+ stall_mu_(),
+ stall_cv_(&stall_mu_) {}
+
+uint8_t WriteThread::BlockingAwaitState(Writer* w, uint8_t goal_mask) {
+ // We're going to block. Lazily create the mutex. We guarantee
+ // propagation of this construction to the waker via the
+ // STATE_LOCKED_WAITING state. The waker won't try to touch the mutex
+ // or the condvar unless they CAS away the STATE_LOCKED_WAITING that
+ // we install below.
+ w->CreateMutex();
+
+ auto state = w->state.load(std::memory_order_acquire);
+ assert(state != STATE_LOCKED_WAITING);
+ if ((state & goal_mask) == 0 &&
+ w->state.compare_exchange_strong(state, STATE_LOCKED_WAITING)) {
+ // we have permission (and an obligation) to use StateMutex
+ std::unique_lock<std::mutex> guard(w->StateMutex());
+ w->StateCV().wait(guard, [w] {
+ return w->state.load(std::memory_order_relaxed) != STATE_LOCKED_WAITING;
+ });
+ state = w->state.load(std::memory_order_relaxed);
+ }
+ // else tricky. Goal is met or CAS failed. In the latter case the waker
+ // must have changed the state, and compare_exchange_strong has updated
+ // our local variable with the new one. At the moment WriteThread never
+ // waits for a transition across intermediate states, so we know that
+ // since a state change has occurred the goal must have been met.
+ assert((state & goal_mask) != 0);
+ return state;
+}
+
+uint8_t WriteThread::AwaitState(Writer* w, uint8_t goal_mask,
+ AdaptationContext* ctx) {
+ uint8_t state = 0;
+
+ // 1. Busy loop using "pause" for 1 micro sec
+ // 2. Else SOMETIMES busy loop using "yield" for 100 micro sec (default)
+ // 3. Else blocking wait
+
+ // On a modern Xeon each loop takes about 7 nanoseconds (most of which
+ // is the effect of the pause instruction), so 200 iterations is a bit
+ // more than a microsecond. This is long enough that waits longer than
+ // this can amortize the cost of accessing the clock and yielding.
+ for (uint32_t tries = 0; tries < 200; ++tries) {
+ state = w->state.load(std::memory_order_acquire);
+ if ((state & goal_mask) != 0) {
+ return state;
+ }
+ port::AsmVolatilePause();
+ }
+
+ // This is below the fast path, so that the stat is zero when all writes are
+ // from the same thread.
+ PERF_TIMER_GUARD(write_thread_wait_nanos);
+
+ // If we're only going to end up waiting a short period of time,
+ // it can be a lot more efficient to call std::this_thread::yield()
+ // in a loop than to block in StateMutex(). For reference, on my 4.0
+ // SELinux test server with support for syscall auditing enabled, the
+ // minimum latency between FUTEX_WAKE to returning from FUTEX_WAIT is
+ // 2.7 usec, and the average is more like 10 usec. That can be a big
+ // drag on RockDB's single-writer design. Of course, spinning is a
+  // drag on RocksDB's single-writer design. Of course, spinning is a
+ // wait for a long time. How do we decide?
+ //
+ // We break waiting into 3 categories: short-uncontended,
+ // short-contended, and long. If we had an oracle, then we would always
+ // spin for short-uncontended, always block for long, and our choice for
+ // short-contended might depend on whether we were trying to optimize
+ // RocksDB throughput or avoid being greedy with system resources.
+ //
+ // Bucketing into short or long is easy by measuring elapsed time.
+ // Differentiating short-uncontended from short-contended is a bit
+ // trickier, but not too bad. We could look for involuntary context
+ // switches using getrusage(RUSAGE_THREAD, ..), but it's less work
+ // (portability code and CPU) to just look for yield calls that take
+ // longer than we expect. sched_yield() doesn't actually result in any
+ // context switch overhead if there are no other runnable processes
+ // on the current core, in which case it usually takes less than
+ // a microsecond.
+ //
+ // There are two primary tunables here: the threshold between "short"
+ // and "long" waits, and the threshold at which we suspect that a yield
+ // is slow enough to indicate we should probably block. If these
+ // thresholds are chosen well then CPU-bound workloads that don't
+ // have more threads than cores will experience few context switches
+ // (voluntary or involuntary), and the total number of context switches
+ // (voluntary and involuntary) will not be dramatically larger (maybe
+ // 2x) than the number of voluntary context switches that occur when
+ // --max_yield_wait_micros=0.
+ //
+ // There's another constant, which is the number of slow yields we will
+ // tolerate before reversing our previous decision. Solitary slow
+ // yields are pretty common (low-priority small jobs ready to run),
+ // so this should be at least 2. We set this conservatively to 3 so
+ // that we can also immediately schedule a ctx adaptation, rather than
+ // waiting for the next update_ctx.
+
+ const size_t kMaxSlowYieldsWhileSpinning = 3;
+
+  // Whether the yield approach has any credit in this context. The credit is
+  // added when a yield succeeds before timing out, and decreased otherwise.
+ auto& yield_credit = ctx->value;
+ // Update the yield_credit based on sample runs or right after a hard failure
+ bool update_ctx = false;
+ // Should we reinforce the yield credit
+ bool would_spin_again = false;
+  // The sampling base for updating the yield credit. The sampling rate is
+  // 1/sampling_base.
+ const int sampling_base = 256;
+
+ if (max_yield_usec_ > 0) {
+ update_ctx = Random::GetTLSInstance()->OneIn(sampling_base);
+
+ if (update_ctx || yield_credit.load(std::memory_order_relaxed) >= 0) {
+ // we're updating the adaptation statistics, or spinning has >
+ // 50% chance of being shorter than max_yield_usec_ and causing no
+ // involuntary context switches
+ auto spin_begin = std::chrono::steady_clock::now();
+
+ // this variable doesn't include the final yield (if any) that
+ // causes the goal to be met
+ size_t slow_yield_count = 0;
+
+ auto iter_begin = spin_begin;
+ while ((iter_begin - spin_begin) <=
+ std::chrono::microseconds(max_yield_usec_)) {
+ std::this_thread::yield();
+
+ state = w->state.load(std::memory_order_acquire);
+ if ((state & goal_mask) != 0) {
+ // success
+ would_spin_again = true;
+ break;
+ }
+
+ auto now = std::chrono::steady_clock::now();
+ if (now == iter_begin ||
+ now - iter_begin >= std::chrono::microseconds(slow_yield_usec_)) {
+ // conservatively count it as a slow yield if our clock isn't
+ // accurate enough to measure the yield duration
+ ++slow_yield_count;
+ if (slow_yield_count >= kMaxSlowYieldsWhileSpinning) {
+ // Not just one ivcsw, but several. Immediately update yield_credit
+ // and fall back to blocking
+ update_ctx = true;
+ break;
+ }
+ }
+ iter_begin = now;
+ }
+ }
+ }
+
+ if ((state & goal_mask) == 0) {
+ TEST_SYNC_POINT_CALLBACK("WriteThread::AwaitState:BlockingWaiting", w);
+ state = BlockingAwaitState(w, goal_mask);
+ }
+
+ if (update_ctx) {
+ // Since our update is sample based, it is ok if a thread overwrites the
+ // updates by other threads. Thus the update does not have to be atomic.
+ auto v = yield_credit.load(std::memory_order_relaxed);
+ // fixed point exponential decay with decay constant 1/1024, with +1
+ // and -1 scaled to avoid overflow for int32_t
+ //
+    // On each update the positive credit is decayed by a factor of 1/1024
+    // (i.e., 0.1%). If the sampled yield was successful, the credit is also
+    // increased by X. Setting X=2^17 ensures that the credit never exceeds
+    // 2^17*2^10=2^27, which is lower than 2^31, the upper bound of int32_t.
+    // The same logic applies to negative credits.
+ v = v - (v / 1024) + (would_spin_again ? 1 : -1) * 131072;
+ yield_credit.store(v, std::memory_order_relaxed);
+ }
+
+ assert((state & goal_mask) != 0);
+ return state;
+}
+
+void WriteThread::SetState(Writer* w, uint8_t new_state) {
+ auto state = w->state.load(std::memory_order_acquire);
+ if (state == STATE_LOCKED_WAITING ||
+ !w->state.compare_exchange_strong(state, new_state)) {
+ assert(state == STATE_LOCKED_WAITING);
+
+ std::lock_guard<std::mutex> guard(w->StateMutex());
+ assert(w->state.load(std::memory_order_relaxed) != new_state);
+ w->state.store(new_state, std::memory_order_relaxed);
+ w->StateCV().notify_one();
+ }
+}
+
+bool WriteThread::LinkOne(Writer* w, std::atomic<Writer*>* newest_writer) {
+ assert(newest_writer != nullptr);
+ assert(w->state == STATE_INIT);
+ Writer* writers = newest_writer->load(std::memory_order_relaxed);
+ while (true) {
+    // If a write stall is in effect and w->no_slowdown is not true, block
+    // here until the stall is cleared. If it is true, then return
+    // immediately.
+ if (writers == &write_stall_dummy_) {
+ if (w->no_slowdown) {
+ w->status = Status::Incomplete("Write stall");
+ SetState(w, STATE_COMPLETED);
+ return false;
+ }
+ // Since no_slowdown is false, wait here to be notified of the write
+ // stall clearing
+ {
+ MutexLock lock(&stall_mu_);
+ writers = newest_writer->load(std::memory_order_relaxed);
+ if (writers == &write_stall_dummy_) {
+ stall_cv_.Wait();
+ // Load newest_writers_ again since it may have changed
+ writers = newest_writer->load(std::memory_order_relaxed);
+ continue;
+ }
+ }
+ }
+ w->link_older = writers;
+ if (newest_writer->compare_exchange_weak(writers, w)) {
+ return (writers == nullptr);
+ }
+ }
+}
+
+bool WriteThread::LinkGroup(WriteGroup& write_group,
+ std::atomic<Writer*>* newest_writer) {
+ assert(newest_writer != nullptr);
+ Writer* leader = write_group.leader;
+ Writer* last_writer = write_group.last_writer;
+ Writer* w = last_writer;
+ while (true) {
+    // Unset link_newer pointers to make sure that when we call
+    // CreateMissingNewerLinks later it creates all missing links.
+ w->link_newer = nullptr;
+ w->write_group = nullptr;
+ if (w == leader) {
+ break;
+ }
+ w = w->link_older;
+ }
+ Writer* newest = newest_writer->load(std::memory_order_relaxed);
+ while (true) {
+ leader->link_older = newest;
+ if (newest_writer->compare_exchange_weak(newest, last_writer)) {
+ return (newest == nullptr);
+ }
+ }
+}
+
+void WriteThread::CreateMissingNewerLinks(Writer* head) {
+ while (true) {
+ Writer* next = head->link_older;
+ if (next == nullptr || next->link_newer != nullptr) {
+ assert(next == nullptr || next->link_newer == head);
+ break;
+ }
+ next->link_newer = head;
+ head = next;
+ }
+}
+
+WriteThread::Writer* WriteThread::FindNextLeader(Writer* from,
+ Writer* boundary) {
+ assert(from != nullptr && from != boundary);
+ Writer* current = from;
+ while (current->link_older != boundary) {
+ current = current->link_older;
+ assert(current != nullptr);
+ }
+ return current;
+}
+
+void WriteThread::CompleteLeader(WriteGroup& write_group) {
+ assert(write_group.size > 0);
+ Writer* leader = write_group.leader;
+ if (write_group.size == 1) {
+ write_group.leader = nullptr;
+ write_group.last_writer = nullptr;
+ } else {
+ assert(leader->link_newer != nullptr);
+ leader->link_newer->link_older = nullptr;
+ write_group.leader = leader->link_newer;
+ }
+ write_group.size -= 1;
+ SetState(leader, STATE_COMPLETED);
+}
+
+void WriteThread::CompleteFollower(Writer* w, WriteGroup& write_group) {
+ assert(write_group.size > 1);
+ assert(w != write_group.leader);
+ if (w == write_group.last_writer) {
+ w->link_older->link_newer = nullptr;
+ write_group.last_writer = w->link_older;
+ } else {
+ w->link_older->link_newer = w->link_newer;
+ w->link_newer->link_older = w->link_older;
+ }
+ write_group.size -= 1;
+ SetState(w, STATE_COMPLETED);
+}
+
+void WriteThread::BeginWriteStall() {
+ LinkOne(&write_stall_dummy_, &newest_writer_);
+
+  // Walk the writer list until w->write_group != nullptr. The current write
+  // group will not have a mix of slowdown/no_slowdown, so it's ok to stop at
+  // that point.
+ Writer* w = write_stall_dummy_.link_older;
+ Writer* prev = &write_stall_dummy_;
+ while (w != nullptr && w->write_group == nullptr) {
+ if (w->no_slowdown) {
+ prev->link_older = w->link_older;
+ w->status = Status::Incomplete("Write stall");
+ SetState(w, STATE_COMPLETED);
+ if (prev->link_older) {
+ prev->link_older->link_newer = prev;
+ }
+ w = prev->link_older;
+ } else {
+ prev = w;
+ w = w->link_older;
+ }
+ }
+}
+
+void WriteThread::EndWriteStall() {
+ MutexLock lock(&stall_mu_);
+
+  // Unlink write_stall_dummy_ from the write queue. This unblocks pending
+  // write threads so they can enqueue themselves.
+ assert(newest_writer_.load(std::memory_order_relaxed) == &write_stall_dummy_);
+ assert(write_stall_dummy_.link_older != nullptr);
+ write_stall_dummy_.link_older->link_newer = write_stall_dummy_.link_newer;
+ newest_writer_.exchange(write_stall_dummy_.link_older);
+
+ // Wake up writers
+ stall_cv_.SignalAll();
+}
+
+static WriteThread::AdaptationContext jbg_ctx("JoinBatchGroup");
+void WriteThread::JoinBatchGroup(Writer* w) {
+ TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Start", w);
+ assert(w->batch != nullptr);
+
+ bool linked_as_leader = LinkOne(w, &newest_writer_);
+
+ if (linked_as_leader) {
+ SetState(w, STATE_GROUP_LEADER);
+ }
+
+ TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Wait", w);
+
+ if (!linked_as_leader) {
+    /**
+     * Wait until:
+     * 1) An existing leader picks us as the new leader when it finishes
+     * 2) An existing leader picks us as its follower and
+     * 2.1) finishes the memtable writes on our behalf
+     * 2.2) Or tells us to finish the memtable writes in parallel
+     * 3) (pipelined write) An existing leader picks us as its follower and
+     *    finishes book-keeping and the WAL write for us, enqueues us as a
+     *    pending memtable writer, and
+     * 3.1) we become the memtable writer group leader, or
+     * 3.2) an existing memtable writer group leader tells us to finish
+     *      memtable writes in parallel.
+     */
+ TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:BeganWaiting", w);
+ AwaitState(w, STATE_GROUP_LEADER | STATE_MEMTABLE_WRITER_LEADER |
+ STATE_PARALLEL_MEMTABLE_WRITER | STATE_COMPLETED,
+ &jbg_ctx);
+ TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:DoneWaiting", w);
+ }
+}
+
+size_t WriteThread::EnterAsBatchGroupLeader(Writer* leader,
+ WriteGroup* write_group) {
+ assert(leader->link_older == nullptr);
+ assert(leader->batch != nullptr);
+ assert(write_group != nullptr);
+
+ size_t size = WriteBatchInternal::ByteSize(leader->batch);
+
+ // Allow the group to grow up to a maximum size, but if the
+ // original write is small, limit the growth so we do not slow
+ // down the small write too much.
+ size_t max_size = max_write_batch_group_size_bytes;
+ const uint64_t min_batch_size_bytes = max_write_batch_group_size_bytes / 8;
+ if (size <= min_batch_size_bytes) {
+ max_size = size + min_batch_size_bytes;
+ }
+
+ leader->write_group = write_group;
+ write_group->leader = leader;
+ write_group->last_writer = leader;
+ write_group->size = 1;
+ Writer* newest_writer = newest_writer_.load(std::memory_order_acquire);
+
+ // This is safe regardless of any db mutex status of the caller. Previous
+ // calls to ExitAsGroupLeader either didn't call CreateMissingNewerLinks
+  // (they emptied the list and then we added ourselves as leader) or had to
+  // explicitly wake us up (the list was non-empty when we added ourselves,
+ // so we have already received our MarkJoined).
+ CreateMissingNewerLinks(newest_writer);
+
+ // Tricky. Iteration start (leader) is exclusive and finish
+ // (newest_writer) is inclusive. Iteration goes from old to new.
+ Writer* w = leader;
+ while (w != newest_writer) {
+ w = w->link_newer;
+
+ if (w->sync && !leader->sync) {
+ // Do not include a sync write into a batch handled by a non-sync write.
+ break;
+ }
+
+ if (w->no_slowdown != leader->no_slowdown) {
+ // Do not mix writes that are ok with delays with the ones that
+ // request fail on delays.
+ break;
+ }
+
+ if (w->disable_wal != leader->disable_wal) {
+      // Do not mix writes that enable WAL with the ones whose WAL is
+      // disabled.
+ break;
+ }
+
+ if (w->batch == nullptr) {
+      // Do not include writes with a nullptr batch. Those are not writes;
+      // those are something else. They want to be alone.
+ break;
+ }
+
+ if (w->callback != nullptr && !w->callback->AllowWriteBatching()) {
+      // don't batch writes that don't want to be batched
+ break;
+ }
+
+ auto batch_size = WriteBatchInternal::ByteSize(w->batch);
+ if (size + batch_size > max_size) {
+ // Do not make batch too big
+ break;
+ }
+
+ w->write_group = write_group;
+ size += batch_size;
+ write_group->last_writer = w;
+ write_group->size++;
+ }
+ TEST_SYNC_POINT_CALLBACK("WriteThread::EnterAsBatchGroupLeader:End", w);
+ return size;
+}
+
+void WriteThread::EnterAsMemTableWriter(Writer* leader,
+ WriteGroup* write_group) {
+ assert(leader != nullptr);
+ assert(leader->link_older == nullptr);
+ assert(leader->batch != nullptr);
+ assert(write_group != nullptr);
+
+ size_t size = WriteBatchInternal::ByteSize(leader->batch);
+
+ // Allow the group to grow up to a maximum size, but if the
+ // original write is small, limit the growth so we do not slow
+ // down the small write too much.
+ size_t max_size = max_write_batch_group_size_bytes;
+ const uint64_t min_batch_size_bytes = max_write_batch_group_size_bytes / 8;
+ if (size <= min_batch_size_bytes) {
+ max_size = size + min_batch_size_bytes;
+ }
+
+ leader->write_group = write_group;
+ write_group->leader = leader;
+ write_group->size = 1;
+ Writer* last_writer = leader;
+
+ if (!allow_concurrent_memtable_write_ || !leader->batch->HasMerge()) {
+ Writer* newest_writer = newest_memtable_writer_.load();
+ CreateMissingNewerLinks(newest_writer);
+
+ Writer* w = leader;
+ while (w != newest_writer) {
+ w = w->link_newer;
+
+ if (w->batch == nullptr) {
+ break;
+ }
+
+ if (w->batch->HasMerge()) {
+ break;
+ }
+
+ if (!allow_concurrent_memtable_write_) {
+ auto batch_size = WriteBatchInternal::ByteSize(w->batch);
+ if (size + batch_size > max_size) {
+ // Do not make batch too big
+ break;
+ }
+ size += batch_size;
+ }
+
+ w->write_group = write_group;
+ last_writer = w;
+ write_group->size++;
+ }
+ }
+
+ write_group->last_writer = last_writer;
+ write_group->last_sequence =
+ last_writer->sequence + WriteBatchInternal::Count(last_writer->batch) - 1;
+}
+
+void WriteThread::ExitAsMemTableWriter(Writer* /*self*/,
+ WriteGroup& write_group) {
+ Writer* leader = write_group.leader;
+ Writer* last_writer = write_group.last_writer;
+
+ Writer* newest_writer = last_writer;
+ if (!newest_memtable_writer_.compare_exchange_strong(newest_writer,
+ nullptr)) {
+ CreateMissingNewerLinks(newest_writer);
+ Writer* next_leader = last_writer->link_newer;
+ assert(next_leader != nullptr);
+ next_leader->link_older = nullptr;
+ SetState(next_leader, STATE_MEMTABLE_WRITER_LEADER);
+ }
+ Writer* w = leader;
+ while (true) {
+ if (!write_group.status.ok()) {
+ w->status = write_group.status;
+ }
+ Writer* next = w->link_newer;
+ if (w != leader) {
+ SetState(w, STATE_COMPLETED);
+ }
+ if (w == last_writer) {
+ break;
+ }
+ w = next;
+ }
+ // Note that leader has to exit last, since it owns the write group.
+ SetState(leader, STATE_COMPLETED);
+}
+
+void WriteThread::LaunchParallelMemTableWriters(WriteGroup* write_group) {
+ assert(write_group != nullptr);
+ write_group->running.store(write_group->size);
+ for (auto w : *write_group) {
+ SetState(w, STATE_PARALLEL_MEMTABLE_WRITER);
+ }
+}
+
+static WriteThread::AdaptationContext cpmtw_ctx("CompleteParallelMemTableWriter");
+// This method is called by both the leader and parallel followers
+bool WriteThread::CompleteParallelMemTableWriter(Writer* w) {
+
+ auto* write_group = w->write_group;
+ if (!w->status.ok()) {
+ std::lock_guard<std::mutex> guard(write_group->leader->StateMutex());
+ write_group->status = w->status;
+ }
+
+ if (write_group->running-- > 1) {
+ // we're not the last one
+ AwaitState(w, STATE_COMPLETED, &cpmtw_ctx);
+ return false;
+ }
+ // else we're the last parallel worker and should perform exit duties.
+ w->status = write_group->status;
+ return true;
+}
+
+void WriteThread::ExitAsBatchGroupFollower(Writer* w) {
+ auto* write_group = w->write_group;
+
+ assert(w->state == STATE_PARALLEL_MEMTABLE_WRITER);
+ assert(write_group->status.ok());
+ ExitAsBatchGroupLeader(*write_group, write_group->status);
+ assert(w->status.ok());
+ assert(w->state == STATE_COMPLETED);
+ SetState(write_group->leader, STATE_COMPLETED);
+}
+
+static WriteThread::AdaptationContext eabgl_ctx("ExitAsBatchGroupLeader");
+void WriteThread::ExitAsBatchGroupLeader(WriteGroup& write_group,
+ Status status) {
+ Writer* leader = write_group.leader;
+ Writer* last_writer = write_group.last_writer;
+ assert(leader->link_older == nullptr);
+
+ // Propagate memtable write error to the whole group.
+ if (status.ok() && !write_group.status.ok()) {
+ status = write_group.status;
+ }
+
+ if (enable_pipelined_write_) {
+    // Notify writers that don't write to the memtable to exit.
+ for (Writer* w = last_writer; w != leader;) {
+ Writer* next = w->link_older;
+ w->status = status;
+ if (!w->ShouldWriteToMemtable()) {
+ CompleteFollower(w, write_group);
+ }
+ w = next;
+ }
+ if (!leader->ShouldWriteToMemtable()) {
+ CompleteLeader(write_group);
+ }
+
+ Writer* next_leader = nullptr;
+
+    // Look for the next leader before we call LinkGroup. If there are no
+    // pending writers, place a dummy writer at the tail of the queue
+ // so we know the boundary of the current write group.
+ Writer dummy;
+ Writer* expected = last_writer;
+ bool has_dummy = newest_writer_.compare_exchange_strong(expected, &dummy);
+ if (!has_dummy) {
+      // We found at least one pending writer when we inserted the dummy. We
+      // search for the next leader from there.
+ next_leader = FindNextLeader(expected, last_writer);
+ assert(next_leader != nullptr && next_leader != last_writer);
+ }
+
+    // Link the remainder of the group to the memtable writer list.
+    //
+    // We have to link our group to the memtable writer queue before waking up
+    // the next leader or setting newest_writer_ to null, otherwise the next
+    // leader can run ahead of us and link to the memtable writer queue before
+    // we do.
+ if (write_group.size > 0) {
+ if (LinkGroup(write_group, &newest_memtable_writer_)) {
+ // The leader can now be different from current writer.
+ SetState(write_group.leader, STATE_MEMTABLE_WRITER_LEADER);
+ }
+ }
+
+    // If we inserted the dummy into the queue, remove it now and check
+    // whether any pending writers joined the queue since we inserted it. If
+    // so, look for the next leader again.
+ if (has_dummy) {
+ assert(next_leader == nullptr);
+ expected = &dummy;
+ bool has_pending_writer =
+ !newest_writer_.compare_exchange_strong(expected, nullptr);
+ if (has_pending_writer) {
+ next_leader = FindNextLeader(expected, &dummy);
+ assert(next_leader != nullptr && next_leader != &dummy);
+ }
+ }
+
+ if (next_leader != nullptr) {
+ next_leader->link_older = nullptr;
+ SetState(next_leader, STATE_GROUP_LEADER);
+ }
+ AwaitState(leader, STATE_MEMTABLE_WRITER_LEADER |
+ STATE_PARALLEL_MEMTABLE_WRITER | STATE_COMPLETED,
+ &eabgl_ctx);
+ } else {
+ Writer* head = newest_writer_.load(std::memory_order_acquire);
+ if (head != last_writer ||
+ !newest_writer_.compare_exchange_strong(head, nullptr)) {
+      // Either last_writer wasn't the head during the load(), or it was the
+      // head during the load() but somebody else pushed onto the list before
+ // we did the compare_exchange_strong (causing it to fail). In the
+ // latter case compare_exchange_strong has the effect of re-reading
+ // its first param (head). No need to retry a failing CAS, because
+ // only a departing leader (which we are at the moment) can remove
+ // nodes from the list.
+ assert(head != last_writer);
+
+      // After walking link_older starting from head (if not already done),
+      // we will be able to traverse last_writer->link_newer below. This
+      // function can only be called from an active leader; only a leader can
+      // clear newest_writer_, we did not clear it, and only a cleared
+      // newest_writer_ could let the next leader start its work without
+      // being explicitly woken by us, so we can conclude that no other
+      // leader work is going on here (with or without the db mutex).
+ CreateMissingNewerLinks(head);
+ assert(last_writer->link_newer->link_older == last_writer);
+ last_writer->link_newer->link_older = nullptr;
+
+      // The next leader didn't self-identify, because newest_writer_ wasn't
+      // nullptr when it enqueued (we were definitely enqueued before it and
+      // are still in the list). That means leader handoff occurs when we
+      // call SetState below.
+ SetState(last_writer->link_newer, STATE_GROUP_LEADER);
+ }
+ // else nobody else was waiting, although there might already be a new
+ // leader now
+
+ while (last_writer != leader) {
+ last_writer->status = status;
+      // We need to read link_older before calling SetState, because as soon
+      // as the writer is marked completed the other thread's AwaitState may
+      // return and deallocate the Writer.
+ auto next = last_writer->link_older;
+ SetState(last_writer, STATE_COMPLETED);
+
+ last_writer = next;
+ }
+ }
+}
+
+static WriteThread::AdaptationContext eu_ctx("EnterUnbatched");
+void WriteThread::EnterUnbatched(Writer* w, InstrumentedMutex* mu) {
+ assert(w != nullptr && w->batch == nullptr);
+ mu->Unlock();
+ bool linked_as_leader = LinkOne(w, &newest_writer_);
+ if (!linked_as_leader) {
+ TEST_SYNC_POINT("WriteThread::EnterUnbatched:Wait");
+ // Last leader will not pick us as a follower since our batch is nullptr
+ AwaitState(w, STATE_GROUP_LEADER, &eu_ctx);
+ }
+ if (enable_pipelined_write_) {
+ WaitForMemTableWriters();
+ }
+ mu->Lock();
+}
+
+void WriteThread::ExitUnbatched(Writer* w) {
+ assert(w != nullptr);
+ Writer* newest_writer = w;
+ if (!newest_writer_.compare_exchange_strong(newest_writer, nullptr)) {
+ CreateMissingNewerLinks(newest_writer);
+ Writer* next_leader = w->link_newer;
+ assert(next_leader != nullptr);
+ next_leader->link_older = nullptr;
+ SetState(next_leader, STATE_GROUP_LEADER);
+ }
+}
+
+static WriteThread::AdaptationContext wfmw_ctx("WaitForMemTableWriters");
+void WriteThread::WaitForMemTableWriters() {
+ assert(enable_pipelined_write_);
+ if (newest_memtable_writer_.load() == nullptr) {
+ return;
+ }
+ Writer w;
+ if (!LinkOne(&w, &newest_memtable_writer_)) {
+ AwaitState(&w, STATE_MEMTABLE_WRITER_LEADER, &wfmw_ctx);
+ }
+ newest_memtable_writer_.store(nullptr);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_thread.h b/src/rocksdb/db/write_thread.h
new file mode 100644
index 000000000..878199714
--- /dev/null
+++ b/src/rocksdb/db/write_thread.h
@@ -0,0 +1,431 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <assert.h>
+#include <stdint.h>
+#include <atomic>
+#include <chrono>
+#include <condition_variable>
+#include <mutex>
+#include <type_traits>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/pre_release_callback.h"
+#include "db/write_callback.h"
+#include "monitoring/instrumented_mutex.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/write_batch.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteThread {
+ public:
+ enum State : uint8_t {
+ // The initial state of a writer. This is a Writer that is
+ // waiting in JoinBatchGroup. This state can be left when another
+ // thread informs the waiter that it has become a group leader
+ // (-> STATE_GROUP_LEADER), when a leader that has chosen to be
+ // non-parallel informs a follower that its writes have been committed
+ // (-> STATE_COMPLETED), or when a leader that has chosen to perform
+    // updates in parallel and needs this Writer to apply its batch
+    // (-> STATE_PARALLEL_MEMTABLE_WRITER).
+ STATE_INIT = 1,
+
+ // The state used to inform a waiting Writer that it has become the
+ // leader, and it should now build a write batch group. Tricky:
+ // this state is not used if newest_writer_ is empty when a writer
+ // enqueues itself, because there is no need to wait (or even to
+ // create the mutex and condvar used to wait) in that case. This is
+ // a terminal state unless the leader chooses to make this a parallel
+ // batch, in which case the last parallel worker to finish will move
+ // the leader to STATE_COMPLETED.
+ STATE_GROUP_LEADER = 2,
+
+ // The state used to inform a waiting writer that it has become the
+    // leader of a memtable writer group. The leader will either write the
+    // memtable for the whole group, or launch a parallel group write to the
+    // memtable by calling LaunchParallelMemTableWriters.
+ STATE_MEMTABLE_WRITER_LEADER = 4,
+
+ // The state used to inform a waiting writer that it has become a
+    // parallel memtable writer. It can be the group leader that launched the
+    // parallel writer group, or one of the followers. The writer should then
+ // apply its batch to the memtable concurrently and call
+ // CompleteParallelMemTableWriter.
+ STATE_PARALLEL_MEMTABLE_WRITER = 8,
+
+ // A follower whose writes have been applied, or a parallel leader
+ // whose followers have all finished their work. This is a terminal
+ // state.
+ STATE_COMPLETED = 16,
+
+ // A state indicating that the thread may be waiting using StateMutex()
+    // and StateCV().
+ STATE_LOCKED_WAITING = 32,
+ };
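+
+  // Informal sketch of the common transitions described above (not
+  // exhaustive; STATE_LOCKED_WAITING is orthogonal and only marks a writer
+  // that is blocking on StateMutex()/StateCV() instead of spinning):
+  //
+  //   STATE_INIT -> STATE_GROUP_LEADER              (picked as WAL group leader)
+  //   STATE_INIT -> STATE_COMPLETED                 (leader wrote on our behalf)
+  //   STATE_INIT -> STATE_PARALLEL_MEMTABLE_WRITER  (parallel memtable write)
+  //   STATE_GROUP_LEADER -> STATE_MEMTABLE_WRITER_LEADER  (pipelined write)
+  //   STATE_MEMTABLE_WRITER_LEADER / STATE_PARALLEL_MEMTABLE_WRITER
+  //       -> STATE_COMPLETED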
+
+ struct Writer;
+
+ struct WriteGroup {
+ Writer* leader = nullptr;
+ Writer* last_writer = nullptr;
+ SequenceNumber last_sequence;
+    // Before `running` reaches zero, updates to `status` require holding
+    // leader->StateMutex().
+ Status status;
+ std::atomic<size_t> running;
+ size_t size = 0;
+
+ struct Iterator {
+ Writer* writer;
+ Writer* last_writer;
+
+ explicit Iterator(Writer* w, Writer* last)
+ : writer(w), last_writer(last) {}
+
+ Writer* operator*() const { return writer; }
+
+ Iterator& operator++() {
+ assert(writer != nullptr);
+ if (writer == last_writer) {
+ writer = nullptr;
+ } else {
+ writer = writer->link_newer;
+ }
+ return *this;
+ }
+
+ bool operator!=(const Iterator& other) const {
+ return writer != other.writer;
+ }
+ };
+
+ Iterator begin() const { return Iterator(leader, last_writer); }
+ Iterator end() const { return Iterator(nullptr, nullptr); }
+ };
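+
+  // A WriteGroup can be traversed with a range-for loop over its members,
+  // e.g. (illustrative sketch; `group` is a hypothetical, fully formed
+  // WriteGroup):
+  //
+  //   for (Writer* writer : group) {
+  //     if (writer->ShouldWriteToMemtable()) {
+  //       // apply writer->batch to the memtable
+  //     }
+  //   }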
+
+ // Information kept for every waiting writer.
+ struct Writer {
+ WriteBatch* batch;
+ bool sync;
+ bool no_slowdown;
+ bool disable_wal;
+ bool disable_memtable;
+ size_t batch_cnt; // if non-zero, number of sub-batches in the write batch
+ PreReleaseCallback* pre_release_callback;
+ uint64_t log_used; // log number that this batch was inserted into
+ uint64_t log_ref; // log number that memtable insert should reference
+ WriteCallback* callback;
+ bool made_waitable; // records lazy construction of mutex and cv
+ std::atomic<uint8_t> state; // write under StateMutex() or pre-link
+ WriteGroup* write_group;
+ SequenceNumber sequence; // the sequence number to use for the first key
+ Status status;
+ Status callback_status; // status returned by callback->Callback()
+
+ std::aligned_storage<sizeof(std::mutex)>::type state_mutex_bytes;
+ std::aligned_storage<sizeof(std::condition_variable)>::type state_cv_bytes;
+ Writer* link_older; // read/write only before linking, or as leader
+ Writer* link_newer; // lazy, read/write only before linking, or as leader
+
+ Writer()
+ : batch(nullptr),
+ sync(false),
+ no_slowdown(false),
+ disable_wal(false),
+ disable_memtable(false),
+ batch_cnt(0),
+ pre_release_callback(nullptr),
+ log_used(0),
+ log_ref(0),
+ callback(nullptr),
+ made_waitable(false),
+ state(STATE_INIT),
+ write_group(nullptr),
+ sequence(kMaxSequenceNumber),
+ link_older(nullptr),
+ link_newer(nullptr) {}
+
+ Writer(const WriteOptions& write_options, WriteBatch* _batch,
+ WriteCallback* _callback, uint64_t _log_ref, bool _disable_memtable,
+ size_t _batch_cnt = 0,
+ PreReleaseCallback* _pre_release_callback = nullptr)
+ : batch(_batch),
+ sync(write_options.sync),
+ no_slowdown(write_options.no_slowdown),
+ disable_wal(write_options.disableWAL),
+ disable_memtable(_disable_memtable),
+ batch_cnt(_batch_cnt),
+ pre_release_callback(_pre_release_callback),
+ log_used(0),
+ log_ref(_log_ref),
+ callback(_callback),
+ made_waitable(false),
+ state(STATE_INIT),
+ write_group(nullptr),
+ sequence(kMaxSequenceNumber),
+ link_older(nullptr),
+ link_newer(nullptr) {}
+
+ ~Writer() {
+ if (made_waitable) {
+ StateMutex().~mutex();
+ StateCV().~condition_variable();
+ }
+ }
+
+ bool CheckCallback(DB* db) {
+ if (callback != nullptr) {
+ callback_status = callback->Callback(db);
+ }
+ return callback_status.ok();
+ }
+
+ void CreateMutex() {
+ if (!made_waitable) {
+ // Note that made_waitable is tracked separately from state
+ // transitions, because we can't atomically create the mutex and
+ // link into the list.
+ made_waitable = true;
+ new (&state_mutex_bytes) std::mutex;
+ new (&state_cv_bytes) std::condition_variable;
+ }
+ }
+
+ // returns the aggregate status of this Writer
+ Status FinalStatus() {
+ if (!status.ok()) {
+        // a non-ok memtable write status takes precedence
+ assert(callback == nullptr || callback_status.ok());
+ return status;
+ } else if (!callback_status.ok()) {
+ // if the callback failed then that is the status we want
+ // because a memtable insert should not have been attempted
+ assert(callback != nullptr);
+ assert(status.ok());
+ return callback_status;
+ } else {
+ // if there is no callback then we only care about
+ // the memtable insert status
+ assert(callback == nullptr || callback_status.ok());
+ return status;
+ }
+ }
+
+ bool CallbackFailed() {
+ return (callback != nullptr) && !callback_status.ok();
+ }
+
+ bool ShouldWriteToMemtable() {
+ return status.ok() && !CallbackFailed() && !disable_memtable;
+ }
+
+ bool ShouldWriteToWAL() {
+ return status.ok() && !CallbackFailed() && !disable_wal;
+ }
+
+    // No other mutexes may be acquired while holding StateMutex(); it is
+    // always last in the lock order.
+ std::mutex& StateMutex() {
+ assert(made_waitable);
+ return *static_cast<std::mutex*>(static_cast<void*>(&state_mutex_bytes));
+ }
+
+ std::condition_variable& StateCV() {
+ assert(made_waitable);
+ return *static_cast<std::condition_variable*>(
+ static_cast<void*>(&state_cv_bytes));
+ }
+ };
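+
+  // The state mutex and condition variable above are constructed lazily via
+  // placement new (see CreateMutex()), so writers that never have to block
+  // never pay for them. A minimal sketch of the waiting pattern, assuming
+  // `w` is a Writer about to block and `goal_mask` is a hypothetical bit
+  // mask of acceptable states (the real logic, including the
+  // STATE_LOCKED_WAITING handshake, lives in BlockingAwaitState()):
+  //
+  //   w->CreateMutex();  // no-op after the first call
+  //   std::unique_lock<std::mutex> guard(w->StateMutex());
+  //   w->StateCV().wait(guard, [&] {
+  //     return (w->state.load(std::memory_order_acquire) & goal_mask) != 0;
+  //   });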
+
+ struct AdaptationContext {
+ const char* name;
+ std::atomic<int32_t> value;
+
+ explicit AdaptationContext(const char* name0) : name(name0), value(0) {}
+ };
+
+ explicit WriteThread(const ImmutableDBOptions& db_options);
+
+ virtual ~WriteThread() = default;
+
+ // IMPORTANT: None of the methods in this class rely on the db mutex
+ // for correctness. All of the methods except JoinBatchGroup and
+ // EnterUnbatched may be called either with or without the db mutex held.
+ // Correctness is maintained by ensuring that only a single thread is
+ // a leader at a time.
+
+ // Registers w as ready to become part of a batch group, waits until the
+ // caller should perform some work, and returns the current state of the
+ // writer. If w has become the leader of a write batch group, returns
+  // STATE_GROUP_LEADER. If w has been made part of a sequential batch
+  // group and the leader has performed the write, returns STATE_COMPLETED.
+  // If w has been made part of a parallel batch group and is responsible
+  // for updating the memtable, returns STATE_PARALLEL_MEMTABLE_WRITER.
+ //
+ // The db mutex SHOULD NOT be held when calling this function, because
+ // it will block.
+ //
+ // Writer* w: Writer to be executed as part of a batch group
+ void JoinBatchGroup(Writer* w);
+
+ // Constructs a write batch group led by leader, which should be a
+ // Writer passed to JoinBatchGroup on the current thread.
+ //
+ // Writer* leader: Writer that is STATE_GROUP_LEADER
+ // WriteGroup* write_group: Out-param of group members
+ // returns: Total batch group byte size
+ size_t EnterAsBatchGroupLeader(Writer* leader, WriteGroup* write_group);
+
+ // Unlinks the Writer-s in a batch group, wakes up the non-leaders,
+ // and wakes up the next leader (if any).
+ //
+ // WriteGroup* write_group: the write group
+ // Status status: Status of write operation
+ void ExitAsBatchGroupLeader(WriteGroup& write_group, Status status);
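+
+  // Illustrative, non-normative sketch of a write that goes through the
+  // group machinery above; `write_thread`, `options`, `batch` and `s` are
+  // hypothetical caller-owned names, and the parallel/pipelined states are
+  // handled by the methods further below:
+  //
+  //   WriteThread::Writer w(options, &batch, /*_callback=*/nullptr,
+  //                         /*_log_ref=*/0, /*_disable_memtable=*/false);
+  //   write_thread.JoinBatchGroup(&w);
+  //   if (w.state == WriteThread::STATE_GROUP_LEADER) {
+  //     WriteThread::WriteGroup group;
+  //     write_thread.EnterAsBatchGroupLeader(&w, &group);
+  //     // ... write the WAL (and possibly the memtable) for `group` ...
+  //     write_thread.ExitAsBatchGroupLeader(group, s);
+  //   }
+  //   s = w.FinalStatus();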
+
+ // Exit batch group on behalf of batch group leader.
+ void ExitAsBatchGroupFollower(Writer* w);
+
+  // Constructs a write batch group led by leader from the
+  // newest_memtable_writer_ list. The leader should either write the
+  // memtable for the whole group and call ExitAsMemTableWriter, or launch a
+  // parallel memtable write through LaunchParallelMemTableWriters.
+  void EnterAsMemTableWriter(Writer* leader, WriteGroup* write_group);
+
+  // The memtable writer group leader, or the last finished writer in a
+  // parallel write group, exits from the newest_memtable_writer_ list and
+  // wakes up the next leader if needed.
+ void ExitAsMemTableWriter(Writer* self, WriteGroup& write_group);
+
+  // Causes JoinBatchGroup to return STATE_PARALLEL_MEMTABLE_WRITER for all
+  // of the non-leader members of this write batch group. Sets
+  // Writer::sequence before waking them up.
+ //
+ // WriteGroup* write_group: Extra state used to coordinate the parallel add
+ void LaunchParallelMemTableWriters(WriteGroup* write_group);
+
+ // Reports the completion of w's batch to the parallel group leader, and
+ // waits for the rest of the parallel batch to complete. Returns true
+  // if this thread is the last to complete, and hence should perform the
+  // group's exit duties (advance the last sequence and exit the batch group,
+  // e.g. via ExitAsBatchGroupLeader or ExitAsBatchGroupFollower); false if
+  // someone else has already taken responsibility for that.
+ bool CompleteParallelMemTableWriter(Writer* w);
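+
+  // Illustrative sketch of the parallel memtable write path, assuming the
+  // leader has already written the WAL for the whole group (`write_thread`,
+  // `group` and `w` are hypothetical caller-owned names):
+  //
+  //   write_thread.LaunchParallelMemTableWriters(&group);   // leader only
+  //   // Every member (leader and followers) then applies its own batch to
+  //   // the memtable and calls:
+  //   if (write_thread.CompleteParallelMemTableWriter(&w)) {
+  //     // last writer to finish: perform the group's exit duties
+  //   }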
+
+ // Waits for all preceding writers (unlocking mu while waiting), then
+ // registers w as the currently proceeding writer.
+ //
+ // Writer* w: A Writer not eligible for batching
+ // InstrumentedMutex* mu: The db mutex, to unlock while waiting
+ // REQUIRES: db mutex held
+ void EnterUnbatched(Writer* w, InstrumentedMutex* mu);
+
+ // Completes a Writer begun with EnterUnbatched, unblocking subsequent
+ // writers.
+ void ExitUnbatched(Writer* w);
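+
+  // Illustrative sketch of an unbatched (exclusive) writer; `write_thread`
+  // is a hypothetical name and `mutex_` stands for the db mutex, which must
+  // be held by the caller:
+  //
+  //   WriteThread::Writer w;                 // w.batch stays nullptr
+  //   write_thread.EnterUnbatched(&w, &mutex_);
+  //   // ... exclusive section: no other writer is in progress ...
+  //   write_thread.ExitUnbatched(&w);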
+
+  // Wait for all pending memtable writers to finish. Used only when
+  // pipelined write is enabled.
+ void WaitForMemTableWriters();
+
+ SequenceNumber UpdateLastSequence(SequenceNumber sequence) {
+ if (sequence > last_sequence_) {
+ last_sequence_ = sequence;
+ }
+ return last_sequence_;
+ }
+
+ // Insert a dummy writer at the tail of the write queue to indicate a write
+ // stall, and fail any writers in the queue with no_slowdown set to true
+ void BeginWriteStall();
+
+ // Remove the dummy writer and wake up waiting writers
+ void EndWriteStall();
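+
+  // Sketch of the intended call pattern (illustrative; the DB invokes these
+  // around a stall condition):
+  //
+  //   write_thread.BeginWriteStall();  // queued no_slowdown writers fail,
+  //                                    // others park on stall_cv_
+  //   // ... stall condition clears ...
+  //   write_thread.EndWriteStall();    // wake up the parked writers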
+
+ private:
+ // See AwaitState.
+ const uint64_t max_yield_usec_;
+ const uint64_t slow_yield_usec_;
+
+  // Allow multiple writers to write to the memtable concurrently.
+ const bool allow_concurrent_memtable_write_;
+
+ // Enable pipelined write to WAL and memtable.
+ const bool enable_pipelined_write_;
+
+  // The maximum number of bytes that may be written in a single batch of
+  // WAL or memtable writes. It is only enforced when the leader's write
+  // size is larger than 1/8 of this limit.
+ const uint64_t max_write_batch_group_size_bytes;
+
+  // Points to the newest pending writer. Only the leader can remove
+  // elements; adding can be done lock-free by anybody.
+ std::atomic<Writer*> newest_writer_;
+
+ // Points to the newest pending memtable writer. Used only when pipelined
+ // write is enabled.
+ std::atomic<Writer*> newest_memtable_writer_;
+
+  // The last sequence number that has been consumed by a writer. The
+  // sequence is not necessarily visible to reads because the write can
+  // still be ongoing.
+ SequenceNumber last_sequence_;
+
+ // A dummy writer to indicate a write stall condition. This will be inserted
+ // at the tail of the writer queue by the leader, so newer writers can just
+ // check for this and bail
+ Writer write_stall_dummy_;
+
+ // Mutex and condvar for writers to block on a write stall. During a write
+  // stall, writers with no_slowdown set to false will wait on this rather
+  // than on the writer queue.
+ port::Mutex stall_mu_;
+ port::CondVar stall_cv_;
+
+ // Waits for w->state & goal_mask using w->StateMutex(). Returns
+ // the state that satisfies goal_mask.
+ uint8_t BlockingAwaitState(Writer* w, uint8_t goal_mask);
+
+ // Blocks until w->state & goal_mask, returning the state value
+ // that satisfied the predicate. Uses ctx to adaptively use
+ // std::this_thread::yield() to avoid mutex overheads. ctx should be
+ // a context-dependent static.
+ uint8_t AwaitState(Writer* w, uint8_t goal_mask, AdaptationContext* ctx);
+
+ // Set writer state and wake the writer up if it is waiting.
+ void SetState(Writer* w, uint8_t new_state);
+
+ // Links w into the newest_writer list. Return true if w was linked directly
+ // into the leader position. Safe to call from multiple threads without
+ // external locking.
+ bool LinkOne(Writer* w, std::atomic<Writer*>* newest_writer);
+
+ // Link write group into the newest_writer list as a whole, while keeping the
+ // order of the writers unchanged. Return true if the group was linked
+ // directly into the leader position.
+ bool LinkGroup(WriteGroup& write_group, std::atomic<Writer*>* newest_writer);
+
+ // Computes any missing link_newer links. Should not be called
+ // concurrently with itself.
+ void CreateMissingNewerLinks(Writer* head);
+
+  // Starting from a pending writer, follow link_older to search for the
+  // next leader, until we hit the boundary.
+ Writer* FindNextLeader(Writer* pending_writer, Writer* boundary);
+
+ // Set the leader in write_group to completed state and remove it from the
+ // write group.
+ void CompleteLeader(WriteGroup& write_group);
+
+ // Set a follower in write_group to completed state and remove it from the
+ // write group.
+ void CompleteFollower(Writer* w, WriteGroup& write_group);
+};
+
+} // namespace ROCKSDB_NAMESPACE